diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
187 files changed, 17235 insertions, 10575 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5..5df11a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { - AMDGPUSimplifyLibCallsPass() {} + AMDGPUSimplifyLibCallsPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -298,6 +298,15 @@ private: bool GlobalOpt; }; +void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &); +extern char &AMDGPULowerExecSyncLegacyPassID; +ModulePass *createAMDGPULowerExecSyncLegacyPass(); + +struct AMDGPULowerExecSyncPass : PassInfoMixin<AMDGPULowerExecSyncPass> { + AMDGPULowerExecSyncPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &); extern char &AMDGPUSwLowerLDSLegacyPassID; ModulePass * @@ -371,13 +380,13 @@ public: class AMDGPUAnnotateUniformValuesPass : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> { public: - AMDGPUAnnotateUniformValuesPass() {} + AMDGPUAnnotateUniformValuesPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { public: - SIModeRegisterPass() {} + SIModeRegisterPass() = default; PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; @@ -527,7 +536,7 @@ void initializeAMDGPUAAWrapperPassPass(PassRegistry&); ImmutablePass *createAMDGPUExternalAAWrapperPass(); void initializeAMDGPUExternalAAWrapperPass(PassRegistry&); -void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); +void initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(PassRegistry &); ModulePass *createAMDGPUExportKernelRuntimeHandlesLegacyPass(); void 
initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(PassRegistry &); @@ -562,9 +571,13 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ea32748..9ad2f2e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -19,69 +19,105 @@ def p4 : PtrValueType<i64, 4>; def p5 : PtrValueType<i32, 5>; def p6 : PtrValueType<i32, 6>; -//===------------------------------------------------------------===// -// Subtarget Features (device properties) -//===------------------------------------------------------------===// +//===-----------------------------------------------------------------------===// +// AMDGPU Subtarget Feature (device properties) +//===----------------------------------------------------------------------===// -def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", - "FastFMAF32", - "true", - "Assuming f32 fma is at least as fast as mul + add" +// Multiclass to define a SubtargetFeature along with optional predicates. +// Parameters: +// - FeatureString: The feature string used in the SubtargetFeature. +// - Description: The description of the feature. +// - GenPredicate: If 1 (default), generates a Has#NAME predicate. +// - GenAssemblerPredicate: If 1 (default), the predicate includes AssemblerPredicate. +// - Deps: List of dependent SubtargetFeatures (default empty). 
+// +// Usage: +// defm MadMixInsts : AMDGPUSubtargetFeature<"mad-mix-insts", "description">; +// This generates: +// - FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", "HasMadMixInsts", "true", "description"> +// - HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, +// AssemblerPredicate<(any_of FeatureMadMixInsts)> +// +// With GenAssemblerPredicate=0: +// defm Foo : AMDGPUSubtargetFeature<"foo", "desc", 1, 0>; +// This generates: +// - FeatureFoo : SubtargetFeature<...> +// - HasFoo : Predicate<"Subtarget->hasFoo()"> (no AssemblerPredicate) +// +// With dependencies: +// defm Bar : AMDGPUSubtargetFeature<"bar", "desc", 1, 1, [FeatureFoo]>; +// This generates: +// - FeatureBar : SubtargetFeature<"bar", "HasBar", "true", "desc", [FeatureFoo]> +// - HasBar : Predicate + AssemblerPredicate +multiclass AMDGPUSubtargetFeature<string FeatureString, + string Description, + bit GenPredicate = 1, + bit GenAssemblerPredicate = 1, + list<SubtargetFeature> Deps = []> { + def Feature#NAME : SubtargetFeature<FeatureString, + "Has"#NAME, + "true", + Description, + Deps + >; + + if GenPredicate then + if GenAssemblerPredicate then + def Has#NAME + : Predicate<"Subtarget->has"#NAME#"()">, + AssemblerPredicate<(any_of !cast<SubtargetFeature>("Feature"#NAME))>; + else + def Has#NAME : Predicate<"Subtarget->has"#NAME#"()">; +} + +defm FastFMAF32 : AMDGPUSubtargetFeature<"fast-fmaf", + "Assuming f32 fma is at least as fast as mul + add", + /*GenPredicate=*/0 >; -def FeatureFastDenormalF32 : SubtargetFeature<"fast-denormal-f32", - "FastDenormalF32", - "true", - "Enabling denormals does not cause f32 instructions to run at f64 rates" +defm FastDenormalF32 : AMDGPUSubtargetFeature<"fast-denormal-f32", + "Enabling denormals does not cause f32 instructions to run at f64 rates", + /*GenPredicate=*/0 >; -def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128", - "MIMG_R128", - "true", - "Support 128-bit texture resources" +defm MIMG_R128 : AMDGPUSubtargetFeature<"mimg-r128", + 
"Support 128-bit texture resources", + /*GenPredicate=*/0 >; -def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", - "HalfRate64Ops", - "true", - "Most fp64 instructions are half rate instead of quarter" +defm HalfRate64Ops : AMDGPUSubtargetFeature<"half-rate-64-ops", + "Most fp64 instructions are half rate instead of quarter", + /*GenPredicate=*/0 >; -def FullRate64Ops : SubtargetFeature<"full-rate-64-ops", - "FullRate64Ops", - "true", - "Most fp64 instructions are full rate" +defm FullRate64Ops : AMDGPUSubtargetFeature<"full-rate-64-ops", + "Most fp64 instructions are full rate", + /*GenPredicate=*/0 >; -def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", - "FlatAddressSpace", - "true", +defm FlatAddressSpace : AMDGPUSubtargetFeature<"flat-address-space", "Support flat address space" >; -def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", - "FlatInstOffsets", - "true", +defm FlatInstOffsets : AMDGPUSubtargetFeature<"flat-inst-offsets", "Flat instructions have immediate offset addressing mode" >; -def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", - "FlatGlobalInsts", - "true", +defm FlatGlobalInsts : AMDGPUSubtargetFeature<"flat-global-insts", "Have global_* flat memory instructions", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", - "FlatScratchInsts", - "true", +defm FlatScratchInsts : AMDGPUSubtargetFeature<"flat-scratch-insts", "Have scratch_* flat memory instructions", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", - "ScalarFlatScratchInsts", - "true", +defm ScalarFlatScratchInsts : AMDGPUSubtargetFeature<"scalar-flat-scratch-insts", "Have s_scratch_* flat memory instructions" >; @@ -91,100 +127,74 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", "Use scratch_* 
flat memory instructions to access scratch" >; -def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", - "FlatGVSMode", - "true", +defm FlatGVSMode : AMDGPUSubtargetFeature<"flat-gvs-mode", "Have GVS addressing mode with flat_* instructions", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", - "AddNoCarryInsts", - "true", +defm AddNoCarryInsts : AMDGPUSubtargetFeature<"add-no-carry-insts", "Have VALU add/sub instructions without carry out" >; -def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", - "UnalignedBufferAccess", - "true", +defm UnalignedBufferAccess : AMDGPUSubtargetFeature<"unaligned-buffer-access", "Hardware supports unaligned global loads and stores" >; -def FeatureTrapHandler: SubtargetFeature<"trap-handler", - "TrapHandler", - "true", - "Trap handler support" +defm TrapHandler: AMDGPUSubtargetFeature<"trap-handler", + "Trap handler support", + /*GenPredicate=*/0 >; -def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access", - "UnalignedScratchAccess", - "true", +defm UnalignedScratchAccess : AMDGPUSubtargetFeature<"unaligned-scratch-access", "Support unaligned scratch loads and stores" >; -def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access", - "UnalignedDSAccess", - "true", +defm UnalignedDSAccess : AMDGPUSubtargetFeature<"unaligned-ds-access", "Hardware supports unaligned local and region loads and stores" >; -def FeatureRelaxedBufferOOBMode : SubtargetFeature<"relaxed-buffer-oob-mode", - "RelaxedBufferOOBMode", - "true", - "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially cause an adjacent access to be treated as if it were also OOB" +defm RelaxedBufferOOBMode : AMDGPUSubtargetFeature<"relaxed-buffer-oob-mode", + "Disable strict out-of-bounds buffer guarantees. 
An OOB access may potentially" + "cause an adjacent access to be treated as if it were also OOB" >; -def FeatureApertureRegs : SubtargetFeature<"aperture-regs", - "HasApertureRegs", - "true", - "Has Memory Aperture Base and Size Registers" +defm ApertureRegs : AMDGPUSubtargetFeature<"aperture-regs", + "Has Memory Aperture Base and Size Registers", + /*GenPredicate=*/0 >; -def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", - "HasMadMixInsts", - "true", +defm MadMixInsts : AMDGPUSubtargetFeature<"mad-mix-insts", "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions" >; -def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", - "HasFmaMixInsts", - "true", +defm FmaMixInsts : AMDGPUSubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; -def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts", - "HasFmaMixBF16Insts", - "true", +defm FmaMixBF16Insts : AMDGPUSubtargetFeature<"fma-mix-bf16-insts", "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions" >; -def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts", - "HasIEEEMinimumMaximumInsts", - "true", - "Has v_minimum/maximum_f16/f32/f64, v_minimummaximum/maximumminimum_f16/f32 and v_pk_minimum/maximum_f16 instructions" +defm IEEEMinimumMaximumInsts : AMDGPUSubtargetFeature<"ieee-minimum-maximum-insts", + "Has v_minimum/maximum_f16/f32/f64, v_minimummaximum/maximumminimum_f16/f32 and" + "v_pk_minimum/maximum_f16 instructions" >; -def FeatureMinimum3Maximum3F32 : SubtargetFeature<"minimum3-maximum3-f32", - "HasMinimum3Maximum3F32", - "true", +defm Minimum3Maximum3F32 : AMDGPUSubtargetFeature<"minimum3-maximum3-f32", "Has v_minimum3_f32 and v_maximum3_f32 instructions" >; -def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16", - "HasMinimum3Maximum3F16", - "true", +defm Minimum3Maximum3F16 : AMDGPUSubtargetFeature<"minimum3-maximum3-f16", "Has v_minimum3_f16 and v_maximum3_f16 
instructions" >; -def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16", - "HasMin3Max3PKF16", - "true", +defm Min3Max3PKF16 : AMDGPUSubtargetFeature<"min3-max3-pkf16", "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions" >; -def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16", - "HasMinimum3Maximum3PKF16", - "true", +defm Minimum3Maximum3PKF16 : AMDGPUSubtargetFeature<"minimum3-maximum3-pkf16", "Has v_pk_minimum3_f16 and v_pk_maximum3_f16 instructions" >; @@ -223,82 +233,67 @@ def FeaturePreciseMemory : SubtargetFeature<"precise-memory", "EnablePreciseMemory", "true", "Enable precise memory mode">; -def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", - "SGPRInitBug", - "true", +defm SGPRInitBug : AMDGPUSubtargetFeature<"sgpr-init-bug", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; -def FeatureUserSGPRInit16Bug : SubtargetFeature<"user-sgpr-init16-bug", - "UserSGPRInit16Bug", - "true", - "Bug requiring at least 16 user+system SGPRs to be enabled" +defm UserSGPRInit16Bug : AMDGPUSubtargetFeature<"user-sgpr-init16-bug", + "Bug requiring at least 16 user+system SGPRs to be enabled", + /*GenPredicate=*/0 >; -def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", - "LDSMisalignedBug", - "true", - "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode" +defm LDSMisalignedBug : AMDGPUSubtargetFeature<"lds-misaligned-bug", + "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode", + /*GenPredicate=*/0 >; -def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug", - "HasMFMAInlineLiteralBug", - "true", - "MFMA cannot use inline literal as SrcC" +defm MFMAInlineLiteralBug : AMDGPUSubtargetFeature<"mfma-inline-literal-bug", + "MFMA cannot use inline literal as SrcC", + /*GenPredicate=*/0 >; -def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", - "HasVcmpxPermlaneHazard", - 
"true", - "TODO: describe me" +defm VcmpxPermlaneHazard : AMDGPUSubtargetFeature<"vcmpx-permlane-hazard", + "TODO: describe me", + /*GenPredicate=*/0 >; -def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard", - "HasVMEMtoScalarWriteHazard", - "true", - "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution." +defm VMEMtoScalarWriteHazard : AMDGPUSubtargetFeature<"vmem-to-scalar-write-hazard", + "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution.", + /*GenPredicate=*/0 >; -def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard", - "HasSMEMtoVectorWriteHazard", - "true", - "s_load_dword followed by v_cmp page faults" +defm SMEMtoVectorWriteHazard : AMDGPUSubtargetFeature<"smem-to-vector-write-hazard", + "s_load_dword followed by v_cmp page faults", + /*GenPredicate=*/0 >; -def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", - "HasInstFwdPrefetchBug", - "true", - "S_INST_PREFETCH instruction causes shader to hang" +defm InstFwdPrefetchBug : AMDGPUSubtargetFeature<"inst-fwd-prefetch-bug", + "S_INST_PREFETCH instruction causes shader to hang", + /*GenPredicate=*/0 >; -def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts", - "HasVmemPrefInsts", - "true", +defm VmemPrefInsts : AMDGPUSubtargetFeature<"vmem-pref-insts", "Has flat_prefect_b8 and global_prefetch_b8 instructions" >; -def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch", - "HasSafeSmemPrefetch", - "true", - "SMEM prefetches do not fail on illegal address" +defm SafeSmemPrefetch : AMDGPUSubtargetFeature<"safe-smem-prefetch", + "SMEM prefetches do not fail on illegal address", + /*GenPredicate=*/0 >; -def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", - "HasSafeCUPrefetch", - "true", - "VMEM CU scope prefetches do not fail on illegal address" +defm SafeCUPrefetch : AMDGPUSubtargetFeature<"safe-cu-prefetch", 
+ "VMEM CU scope prefetches do not fail on illegal address", + /*GenPredicate=*/0 >; -def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", - "HasVcmpxExecWARHazard", - "true", - "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)" +defm VcmpxExecWARHazard : AMDGPUSubtargetFeature<"vcmpx-exec-war-hazard", + "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)", + /*GenPredicate=*/0 >; -def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard", - "HasLdsBranchVmemWARHazard", - "true", - "Switching between LDS and VMEM-tex not waiting VM_VSRC=0" +defm LdsBranchVmemWARHazard : AMDGPUSubtargetFeature<"lds-branch-vmem-war-hazard", + "Switching between LDS and VMEM-tex not waiting VM_VSRC=0", + /*GenPredicate=*/0 >; class FeatureMaxHardClauseLength<int size> : SubtargetFeature< @@ -316,70 +311,60 @@ def FeatureMaxHardClauseLength32 : FeatureMaxHardClauseLength<32>; /// permitted clause length. def FeatureMaxHardClauseLength63 : FeatureMaxHardClauseLength<63>; -def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug", - "HasNSAtoVMEMBug", - "true", - "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero" +defm NSAtoVMEMBug : AMDGPUSubtargetFeature<"nsa-to-vmem-bug", + "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero", + /*GenPredicate=*/0 >; -def FeatureNSAClauseBug : SubtargetFeature<"nsa-clause-bug", - "HasNSAClauseBug", - "true", - "MIMG-NSA in a hard clause has unpredictable results on GFX10.1" +defm NSAClauseBug : AMDGPUSubtargetFeature<"nsa-clause-bug", + "MIMG-NSA in a hard clause has unpredictable results on GFX10.1", + /*GenPredicate=*/0 >; -def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", - "HasFlatSegmentOffsetBug", - "true", - "GFX10 bug where inst_offset is ignored when flat instructions access global memory" +defm FlatSegmentOffsetBug : AMDGPUSubtargetFeature<"flat-segment-offset-bug", + "GFX10 bug where inst_offset is ignored when flat instructions access global 
memory", + /*GenPredicate=*/0 >; -def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug", - "NegativeScratchOffsetBug", - "true", +defm NegativeScratchOffsetBug : AMDGPUSubtargetFeature<"negative-scratch-offset-bug", "Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9" >; -def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug", - "NegativeUnalignedScratchOffsetBug", - "true", - "Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10" +defm NegativeUnalignedScratchOffsetBug : AMDGPUSubtargetFeature<"negative-unaligned-scratch-offset-bug", + "Scratch instructions with a VGPR offset and a negative immediate offset that" + "is not a multiple of 4 read wrong memory on GFX10", + /*GenPredicate=*/0 >; -def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug", - "HasOffset3fBug", - "true", - "Branch offset of 3f hardware bug" +defm Offset3fBug : AMDGPUSubtargetFeature<"offset-3f-bug", + "Branch offset of 3f hardware bug", + /*GenPredicate=*/0 >; -def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug", - "HasImageStoreD16Bug", - "true", - "Image Store D16 hardware bug" +defm ImageStoreD16Bug : AMDGPUSubtargetFeature<"image-store-d16-bug", + "Image Store D16 hardware bug", + /*GenPredicate=*/0 >; -def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug", - "HasImageGather4D16Bug", - "true", - "Image Gather4 D16 hardware bug" +defm ImageGather4D16Bug : AMDGPUSubtargetFeature<"image-gather4-d16-bug", + "Image Gather4 D16 hardware bug", + /*GenPredicate=*/0 >; -def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug", - "HasMADIntraFwdBug", - "true", - "MAD_U64/I64 intra instruction forwarding bug" +defm MADIntraFwdBug : AMDGPUSubtargetFeature<"mad-intra-fwd-bug", + "MAD_U64/I64 intra instruction forwarding bug", + /*GenPredicate=*/1, + 
/*GenAssemblerPredicate=*/0 >; -def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug", - "HasMSAALoadDstSelBug", - "true", - "MSAA loads not honoring dst_sel bug" +defm MSAALoadDstSelBug : AMDGPUSubtargetFeature<"msaa-load-dst-sel-bug", + "MSAA loads not honoring dst_sel bug", + /*GenPredicate=*/0 >; -def FeaturePrivEnabledTrap2NopBug : SubtargetFeature<"priv-enabled-trap2-nop-bug", - "HasPrivEnabledTrap2NopBug", - "true", - "Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug" +defm PrivEnabledTrap2NopBug : AMDGPUSubtargetFeature<"priv-enabled-trap2-nop-bug", + "Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug", + /*GenPredicate=*/0 >; class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < @@ -392,28 +377,24 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; -def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", - "GCN3Encoding", - "true", - "Encoding format for VI" +defm GCN3Encoding : AMDGPUSubtargetFeature<"gcn3-encoding", + "Encoding format for VI", + /*GenPredicate=*/0 >; -def FeatureCIInsts : SubtargetFeature<"ci-insts", - "CIInsts", - "true", - "Additional instructions for CI+" +defm CIInsts : AMDGPUSubtargetFeature<"ci-insts", + "Additional instructions for CI+", + /*GenPredicate=*/0 >; -def FeatureGFX8Insts : SubtargetFeature<"gfx8-insts", - "GFX8Insts", - "true", - "Additional instructions for GFX8+" +defm GFX8Insts : AMDGPUSubtargetFeature<"gfx8-insts", + "Additional instructions for GFX8+", + /*GenPredicate=*/0 >; -def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", - "GFX9Insts", - "true", - "Additional instructions for GFX9+" +defm GFX9Insts : AMDGPUSubtargetFeature<"gfx9-insts", + "Additional instructions for GFX9+", + /*GenPredicate=*/0 >; def FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2", @@ -422,83 +403,72 @@ def 
FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2", "VGPR and AGPR tuple operands require even alignment" >; -def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts", - "GFX90AInsts", - "true", - "Additional instructions for GFX90A+" - // [HasAtomicFMinFMaxF64GlobalInsts, HasAtomicFMinFMaxF64FlatInsts] // TODO +defm GFX90AInsts : AMDGPUSubtargetFeature<"gfx90a-insts", + "Additional instructions for GFX90A+", + /*GenPredicate=*/0 >; -def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts", - "GFX940Insts", - "true", - "Additional instructions for GFX940+" +defm GFX940Insts : AMDGPUSubtargetFeature<"gfx940-insts", + "Additional instructions for GFX940+", + /*GenPredicate=*/0 >; -def FeaturePermlane16Swap : SubtargetFeature<"permlane16-swap", - "HasPermlane16Swap", - "true", +defm Permlane16Swap : AMDGPUSubtargetFeature<"permlane16-swap", "Has v_permlane16_swap_b32 instructions" >; -def FeaturePermlane32Swap : SubtargetFeature<"permlane32-swap", - "HasPermlane32Swap", - "true", +defm Permlane32Swap : AMDGPUSubtargetFeature<"permlane32-swap", "Has v_permlane32_swap_b32 instructions" >; -def FeatureFP8ConversionScaleInsts : SubtargetFeature<"fp8-cvt-scale-insts", - "HasFP8ConversionScaleInsts", - "true", +defm FP8ConversionScaleInsts : AMDGPUSubtargetFeature<"fp8-cvt-scale-insts", "Has fp8 conversion scale instructions" >; -def FeatureBF8ConversionScaleInsts : SubtargetFeature<"bf8-cvt-scale-insts", - "HasBF8ConversionScaleInsts", - "true", +defm BF8ConversionScaleInsts : AMDGPUSubtargetFeature<"bf8-cvt-scale-insts", "Has bf8 conversion scale instructions" >; -def FeatureFP4ConversionScaleInsts : SubtargetFeature<"fp4-cvt-scale-insts", - "HasFP4ConversionScaleInsts", - "true", +defm FP4ConversionScaleInsts : AMDGPUSubtargetFeature<"fp4-cvt-scale-insts", "Has fp4 conversion scale instructions" >; -def FeatureFP6BF6ConversionScaleInsts : SubtargetFeature<"fp6bf6-cvt-scale-insts", - "HasFP6BF6ConversionScaleInsts", - "true", +defm FP6BF6ConversionScaleInsts : 
AMDGPUSubtargetFeature<"fp6bf6-cvt-scale-insts", "Has fp6 and bf6 conversion scale instructions" >; -def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp6bf6-cvt-scale-insts", - "HasF16BF16ToFP6BF6ConversionScaleInsts", - "true", +defm F16BF16ToFP6BF6ConversionScaleInsts : AMDGPUSubtargetFeature<"f16bf16-to-fp6bf6-cvt-scale-insts", "Has f16bf16 to fp6bf6 conversion scale instructions" >; -def FeatureF32ToF16BF16ConversionSRInsts : SubtargetFeature<"f32-to-f16bf16-cvt-sr-insts", - "HasF32ToF16BF16ConversionSRInsts", - "true", +defm F32ToF16BF16ConversionSRInsts : AMDGPUSubtargetFeature<"f32-to-f16bf16-cvt-sr-insts", "Has f32 to f16bf16 conversion scale instructions" >; -def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts", - "HasAshrPkInsts", - "true", +defm AshrPkInsts : AMDGPUSubtargetFeature<"ashr-pk-insts", "Has Arithmetic Shift Pack instructions" >; -def FeatureCvtPkF16F32Inst : SubtargetFeature<"cvt-pk-f16-f32-inst", - "HasCvtPkF16F32Inst", - "true", +defm CvtPkF16F32Inst : AMDGPUSubtargetFeature<"cvt-pk-f16-f32-inst", "Has cvt_pk_f16_f32 instruction" >; -def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", - "GFX950Insts", - "true", +defm McastLoadInsts : AMDGPUSubtargetFeature<"mcast-load-insts", + "Has multicast load instructions" +>; + +defm SWakeupImm : AMDGPUSubtargetFeature<"s-wakeup-imm", + "s_wakeup takes an immediate operand" +>; + +defm SBarrierLeaveImm : AMDGPUSubtargetFeature<"s-barrier-leave-imm", + "s_barrier_leave takes an immediate operand" +>; + +defm GFX950Insts : AMDGPUSubtargetFeature<"gfx950-insts", "Additional instructions for GFX950+", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeaturePermlane16Swap, FeaturePermlane32Swap, FeatureAshrPkInsts, @@ -514,63 +484,59 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", ] >; -def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", - "GFX10Insts", - "true", - "Additional instructions for GFX10+" +defm GFX10Insts : 
AMDGPUSubtargetFeature<"gfx10-insts", + "Additional instructions for GFX10+", + /*GenPredicate=*/0 >; -def FeatureGFX11Insts : SubtargetFeature<"gfx11-insts", - "GFX11Insts", - "true", - "Additional instructions for GFX11+" +defm GFX11Insts : AMDGPUSubtargetFeature<"gfx11-insts", + "Additional instructions for GFX11+", + /*GenPredicate=*/0 >; -def FeatureGFX12Insts : SubtargetFeature<"gfx12-insts", - "GFX12Insts", - "true", - "Additional instructions for GFX12+" +defm GFX12Insts : AMDGPUSubtargetFeature<"gfx12-insts", + "Additional instructions for GFX12+", + /*GenPredicate=*/0 >; -def FeatureGFX1250Insts : SubtargetFeature<"gfx1250-insts", - "GFX1250Insts", - "true", - "Additional instructions for GFX1250+" +defm GFX1250Insts : AMDGPUSubtargetFeature<"gfx1250-insts", + "Additional instructions for GFX1250+", + /*GenPredicate=*/0 >; -def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts", - "GFX10_3Insts", - "true", - "Additional instructions for GFX10.3" +defm GFX13Insts : AMDGPUSubtargetFeature<"gfx13-insts", + "Additional instructions for GFX13+", + /*GenPredicate=*/0, + /*GenAssemblerPredicate=*/0, + [FeatureSWakeupImm, + FeatureSBarrierLeaveImm, + ] >; -def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts", - "GFX7GFX8GFX9Insts", - "true", - "Instructions shared in GFX7, GFX8, GFX9" +defm GFX10_3Insts : AMDGPUSubtargetFeature<"gfx10-3-insts", + "Additional instructions for GFX10.3", + /*GenPredicate=*/0 >; -def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", - "HasSMemRealTime", - "true", +defm GFX7GFX8GFX9Insts : AMDGPUSubtargetFeature<"gfx7-gfx8-gfx9-insts", + "Instructions shared in GFX7, GFX8, GFX9", + /*GenPredicate=*/0 +>; + +defm SMemRealTime : AMDGPUSubtargetFeature<"s-memrealtime", "Has s_memrealtime instruction" >; -def FeatureInv2PiInlineImm : SubtargetFeature<"inv-2pi-inline-imm", - "HasInv2PiInlineImm", - "true", - "Has 1 / (2 * pi) as inline immediate" +defm Inv2PiInlineImm : 
AMDGPUSubtargetFeature<"inv-2pi-inline-imm", + "Has 1 / (2 * pi) as inline immediate", + /*GenPredicate=*/0 >; -def Feature16BitInsts : SubtargetFeature<"16-bit-insts", - "Has16BitInsts", - "true", +defm 16BitInsts : AMDGPUSubtargetFeature<"16-bit-insts", "Has i16/f16 instructions" >; -def FeatureTrue16BitInsts : SubtargetFeature<"true16", - "HasTrue16BitInsts", - "true", +defm True16BitInsts : AMDGPUSubtargetFeature<"true16", "True 16-bit operand instructions" >; @@ -580,100 +546,75 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16", "Use true 16-bit registers" >; -def FeatureD16Writes32BitVgpr : SubtargetFeature<"d16-write-vgpr32", - "EnableD16Writes32BitVgpr", - "true", +defm D16Writes32BitVgpr : AMDGPUSubtargetFeature<"d16-write-vgpr32", "D16 instructions potentially have 32-bit data dependencies" >; -def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts", - "HasBF16TransInsts", - "true", +defm BF16TransInsts : AMDGPUSubtargetFeature<"bf16-trans-insts", "Has bf16 transcendental instructions" >; -def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts", - "HasBF16ConversionInsts", - "true", +defm BF16ConversionInsts : AMDGPUSubtargetFeature<"bf16-cvt-insts", "Has bf16 conversion instructions" >; -def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts", - "HasBF16PackedInsts", - "true", +defm BF16PackedInsts : AMDGPUSubtargetFeature<"bf16-pk-insts", "Has bf16 packed instructions (fma, add, mul, max, min)" >; -def FeatureVOP3P : SubtargetFeature<"vop3p", - "HasVOP3PInsts", - "true", +defm VOP3PInsts : AMDGPUSubtargetFeature<"vop3p", "Has VOP3P packed instructions" >; -def FeatureMovrel : SubtargetFeature<"movrel", - "HasMovrel", - "true", +defm Movrel : AMDGPUSubtargetFeature<"movrel", "Has v_movrel*_b32 instructions" >; -def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode", - "HasVGPRIndexMode", - "true", +defm VGPRIndexMode : AMDGPUSubtargetFeature<"vgpr-index-mode", "Has VGPR mode register indexing" >; -def 
FeatureScalarDwordx3Loads : SubtargetFeature<"scalar-dwordx3-loads", - "HasScalarDwordx3Loads", - "true", +defm ScalarDwordx3Loads : AMDGPUSubtargetFeature<"scalar-dwordx3-loads", "Has 96-bit scalar load instructions" >; -def FeatureScalarStores : SubtargetFeature<"scalar-stores", - "HasScalarStores", - "true", +defm ScalarStores : AMDGPUSubtargetFeature<"scalar-stores", "Has store scalar memory instructions" >; -def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics", - "HasScalarAtomics", - "true", +defm ScalarAtomics : AMDGPUSubtargetFeature<"scalar-atomics", "Has atomic scalar memory instructions" >; -def FeatureSDWA : SubtargetFeature<"sdwa", - "HasSDWA", - "true", - "Support SDWA (Sub-DWORD Addressing) extension" +defm SDWA : AMDGPUSubtargetFeature<"sdwa", + "Support SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; -def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", - "HasSDWAOmod", - "true", - "Support OMod with SDWA (Sub-DWORD Addressing) extension" +defm SDWAOmod : AMDGPUSubtargetFeature<"sdwa-omod", + "Support OMod with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; -def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", - "HasSDWAScalar", - "true", - "Support scalar register with SDWA (Sub-DWORD Addressing) extension" +defm SDWAScalar : AMDGPUSubtargetFeature<"sdwa-scalar", + "Support scalar register with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; -def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", - "HasSDWASdst", - "true", - "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" +defm SDWASdst : AMDGPUSubtargetFeature<"sdwa-sdst", + "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; -def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", - "HasSDWAMac", - "true", - "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" +defm SDWAMac : AMDGPUSubtargetFeature<"sdwa-mav", + "Support 
v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; -def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc", - "HasSDWAOutModsVOPC", - "true", - "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" +defm SDWAOutModsVOPC : AMDGPUSubtargetFeature<"sdwa-out-mods-vopc", + "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; def FeatureDPP : SubtargetFeature<"dpp", @@ -689,270 +630,227 @@ def FeatureDPP8 : SubtargetFeature<"dpp8", "Support DPP8 (Data Parallel Primitives) extension" >; -def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit", - "HasDPALU_DPP", - "true", +defm DPALU_DPP : AMDGPUSubtargetFeature<"dpp-64bit", "Support DPP (Data Parallel Primitives) extension in DP ALU" >; -def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr", - "HasDPPSrc1SGPR", - "true", - "Support SGPR for Src1 of DPP instructions" +defm DPPSrc1SGPR : AMDGPUSubtargetFeature<"dpp-src1-sgpr", + "Support SGPR for Src1 of DPP instructions", + /*GenPredicate=*/0 >; -def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops", - "HasPackedFP32Ops", - "true", +defm PackedFP32Ops : AMDGPUSubtargetFeature<"packed-fp32-ops", "Support packed fp32 instructions" >; -def FeatureR128A16 : SubtargetFeature<"r128-a16", - "HasR128A16", - "true", - "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands, where a16 is aliased with r128" +defm R128A16 : AMDGPUSubtargetFeature<"r128-a16", + "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image " + "operands, where a16 is aliased with r128" >; -def FeatureA16 : SubtargetFeature<"a16", - "HasA16", - "true", +defm A16 : AMDGPUSubtargetFeature<"a16", "Support A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands" >; -def FeatureG16 : SubtargetFeature<"g16", - "HasG16", - "true", +defm G16 : AMDGPUSubtargetFeature<"g16", "Support G16 for 16-bit gradient image operands" >; -def FeatureNSAEncoding : 
SubtargetFeature<"nsa-encoding", - "HasNSAEncoding", - "true", - "Support NSA encoding for image instructions" +defm NSAEncoding : AMDGPUSubtargetFeature<"nsa-encoding", + "Support NSA encoding for image instructions", + /*GenPredicate=*/0 >; -def FeaturePartialNSAEncoding : SubtargetFeature<"partial-nsa-encoding", - "HasPartialNSAEncoding", - "true", - "Support partial NSA encoding for image instructions" +defm PartialNSAEncoding : AMDGPUSubtargetFeature<"partial-nsa-encoding", + "Support partial NSA encoding for image instructions", + /*GenPredicate=*/0 >; -def FeatureImageInsts : SubtargetFeature<"image-insts", - "HasImageInsts", - "true", +defm ImageInsts : AMDGPUSubtargetFeature<"image-insts", "Support image instructions" >; -def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts", - "HasExtendedImageInsts", - "true", +defm ExtendedImageInsts : AMDGPUSubtargetFeature<"extended-image-insts", "Support mips != 0, lod != 0, gather4, and get_lod" >; -def FeatureGFX10_AEncoding : SubtargetFeature<"gfx10_a-encoding", - "GFX10_AEncoding", - "true", - "Has BVH ray tracing instructions" +defm GFX10_AEncoding : AMDGPUSubtargetFeature<"gfx10_a-encoding", + "Has BVH ray tracing instructions", + /*GenPredicate=*/0 >; -def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding", - "GFX10_BEncoding", - "true", - "Encoding format GFX10_B" +defm GFX10_BEncoding : AMDGPUSubtargetFeature<"gfx10_b-encoding", + "Encoding format GFX10_B", + /*GenPredicate=*/0 >; -def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", - "HasIntClamp", - "true", +defm IntClamp : AMDGPUSubtargetFeature<"int-clamp-insts", "Support clamp for integer destination" >; -def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", - "HasUnpackedD16VMem", - "true", +defm UnpackedD16VMem : AMDGPUSubtargetFeature<"unpacked-d16-vmem", "Has unpacked d16 vmem instructions" >; -def FeatureDLInsts : SubtargetFeature<"dl-insts", - "HasDLInsts", - "true", +defm DLInsts : 
AMDGPUSubtargetFeature<"dl-insts", "Has v_fmac_f32 and v_xnor_b32 instructions" >; -def FeatureFmacF64Inst : SubtargetFeature<"fmacf64-inst", - "HasFmacF64Inst", - "true", +defm FmacF64Inst : AMDGPUSubtargetFeature<"fmacf64-inst", "Has v_fmac_f64 instruction" >; -def FeatureDot1Insts : SubtargetFeature<"dot1-insts", - "HasDot1Insts", - "true", +defm Dot1Insts : AMDGPUSubtargetFeature<"dot1-insts", "Has v_dot4_i32_i8 and v_dot8_i32_i4 instructions" >; -def FeatureDot2Insts : SubtargetFeature<"dot2-insts", - "HasDot2Insts", - "true", +defm Dot2Insts : AMDGPUSubtargetFeature<"dot2-insts", "Has v_dot2_i32_i16, v_dot2_u32_u16 instructions" >; -def FeatureDot3Insts : SubtargetFeature<"dot3-insts", - "HasDot3Insts", - "true", +defm Dot3Insts : AMDGPUSubtargetFeature<"dot3-insts", "Has v_dot8c_i32_i4 instruction" >; -def FeatureDot4Insts : SubtargetFeature<"dot4-insts", - "HasDot4Insts", - "true", +defm Dot4Insts : AMDGPUSubtargetFeature<"dot4-insts", "Has v_dot2c_i32_i16 instruction" >; -def FeatureDot5Insts : SubtargetFeature<"dot5-insts", - "HasDot5Insts", - "true", +defm Dot5Insts : AMDGPUSubtargetFeature<"dot5-insts", "Has v_dot2c_f32_f16 instruction" >; -def FeatureDot6Insts : SubtargetFeature<"dot6-insts", - "HasDot6Insts", - "true", +defm Dot6Insts : AMDGPUSubtargetFeature<"dot6-insts", "Has v_dot4c_i32_i8 instruction" >; -def FeatureDot7Insts : SubtargetFeature<"dot7-insts", - "HasDot7Insts", - "true", +defm Dot7Insts : AMDGPUSubtargetFeature<"dot7-insts", "Has v_dot4_u32_u8, v_dot8_u32_u4 instructions" >; -def FeatureDot8Insts : SubtargetFeature<"dot8-insts", - "HasDot8Insts", - "true", +defm Dot8Insts : AMDGPUSubtargetFeature<"dot8-insts", "Has v_dot4_i32_iu8, v_dot8_i32_iu4 instructions" >; -def FeatureDot9Insts : SubtargetFeature<"dot9-insts", - "HasDot9Insts", - "true", +defm Dot9Insts : AMDGPUSubtargetFeature<"dot9-insts", "Has v_dot2_f16_f16, v_dot2_bf16_bf16 instructions" >; -def FeatureDot10Insts : SubtargetFeature<"dot10-insts", - "HasDot10Insts", - 
"true", +defm Dot10Insts : AMDGPUSubtargetFeature<"dot10-insts", "Has v_dot2_f32_f16 instruction" >; -def FeatureDot11Insts : SubtargetFeature<"dot11-insts", - "HasDot11Insts", - "true", +defm Dot11Insts : AMDGPUSubtargetFeature<"dot11-insts", "Has v_dot4_f32_fp8_fp8, v_dot4_f32_fp8_bf8, v_dot4_f32_bf8_fp8, v_dot4_f32_bf8_bf8 instructions" >; -def FeatureDot12Insts : SubtargetFeature<"dot12-insts", - "HasDot12Insts", - "true", +defm Dot12Insts : AMDGPUSubtargetFeature<"dot12-insts", "Has v_dot2_f32_bf16 instructions" >; -def FeatureDot13Insts : SubtargetFeature<"dot13-insts", - "HasDot13Insts", - "true", +defm Dot13Insts : AMDGPUSubtargetFeature<"dot13-insts", "Has v_dot2c_f32_bf16 instructions" >; - -def FeatureMAIInsts : SubtargetFeature<"mai-insts", - "HasMAIInsts", - "true", +defm MAIInsts : AMDGPUSubtargetFeature<"mai-insts", "Has mAI instructions" >; -def FeatureFP8Insts : SubtargetFeature<"fp8-insts", - "HasFP8Insts", - "true", +defm FP8Insts : AMDGPUSubtargetFeature<"fp8-insts", "Has fp8 and bf8 instructions" >; -def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts", - "HasFP8ConversionInsts", - "true", +defm FP8ConversionInsts : AMDGPUSubtargetFeature<"fp8-conversion-insts", "Has fp8 and bf8 conversion instructions" >; -def FeatureFP8E5M3Insts : SubtargetFeature<"fp8e5m3-insts", - "HasFP8E5M3Insts", - "true", +defm FP8E5M3Insts : AMDGPUSubtargetFeature<"fp8e5m3-insts", "Has fp8 e5m3 format support" >; -def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug", - "HasCvtFP8Vop1Bug", - "true", +defm CvtFP8VOP1Bug : AMDGPUSubtargetFeature<"cvt-fp8-vop1-bug", "FP8/BF8 VOP1 form of conversion to F32 is unreliable", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0, [FeatureFP8ConversionInsts] >; -def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", - "HasPkFmacF16Inst", - "true", +defm PkFmacF16Inst : AMDGPUSubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; -def FeatureAtomicDsPkAdd16Insts : 
SubtargetFeature<"atomic-ds-pk-add-16-insts", - "HasAtomicDsPkAdd16Insts", - "true", +defm CubeInsts : AMDGPUSubtargetFeature<"cube-insts", + "Has v_cube* instructions" +>; + +defm LerpInst : AMDGPUSubtargetFeature<"lerp-inst", + "Has v_lerp_u8 instruction" +>; + +defm SadInsts : AMDGPUSubtargetFeature<"sad-insts", + "Has v_sad* instructions" +>; + +defm QsadInsts : AMDGPUSubtargetFeature<"qsad-insts", + "Has v_qsad* instructions" +>; + +defm CvtNormInsts : AMDGPUSubtargetFeature<"cvt-norm-insts", + "Has v_cvt_norm* instructions" +>; + +defm CvtPkNormVOP2Insts : AMDGPUSubtargetFeature<"cvt-pknorm-vop2-insts", + "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" +>; + +defm CvtPkNormVOP3Insts : AMDGPUSubtargetFeature<"cvt-pknorm-vop3-insts", + "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" +>; + +defm AtomicDsPkAdd16Insts : AMDGPUSubtargetFeature<"atomic-ds-pk-add-16-insts", "Has ds_pk_add_bf16, ds_pk_add_f16, ds_pk_add_rtn_bf16, " "ds_pk_add_rtn_f16 instructions" >; -def FeatureAtomicFlatPkAdd16Insts : SubtargetFeature<"atomic-flat-pk-add-16-insts", - "HasAtomicFlatPkAdd16Insts", - "true", +defm AtomicFlatPkAdd16Insts : AMDGPUSubtargetFeature<"atomic-flat-pk-add-16-insts", "Has flat_atomic_pk_add_f16 and flat_atomic_pk_add_bf16 instructions" >; -def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", - "HasAtomicFaddRtnInsts", - "true", +defm AtomicFaddRtnInsts : AMDGPUSubtargetFeature<"atomic-fadd-rtn-insts", "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that " "return original value", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatGlobalInsts] >; -def FeatureAtomicFMinFMaxF32GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f32", - "HasAtomicFMinFMaxF32GlobalInsts", - "true", +defm AtomicFMinFMaxF32GlobalInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-global-f32", "Has global/buffer instructions for atomicrmw fmin/fmax for float" >; -def 
FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f64", - "HasAtomicFMinFMaxF64GlobalInsts", - "true", +defm AtomicFMinFMaxF64GlobalInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-global-f64", "Has global/buffer instructions for atomicrmw fmin/fmax for float" >; -def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32", - "HasAtomicFMinFMaxF32FlatInsts", - "true", +defm AtomicFMinFMaxF32FlatInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-flat-f32", "Has flat memory instructions for atomicrmw fmin/fmax for float", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64", - "HasAtomicFMinFMaxF64FlatInsts", - "true", +defm AtomicFMinFMaxF64FlatInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-flat-f64", "Has flat memory instructions for atomicrmw fmin/fmax for double", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", - "HasAtomicFaddNoRtnInsts", - "true", +defm AtomicFaddNoRtnInsts : AMDGPUSubtargetFeature<"atomic-fadd-no-rtn-insts", "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that " "don't return original value", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatGlobalInsts] >; @@ -965,46 +863,40 @@ def FeatureAtomicBufferGlobalPkAddF16NoRtnInsts [FeatureFlatGlobalInsts] >; -def FeatureAtomicBufferGlobalPkAddF16Insts : SubtargetFeature<"atomic-buffer-global-pk-add-f16-insts", - "HasAtomicBufferGlobalPkAddF16Insts", - "true", - "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " - "can return original value", - [FeatureFlatGlobalInsts] +defm AtomicBufferGlobalPkAddF16Insts : AMDGPUSubtargetFeature<"atomic-buffer-global-pk-add-f16-insts", + "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " + "can 
return original value", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, + [FeatureFlatGlobalInsts] >; -def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf16-inst", - "HasAtomicGlobalPkAddBF16Inst", - "true", - "Has global_atomic_pk_add_bf16 instruction", - [FeatureFlatGlobalInsts] +defm AtomicGlobalPkAddBF16Inst : AMDGPUSubtargetFeature<"atomic-global-pk-add-bf16-inst", + "Has global_atomic_pk_add_bf16 instruction", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, + [FeatureFlatGlobalInsts] >; -def FeatureAtomicBufferPkAddBF16Inst : SubtargetFeature<"atomic-buffer-pk-add-bf16-inst", - "HasAtomicBufferPkAddBF16Inst", - "true", - "Has buffer_atomic_pk_add_bf16 instruction" +defm AtomicBufferPkAddBF16Inst : AMDGPUSubtargetFeature<"atomic-buffer-pk-add-bf16-inst", + "Has buffer_atomic_pk_add_bf16 instruction" >; -def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts", - "HasAtomicCSubNoRtnInsts", - "true", +defm AtomicCSubNoRtnInsts : AMDGPUSubtargetFeature<"atomic-csub-no-rtn-insts", "Has buffer_atomic_csub and global_atomic_csub instructions that don't " - "return original value" + "return original value", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; -def FeatureFlatAtomicFaddF32Inst - : SubtargetFeature<"flat-atomic-fadd-f32-inst", - "HasFlatAtomicFaddF32Inst", - "true", +defm FlatAtomicFaddF32Inst : AMDGPUSubtargetFeature<"flat-atomic-fadd-f32-inst", "Has flat_atomic_add_f32 instruction", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureFlatBufferGlobalAtomicFaddF64Inst - : SubtargetFeature<"flat-buffer-global-fadd-f64-inst", - "HasFlatBufferGlobalAtomicFaddF64Inst", - "true", +defm FlatBufferGlobalAtomicFaddF64Inst : AMDGPUSubtargetFeature<"flat-buffer-global-fadd-f64-inst", "Has flat, buffer, and global instructions for f64 atomic fadd" >; @@ -1015,33 +907,27 @@ def FeatureMemoryAtomicFAddF32DenormalSupport "global/flat/buffer atomic fadd for float 
supports denormal handling" >; -def FeatureAgentScopeFineGrainedRemoteMemoryAtomics - : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics", - "HasAgentScopeFineGrainedRemoteMemoryAtomics", - "true", +defm AgentScopeFineGrainedRemoteMemoryAtomics : AMDGPUSubtargetFeature< + "agent-scope-fine-grained-remote-memory-atomics", "Agent (device) scoped atomic operations, excluding those directly " "supported by PCIe (i.e. integer atomic add, exchange, and " "compare-and-swap), are functional for allocations in host or peer " - "device memory." + "device memory.", + /*GenPredicate=*/0 >; -def FeatureEmulatedSystemScopeAtomics - : SubtargetFeature<"emulated-system-scope-atomics", - "HasEmulatedSystemScopeAtomics", - "true", +defm EmulatedSystemScopeAtomics : AMDGPUSubtargetFeature< + "emulated-system-scope-atomics", "System scope atomics unsupported by the PCI-e are emulated in HW via CAS " - "loop and functional." + "loop and functional.", + /*GenPredicate=*/0 >; -def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero", - "HasDefaultComponentZero", - "true", +defm DefaultComponentZero : AMDGPUSubtargetFeature<"default-component-zero", "BUFFER/IMAGE store instructions set unspecified components to zero (before GFX12)" >; -def FeatureDefaultComponentBroadcast : SubtargetFeature<"default-component-broadcast", - "HasDefaultComponentBroadcast", - "true", +defm DefaultComponentBroadcast : AMDGPUSubtargetFeature<"default-component-broadcast", "BUFFER/IMAGE store instructions set unspecified components to x component (GFX12)" >; @@ -1057,183 +943,144 @@ def FeatureSRAMECC : SubtargetFeature<"sramecc", "Enable SRAMECC" >; -def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", - "HasNoSdstCMPX", - "true", +defm NoSdstCMPX : AMDGPUSubtargetFeature<"no-sdst-cmpx", "V_CMPX does not write VCC/SGPR in addition to EXEC" >; -def FeatureVscnt : SubtargetFeature<"vscnt", - "HasVscnt", - "true", - "Has separate store vscnt counter" +defm Vscnt : 
AMDGPUSubtargetFeature<"vscnt", + "Has separate store vscnt counter", + /*GenPredicate=*/0 >; -def FeatureGetWaveIdInst : SubtargetFeature<"get-wave-id-inst", - "HasGetWaveIdInst", - "true", +defm GetWaveIdInst : AMDGPUSubtargetFeature<"get-wave-id-inst", "Has s_get_waveid_in_workgroup instruction" >; -def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst", - "HasSMemTimeInst", - "true", +defm SMemTimeInst : AMDGPUSubtargetFeature<"s-memtime-inst", "Has s_memtime instruction" >; -def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register", - "HasShaderCyclesRegister", - "true", +defm ShaderCyclesRegister : AMDGPUSubtargetFeature<"shader-cycles-register", "Has SHADER_CYCLES hardware register" >; -def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers", - "HasShaderCyclesHiLoRegisters", - "true", - "Has SHADER_CYCLES_HI/LO hardware registers" +defm ShaderCyclesHiLoRegisters : AMDGPUSubtargetFeature<"shader-cycles-hi-lo-registers", + "Has SHADER_CYCLES_HI/LO hardware registers", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; -def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts", - "HasMadMacF32Insts", - "true", +defm MadMacF32Insts : AMDGPUSubtargetFeature<"mad-mac-f32-insts", "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions" >; -def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts", - "HasDsSrc2Insts", - "true", +defm DsSrc2Insts : AMDGPUSubtargetFeature<"ds-src2-insts", "Has ds_*_src2 instructions" >; -def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", - "HasVOP3Literal", - "true", - "Can use one literal in VOP3" +defm VOP3Literal : AMDGPUSubtargetFeature<"vop3-literal", + "Can use one literal in VOP3", + /*GenPredicate=*/0 >; -def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", - "HasNoDataDepHazard", - "true", - "Does not need SW waitstates" +defm NoDataDepHazard : AMDGPUSubtargetFeature<"no-data-dep-hazard", + "Does not need SW waitstates", + 
/*GenPredicate=*/0 >; // Allocate 1536 VGPRs for wave32 and 768 VGPRs for wave64 // with allocation granularity 24 for wave32 and 12 for wave64 -def Feature1_5xVGPRs : SubtargetFeature<"allocate1_5xvgprs", - "Has1_5xVGPRs", - "true", - "Has 50% more physical VGPRs and 50% larger allocation granule" +defm 1_5xVGPRs : AMDGPUSubtargetFeature<"allocate1_5xvgprs", + "Has 50% more physical VGPRs and 50% larger allocation granule", + /*GenPredicate=*/0 >; - -def FeatureVOPD : SubtargetFeature<"vopd", - "HasVOPDInsts", - "true", - "Has VOPD dual issue wave32 instructions" +defm VOPDInsts : AMDGPUSubtargetFeature<"vopd", + "Has VOPD dual issue wave32 instructions", + /*GenPredicate=*/0 >; -def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", - "HasVALUTransUseHazard", - "true", - "Hazard when TRANS instructions are closely followed by a use of the result" +defm VALUTransUseHazard : AMDGPUSubtargetFeature<"valu-trans-use-hazard", + "Hazard when TRANS instructions are closely followed by a use of the result", + /*GenPredicate=*/0 >; -def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", - "HasSALUFloatInsts", - "true", +defm SALUFloatInsts : AMDGPUSubtargetFeature<"salu-float", "Has SALU floating point instructions" >; -def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans", - "HasPseudoScalarTrans", - "true", +defm PseudoScalarTrans : AMDGPUSubtargetFeature<"pseudo-scalar-trans", "Has Pseudo Scalar Transcendental instructions" >; -def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset", - "HasRestrictedSOffset", - "true", +defm RestrictedSOffset : AMDGPUSubtargetFeature<"restricted-soffset", "Has restricted SOffset (immediate not supported)." 
>; -def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority", - "HasRequiredExportPriority", - "true", - "Export priority must be explicitly manipulated on GFX11.5" +defm RequiredExportPriority : AMDGPUSubtargetFeature<"required-export-priority", + "Export priority must be explicitly manipulated on GFX11.5", + /*GenPredicate=*/0 >; -def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order", - "HasVmemWriteVgprInOrder", - "true", - "VMEM instructions of the same type write VGPR results in order" +defm VmemWriteVgprInOrder : AMDGPUSubtargetFeature<"vmem-write-vgpr-in-order", + "VMEM instructions of the same type write VGPR results in order", + /*GenPredicate=*/0 >; -def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", - "HasBitOp3Insts", - "true", +defm BitOp3Insts : AMDGPUSubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; -def FeatureTanhInsts : SubtargetFeature<"tanh-insts", - "HasTanhInsts", - "true", +defm TanhInsts : AMDGPUSubtargetFeature<"tanh-insts", "Has v_tanh_f32/f16 instructions" >; -def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts", - "HasTensorCvtLutInsts", - "true", +defm TensorCvtLutInsts : AMDGPUSubtargetFeature<"tensor-cvt-lut-insts", "Has v_perm_pk16* instructions" >; -def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", - "HasTransposeLoadF4F6Insts", - "true", +defm TransposeLoadF4F6Insts : AMDGPUSubtargetFeature<"transpose-load-f4f6-insts", "Has ds_load_tr4/tr6 and global_load_tr4/tr6 instructions" >; -def FeaturePrngInst : SubtargetFeature<"prng-inst", - "HasPrngInst", - "true", +defm PrngInst : AMDGPUSubtargetFeature<"prng-inst", "Has v_prng_b32 instruction" >; -def FeatureBVHDualAndBVH8Insts : SubtargetFeature<"bvh-dual-bvh-8-insts", - "HasBVHDualAndBVH8Insts", - "true", +defm BVHDualAndBVH8Insts : AMDGPUSubtargetFeature<"bvh-dual-bvh-8-insts", "Has image_bvh_dual_intersect_ray and image_bvh8_intersect_ray 
instructions" >; -def FeaturePointSampleAccel : SubtargetFeature<"point-sample-accel", - "HasPointSampleAccel", - "true", - "Has point sample acceleration feature" +defm PointSampleAccel : AMDGPUSubtargetFeature<"point-sample-accel", + "Has point sample acceleration feature", + /*GenPredicate=*/0 >; -def Feature64BitLiterals : SubtargetFeature<"64-bit-literals", - "Has64BitLiterals", - "true", +defm 64BitLiterals : AMDGPUSubtargetFeature<"64-bit-literals", "Can use 64-bit literals with single DWORD instructions" >; -def Feature1024AddressableVGPRs : SubtargetFeature<"1024-addressable-vgprs", - "Has1024AddressableVGPRs", - "true", +defm 1024AddressableVGPRs : AMDGPUSubtargetFeature<"1024-addressable-vgprs", "Has 1024 addressable VGPRs" >; -def FeatureWaitXcnt : SubtargetFeature<"wait-xcnt", - "HasWaitXcnt", - "true", +defm SetregVGPRMSBFixup : AMDGPUSubtargetFeature<"setreg-vgpr-msb-fixup", + "S_SETREG to MODE clobbers VGPR MSB bits, requires fixup", + /*GenPredicate=*/0 +>; + +defm WaitXcnt : AMDGPUSubtargetFeature<"wait-xcnt", "Has s_wait_xcnt instruction" >; -def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst", - "HasSetPrioIncWgInst", - "true", +defm SetPrioIncWgInst : AMDGPUSubtargetFeature<"setprio-inc-wg-inst", "Has s_setprio_inc_wg instruction." >; +defm SWakeupBarrier : AMDGPUSubtargetFeature<"s-wakeup-barrier-inst", + "Has s_wakeup_barrier instruction." +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1244,11 +1091,9 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst", // wave32 and wave64. Instead what users do is assemble with both // wavesizes enabled. We translate this into this special mode so this // only influences assembler behavior and nothing else. 
-def FeatureAssemblerPermissiveWavesize : SubtargetFeature< - "assembler-permissive-wavesize", - "AssemblerPermissiveWavesize", - "true", - "allow parsing wave32 and wave64 variants of instructions" +defm AssemblerPermissiveWavesize : AMDGPUSubtargetFeature<"assembler-permissive-wavesize", + "Allow parsing wave32 and wave64 variants of instructions", + /*GenPredicate=*/0 >; class FeatureMaxPrivateElementSize<int size> : SubtargetFeature< @@ -1262,12 +1107,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; -def FeatureDumpCode : SubtargetFeature <"DumpCode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter" ->; - def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", "DumpCode", "true", @@ -1321,74 +1160,64 @@ def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null", // FIXME: moveToVALU should be able to handle converting addr64 MUBUF // instructions. 
-def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", - "FlatForGlobal", +def FeatureUseFlatForGlobal : SubtargetFeature<"flat-for-global", + "UseFlatForGlobal", "true", "Force to generate flat instruction for global" >; -def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < - "auto-waitcnt-before-barrier", - "AutoWaitcntBeforeBarrier", - "true", - "Hardware automatically inserts waitcnt before barrier" +defm AutoWaitcntBeforeBarrier : AMDGPUSubtargetFeature <"auto-waitcnt-before-barrier", + "Hardware automatically inserts waitcnt before barrier", + /*GenPredicate=*/0 >; -def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier", - "BackOffBarrier", - "true", - "Hardware supports backing off s_barrier if an exception occurs" +defm BackOffBarrier : AMDGPUSubtargetFeature <"back-off-barrier", + "Hardware supports backing off s_barrier if an exception occurs", + /*GenPredicate=*/0 >; -def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range", - "HasTrigReducedRange", - "true", - "Requires use of fract on arguments to trig instructions" +defm TrigReducedRange : AMDGPUSubtargetFeature<"trig-reduced-range", + "Requires use of fract on arguments to trig instructions", + /*GenPredicate=*/0 >; -def FeatureKernargPreload : SubtargetFeature <"kernarg-preload", - "KernargPreload", - "true", - "Hardware supports preloading of kernel arguments in user SGPRs." 
+defm KernargPreload : AMDGPUSubtargetFeature <"kernarg-preload", + "Hardware supports preloading of kernel arguments in user SGPRs.", + /*GenPredicate=*/0 >; // Alignment enforcement is controlled by a configuration register: // SH_MEM_CONFIG.alignment_mode -def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode", - "UnalignedAccessMode", - "true", +defm UnalignedAccessMode : AMDGPUSubtargetFeature<"unaligned-access-mode", "Enable unaligned global, local and region loads and stores if the hardware" " supports it" >; -def FeaturePackedTID : SubtargetFeature<"packed-tid", - "HasPackedTID", - "true", - "Workitem IDs are packed into v0 at kernel launch" +defm PackedTID : AMDGPUSubtargetFeature<"packed-tid", + "Workitem IDs are packed into v0 at kernel launch", + /*GenPredicate=*/0 >; -def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch", - "HasArchitectedFlatScratch", - "true", - "Flat Scratch register is a readonly SPI initialized architected register" +defm ArchitectedFlatScratch : AMDGPUSubtargetFeature<"architected-flat-scratch", + "Flat Scratch register is a readonly SPI initialized architected register", + /*GenPredicate=*/0 >; -def FeatureArchitectedSGPRs : SubtargetFeature<"architected-sgprs", - "HasArchitectedSGPRs", - "true", - "Enable the architected SGPRs" +defm ArchitectedSGPRs : AMDGPUSubtargetFeature<"architected-sgprs", + "Enable the architected SGPRs", + /*GenPredicate=*/0 >; -def FeatureGDS : SubtargetFeature<"gds", - "HasGDS", - "true", - "Has Global Data Share" +defm GDS : AMDGPUSubtargetFeature<"gds", + "Has Global Data Share", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; -def FeatureGWS : SubtargetFeature<"gws", - "HasGWS", - "true", - "Has Global Wave Sync" +defm GWS : AMDGPUSubtargetFeature<"gws", + "Has Global Wave Sync", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; def FeatureRequiresCOV6 : SubtargetFeature<"requires-cov6", @@ -1397,18 +1226,14 @@ def FeatureRequiresCOV6 : 
SubtargetFeature<"requires-cov6", "Target Requires Code Object V6" >; -def FeatureXF32Insts : SubtargetFeature<"xf32-insts", - "HasXF32Insts", - "true", - "Has instructions that support xf32 format, such as " - "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" - >; +defm XF32Insts : AMDGPUSubtargetFeature<"xf32-insts", + "Has instructions that support xf32 format, such as " + "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" +>; -def FeatureGloballyAddressableScratch : SubtargetFeature< - "globally-addressable-scratch", - "HasGloballyAddressableScratch", - "true", - "FLAT instructions can access scratch memory for any thread in any wave" +defm GloballyAddressableScratch : AMDGPUSubtargetFeature<"globally-addressable-scratch", + "FLAT instructions can access scratch memory for any thread in any wave", + /*GenPredicate=*/0 >; // Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and @@ -1419,45 +1244,56 @@ def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr", "Use block load/store for VGPR callee saved registers" >; -def FeatureLshlAddU64Inst - : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true", - "Has v_lshl_add_u64 instruction">; +defm LshlAddU64Inst : AMDGPUSubtargetFeature<"lshl-add-u64-inst", + "Has v_lshl_add_u64 instruction" +>; -def FeatureAddSubU64Insts - : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", - "Has v_add_u64 and v_sub_u64 instructions">; +defm AddSubU64Insts : AMDGPUSubtargetFeature<"add-sub-u64-insts", + "Has v_add_u64 and v_sub_u64 instructions" +>; -def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", - "true", "Has v_mad_u32 instruction">; +defm MadU32Inst : AMDGPUSubtargetFeature<"mad-u32-inst", + "Has v_mad_u32 instruction" +>; -def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", - "HasVMemToLDSLoad", - "true", - "The platform has memory to lds instructions (global_load w/lds bit set, buffer_load w/lds bit set or 
global_load_lds. This does not include scratch_load_lds." +defm AddMinMaxInsts : AMDGPUSubtargetFeature<"add-min-max-insts", + "Has v_add_{min|max}_{i|u}32 instructions" >; -def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic", - "HasLdsBarrierArriveAtomic", - "true", +defm PkAddMinMaxInsts : AMDGPUSubtargetFeature<"pk-add-min-max-insts", + "Has v_pk_add_{min|max}_{i|u}16 instructions" +>; + +defm VMemToLDSLoad : AMDGPUSubtargetFeature<"vmem-to-lds-load-insts", + "The platform has memory to lds instructions (global_load w/lds bit set, buffer_load" + "w/lds bit set or global_load_lds. This does not include scratch_load_lds." +>; + +defm LdsBarrierArriveAtomic : AMDGPUSubtargetFeature<"lds-barrier-arrive-atomic", "Has LDS barrier-arrive atomic instructions" >; -def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource", - "Has45BitNumRecordsBufferResource", - "true", - "The buffer resource (V#) supports 45-bit num_records" +defm 45BitNumRecordsBufferResource : AMDGPUSubtargetFeature<"45-bit-num-records-buffer-resource", + "The buffer resource (V#) supports 45-bit num_records", + /*GenPredicate=*/0 +>; + +defm Clusters : AMDGPUSubtargetFeature<"clusters", + "Has clusters of workgroups support", + /*GenPredicate=*/0 >; -def FeatureClusters : SubtargetFeature< "clusters", - "HasClusters", +def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature< + "waits-before-system-scope-stores", + "RequiresWaitsBeforeSystemScopeStores", "true", - "Has clusters of workgroups support" + "Target requires waits for loads and atomics before system scope stores" >; -// Dummy feature used to disable assembler instructions. -def FeatureDisable : SubtargetFeature<"", - "FeatureDisable","true", - "Dummy feature to disable assembler instructions" +def FeatureUseAddPC64Inst : SubtargetFeature<"use-add-pc64-inst", + "UseAddPC64Inst", + "true", + "Use s_add_pc_i64 instruction." 
>; //===----------------------------------------------------------------------===// @@ -1475,7 +1311,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureCvtPkNormVOP2Insts ] >; @@ -1489,7 +1326,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts ] >; @@ -1505,7 +1343,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, - FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts, + FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtPkNormVOP2Insts ] >; @@ -1515,7 +1355,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, + FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3PInsts, FeatureVGPRIndexMode, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, 
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, @@ -1524,7 +1364,10 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, - FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad, + FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ] >; @@ -1534,7 +1377,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureFlatAddressSpace, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureInv2PiInlineImm, - FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P, + FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3PInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, @@ -1548,7 +1391,10 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad, FeatureCubeInsts, + FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ] >; @@ -1559,7 +1405,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureCIInsts, FeatureGFX8Insts, 
FeatureGFX9Insts, FeatureGFX10Insts, FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, - FeatureGFX11Insts, FeatureVOP3P, FeatureVOPD, FeatureTrue16BitInsts, + FeatureGFX11Insts, FeatureVOP3PInsts, FeatureVOPDInsts, FeatureTrue16BitInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, @@ -1571,7 +1417,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts ] >; @@ -1582,7 +1430,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, - FeatureGFX11Insts, FeatureGFX12Insts, FeatureVOP3P, FeatureVOPD, + FeatureGFX11Insts, FeatureGFX12Insts, FeatureVOP3PInsts, FeatureVOPDInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, @@ -1599,6 +1447,29 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", ] >; +def FeatureGFX13 : GCNSubtargetFeatureGeneration<"GFX13", + "gfx13", + [FeatureFP64, FeatureMIMG_R128, + FeatureFlatAddressSpace, Feature16BitInsts, + FeatureInv2PiInlineImm, FeatureApertureRegs, + FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, + FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, + FeatureGFX11Insts, FeatureGFX12Insts, FeatureGFX13Insts, 
FeatureVOP3PInsts, + FeatureVOPDInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, + FeatureNoSdstCMPX, FeatureVscnt, + FeatureVOP3Literal, FeatureDPP8, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, + FeatureA16, FeatureFastDenormalF32, FeatureG16, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureImageInsts, + FeatureUnalignedDSAccess, FeatureTrue16BitInsts, + FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, + FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32, + FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics + ] +>; //===----------------------------------------------------------------------===// class FeatureSet<list<SubtargetFeature> Features_> { @@ -1607,7 +1478,7 @@ class FeatureSet<list<SubtargetFeature> Features_> { def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands, FeatureFastFMAF32, - HalfRate64Ops, + FeatureHalfRate64Ops, FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : FeatureSet< @@ -1624,7 +1495,7 @@ def FeatureISAVersion7_0_0 : FeatureSet< def FeatureISAVersion7_0_1 : FeatureSet< [FeatureSeaIslands, - HalfRate64Ops, + FeatureHalfRate64Ops, FeatureLDSBankCount32, FeatureFastFMAF32]>; @@ -1653,7 +1524,7 @@ def FeatureISAVersion8_0_Common : FeatureSet< def FeatureISAVersion8_0_1 : FeatureSet< !listconcat(FeatureISAVersion8_0_Common.Features, [FeatureFastFMAF32, - HalfRate64Ops, + FeatureHalfRate64Ops, FeatureSupportsXNACK])>; def FeatureISAVersion8_0_2 : FeatureSet< @@ -1724,7 +1595,7 @@ def FeatureISAVersion9_0_4 : FeatureSet< def FeatureISAVersion9_0_6 : FeatureSet< !listconcat(FeatureISAVersion9_0_Consumer_Common.Features, - [HalfRate64Ops, + [FeatureHalfRate64Ops, FeatureFmaMixInsts, FeatureDLInsts, FeatureDot1Insts, @@ -1736,7 +1607,7 @@ def 
FeatureISAVersion9_0_6 : FeatureSet< def FeatureISAVersion9_0_8 : FeatureSet< !listconcat(FeatureISAVersion9_0_MI_Common.Features, [FeatureGDS, - HalfRate64Ops, + FeatureHalfRate64Ops, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, @@ -1757,7 +1628,7 @@ def FeatureISAVersion9_0_A : FeatureSet< FeatureAtomicFaddRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts, FeaturePackedTID, - FullRate64Ops, + FeatureFullRate64Ops, FeatureBackOffBarrier, FeatureKernargPreload, FeatureAtomicFMinFMaxF64GlobalInsts, @@ -1800,7 +1671,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureSupportsSRAMECC, FeaturePackedTID, FeatureArchitectedFlatScratch, - FullRate64Ops, + FeatureFullRate64Ops, FeatureBackOffBarrier, FeatureKernargPreload, FeatureAtomicFMinFMaxF64GlobalInsts, @@ -1861,7 +1732,7 @@ def FeatureISAVersion10_1_Common : FeatureSet< FeatureGetWaveIdInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, - FeatureLdsMisalignedBug, + FeatureLDSMisalignedBug, FeatureSupportsXNACK, // gfx101x bugs FeatureVcmpxPermlaneHazard, @@ -2009,6 +1880,13 @@ def FeatureISAVersion11_5_3 : FeatureSet< !listconcat(FeatureISAVersion11_5_Common.Features, [])>; +def FeatureISAVersion11_7_0 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [FeatureSALUFloatInsts, + FeatureDPPSrc1SGPR, + FeatureFP8ConversionInsts, + FeatureDot11Insts])>; + def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, FeatureBackOffBarrier, @@ -2042,20 +1920,28 @@ def FeatureISAVersion12 : FeatureSet< FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, FeaturePseudoScalarTrans, - FeatureHasRestrictedSOffset, + FeatureRestrictedSOffset, FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, Feature1_5xVGPRs, FeatureMemoryAtomicFAddF32DenormalSupport, - FeatureBVHDualAndBVH8Insts + FeatureBVHDualAndBVH8Insts, + FeatureWaitsBeforeSystemScopeStores, + FeatureD16Writes32BitVgpr, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + 
FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ]>; -def FeatureISAVersion12_50 : FeatureSet< +def FeatureISAVersion12_50_Common : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureRequiresAlignedVGPRs, - FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature1024AddressableVGPRs, Feature64BitLiterals, @@ -2084,7 +1970,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, FeaturePseudoScalarTrans, - FeatureHasRestrictedSOffset, + FeatureRestrictedSOffset, FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureBitOp3Insts, @@ -2115,22 +2001,107 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureLshlAddU64Inst, FeatureAddSubU64Insts, FeatureMadU32Inst, + FeatureAddMinMaxInsts, + FeaturePkAddMinMaxInsts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, + FeatureSWakeupBarrier, Feature45BitNumRecordsBufferResource, FeatureSupportsXNACK, FeatureXNACK, FeatureClusters, + FeatureD16Writes32BitVgpr, + FeatureMcastLoadInsts ]>; +def FeatureISAVersion12_50 : FeatureSet< + !listconcat(FeatureISAVersion12_50_Common.Features, + [FeatureAddressableLocalMemorySize327680, + FeatureSetregVGPRMSBFixup, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts])>; + def FeatureISAVersion12_51 : FeatureSet< - !listconcat(FeatureISAVersion12_50.Features, - [FeatureDPALU_DPP])>; + !listconcat(FeatureISAVersion12_50_Common.Features, + [FeatureAddressableLocalMemorySize327680, + FeatureDPALU_DPP, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts])>; def FeatureISAVersion12_Generic: FeatureSet< !listconcat(FeatureISAVersion12.Features, [FeatureRequiresCOV6])>; +def FeatureISAVersion13 : FeatureSet< + [FeatureGFX13, + FeatureGFX1250Insts, + 
FeatureAddressableLocalMemorySize65536, + Feature64BitLiterals, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureFmacF64Inst, + FeatureDot7Insts, + FeatureDot8Insts, + FeatureNSAEncoding, + FeaturePartialNSAEncoding, + FeatureShaderCyclesRegister, + FeatureArchitectedFlatScratch, + FeatureArchitectedSGPRs, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicDsPkAdd16Insts, + FeatureAtomicFlatPkAdd16Insts, + FeatureAtomicBufferGlobalPkAddF16Insts, + FeatureAtomicGlobalPkAddBF16Inst, + FeatureAtomicBufferPkAddBF16Inst, + FeatureFlatAtomicFaddF32Inst, + FeatureFP8ConversionInsts, + FeaturePackedTID, + FeatureVcmpxPermlaneHazard, + FeatureSALUFloatInsts, + FeaturePseudoScalarTrans, + FeatureRestrictedSOffset, + FeatureScalarDwordx3Loads, + FeatureDPPSrc1SGPR, + FeatureBitOp3Insts, + FeatureTanhInsts, + FeatureTensorCvtLutInsts, + FeatureTransposeLoadF4F6Insts, + Feature1_5xVGPRs, + FeatureBF16TransInsts, + FeatureBF16ConversionInsts, + FeatureBF16PackedInsts, + FeaturePrngInst, + FeaturePermlane16Swap, + FeatureAshrPkInsts, + FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF64FlatInsts, + FeatureFmaMixBF16Insts, + FeatureGloballyAddressableScratch, + FeatureCvtPkF16F32Inst, + FeatureF16BF16ToFP6BF6ConversionScaleInsts, + FeatureIEEEMinimumMaximumInsts, + FeatureSWakeupBarrier, + FeatureClusters, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts, +]>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -2303,6 +2274,11 @@ def isGFX8GFX9GFX10GFX11 : "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX12Insts))>; +def isGFX8GFX9GFX10GFX11GFX12 : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&" + "Subtarget->getGeneration() < AMDGPUSubtarget::GFX13">, + 
AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX13Insts))>; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<(all_of FeatureCIInsts)>; @@ -2360,18 +2336,6 @@ def isNotGFX940Plus : Predicate<"!Subtarget->hasGFX940Insts()">, AssemblerPredicate<(all_of (not FeatureGFX940Insts))>; -def HasGFX950Insts : - Predicate<"Subtarget->hasGFX950Insts()">, - AssemblerPredicate<(all_of FeatureGFX950Insts)>; - -def HasPermlane16Swap : - Predicate<"Subtarget->hasPermlane16Swap()">, - AssemblerPredicate<(all_of FeaturePermlane16Swap)>; - -def HasPermlane32Swap : - Predicate<"Subtarget->hasPermlane32Swap()">, - AssemblerPredicate<(all_of FeaturePermlane32Swap)>; - def isGFX8GFX9NotGFX940 : Predicate<"!Subtarget->hasGFX940Insts() &&" "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" @@ -2425,9 +2389,14 @@ def isGFX11Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">, AssemblerPredicate<(all_of FeatureGFX11Insts)>; +def isGFX11PlusNot12_50 : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&" + "(Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13 || !Subtarget->hasGFX1250Insts())">, + AssemblerPredicate<(all_of FeatureGFX11Insts, (any_of FeatureGFX13Insts, (not FeatureGFX1250Insts)))>; + def isGFX12Only : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12">, - AssemblerPredicate<(all_of FeatureGFX12Insts)>; + AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX13Insts))>; def isGFX12Not12_50 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">, @@ -2438,12 +2407,13 @@ def isGFX12Plus : AssemblerPredicate<(all_of FeatureGFX12Insts)>; def isGFX12PlusNot12_50 : - Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">, - AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX1250Insts))>; + Predicate<"Subtarget->getGeneration() >= 
AMDGPUSubtarget::GFX12 &&" + "(Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13 || !Subtarget->hasGFX1250Insts())">, + AssemblerPredicate<(all_of FeatureGFX12Insts, (any_of FeatureGFX13Insts, (not FeatureGFX1250Insts)))>; def isGFX125xOnly : - Predicate<"Subtarget->hasGFX1250Insts()">, - AssemblerPredicate<(all_of FeatureGFX1250Insts)>; + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && Subtarget->hasGFX1250Insts()">, + AssemblerPredicate<(all_of FeatureGFX1250Insts, (not FeatureGFX13Insts))>; def isGFX1250Plus : Predicate<"Subtarget->hasGFX1250Insts()">, @@ -2454,63 +2424,27 @@ def isNotGFX1250Plus : AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>; def isGFX940orGFX1250 : - Predicate<"Subtarget->hasGFX940Insts() || Subtarget->hasGFX1250Insts()">, - AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX1250Insts)>; - -def HasIEEEMinimumMaximumInsts : - Predicate<"Subtarget->hasIEEEMinimumMaximumInsts()">, - AssemblerPredicate<(all_of FeatureIEEEMinimumMaximumInsts)>; - -def HasMinimum3Maximum3F32 : - Predicate<"Subtarget->hasMinimum3Maximum3F32()">, - AssemblerPredicate<(all_of FeatureMinimum3Maximum3F32)>; - -def HasMinimum3Maximum3F16 : - Predicate<"Subtarget->hasMinimum3Maximum3F16()">, - AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>; - -def HasMin3Max3PKF16 : - Predicate<"Subtarget->hasMin3Max3PKF16()">, - AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>; - -def HasMinimum3Maximum3PKF16 : - Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">, - AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>; - + Predicate<"Subtarget->hasGFX940Insts() ||" + "(Subtarget->hasGFX1250Insts() && !Subtarget->hasGFX13Insts())">, + AssemblerPredicate<(any_of FeatureGFX940Insts, + (all_of FeatureGFX1250Insts, (not FeatureGFX13Insts)))>; -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, - AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; +def isGFX13Only : + Predicate<"Subtarget->getGeneration() == 
AMDGPUSubtarget::GFX13">, + AssemblerPredicate<(all_of FeatureGFX13Insts)>; -def HasFlatBufferGlobalAtomicFaddF64Inst : - Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">, - AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>; +def isGFX13Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13">, + AssemblerPredicate<(all_of FeatureGFX13Insts)>; -def HasAtomicFMinFMaxF32GlobalInsts : - Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">, - AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32GlobalInsts)>; - -def HasAtomicFMinFMaxF64GlobalInsts : - Predicate<"Subtarget->hasAtomicFMinFMaxF64GlobalInsts()">, - AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64GlobalInsts)>; - -def HasAtomicFMinFMaxF32FlatInsts : - Predicate<"Subtarget->hasAtomicFMinFMaxF32FlatInsts()">, - AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32FlatInsts)>; - -def HasAtomicFMinFMaxF64FlatInsts : - Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">, - AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>; +def HasAtomicCondSubClampFlatInsts : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX12Insts)>; def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>; -def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, - AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>; -def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, - AssemblerPredicate<(all_of FeatureFlatScratchInsts)>; -def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">, - AssemblerPredicate<(all_of FeatureScalarFlatScratchInsts)>; def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; @@ -2519,24 +2453,17 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, def 
HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; -def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">, - AssemblerPredicate<(all_of FeatureFlatGVSMode)>; - def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>; -def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, - AssemblerPredicate<(all_of FeatureUnpackedD16VMem)>; def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>; -def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">, - AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>; def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">, - AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>; + AssemblerPredicate<(all_of (not FeatureRestrictedSOffset))>; def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, @@ -2552,7 +2479,7 @@ def HasFormattedMUBUFInsts : Predicate<"Subtarget->hasFormattedMUBUFInsts()">, AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>; def HasExportInsts : Predicate<"Subtarget->hasExportInsts()">, - AssemblerPredicate<(all_of (not FeatureGFX90AInsts), (not FeatureGFX1250Insts))>; + AssemblerPredicate<(any_of FeatureGFX13Insts, (all_of (not FeatureGFX90AInsts), (not FeatureGFX1250Insts)))>; def HasVINTERPEncoding : Predicate<"Subtarget->hasVINTERPEncoding()">, AssemblerPredicate<(all_of FeatureGFX11Insts, (not FeatureGFX1250Insts))>; @@ -2563,18 +2490,10 @@ def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9 def HasLDSFPAtomicAddF32 : Predicate<"Subtarget->hasLDSFPAtomicAddF32()">, AssemblerPredicate<(all_of FeatureGFX8Insts)>; -def HasAddNoCarryInsts : 
Predicate<"Subtarget->hasAddNoCarry()">, - AssemblerPredicate<(all_of FeatureAddNoCarryInsts)>; - -def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">; +def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">; def HasXNACKEnabled : Predicate<"Subtarget->isXNACKEnabled()">; -def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, - AssemblerPredicate<(all_of Feature16BitInsts)>; - -def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">, - AssemblerPredicate<(all_of FeatureTrue16BitInsts)>; def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts()">, AssemblerPredicate<(all_of (not FeatureTrue16BitInsts))>; @@ -2588,30 +2507,14 @@ def NotUseRealTrue16Insts : True16PredicateClass<"!Subtarget->useRealTrue16Insts AssemblerPredicate<(not (all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts))>; def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && " "!Subtarget->useRealTrue16Insts()">, - AssemblerPredicate<(all_of FeatureTrue16BitInsts)>; - // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. 
- // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; + AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; def UseTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && " "!Subtarget->d16PreservesUnusedBits()">; -def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">, - AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>; def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">, AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not FeatureD16Writes32BitVgpr))>; -def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, - AssemblerPredicate<(all_of FeatureBF16TransInsts)>; - -def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">, - AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>; - -def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">, - AssemblerPredicate<(all_of FeatureBF16PackedInsts)>; - -def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, - AssemblerPredicate<(all_of FeatureVOP3P)>; - def NotHasMed3_16 : Predicate<"!Subtarget->hasMed3_16()">; def HasMed3_16 : Predicate<"Subtarget->hasMed3_16()">; @@ -2620,8 +2523,6 @@ def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes() def HasFminFmaxLegacy : Predicate<"Subtarget->hasFminFmaxLegacy()">; -def HasSDWA : Predicate<"Subtarget->hasSDWA()">; - def HasSDWA8 : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<(all_of (not FeatureGFX9Insts), FeatureSDWA)>; @@ -2639,12 +2540,6 @@ def HasDPP : Predicate<"Subtarget->hasDPP()">, def HasDPP8 : Predicate<"Subtarget->hasDPP8()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>; -def HasDPALU_DPP : Predicate<"Subtarget->hasDPALU_DPP()">, - AssemblerPredicate<(all_of FeatureDPALU_DPP)>; - -def HasPackedFP32Ops : 
Predicate<"Subtarget->hasPackedFP32Ops()">, - AssemblerPredicate<(all_of FeaturePackedFP32Ops)>; - def HasPkMovB32 : Predicate<"Subtarget->hasPkMovB32()">, AssemblerPredicate<(all_of FeatureGFX90AInsts)>; @@ -2656,14 +2551,6 @@ def HasFmaakFmamkF64Insts : Predicate<"Subtarget->hasFmaakFmamkF64Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; -def HasAddMinMaxInsts : - Predicate<"Subtarget->hasAddMinMaxInsts()">, - AssemblerPredicate<(any_of FeatureGFX1250Insts)>; - -def HasPkAddMinMaxInsts : - Predicate<"Subtarget->hasPkAddMinMaxInsts()">, - AssemblerPredicate<(any_of FeatureGFX1250Insts)>; - def HasPkMinMax3Insts : Predicate<"Subtarget->hasPkMinMax3Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; @@ -2672,295 +2559,92 @@ def HasSGetShaderCyclesInst : Predicate<"Subtarget->hasSGetShaderCyclesInst()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; -def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, - AssemblerPredicate<(all_of FeatureImageInsts)>; - -def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">, - AssemblerPredicate<(all_of FeatureExtendedImageInsts)>; - -def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, - AssemblerPredicate<(all_of FeatureR128A16)>; - -def HasA16 : Predicate<"Subtarget->hasA16()">, - AssemblerPredicate<(all_of FeatureA16)>; - -def HasG16 : Predicate<"Subtarget->hasG16()">, - AssemblerPredicate<(all_of FeatureG16)>; - def HasDPP16 : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>; -def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, - AssemblerPredicate<(all_of FeatureIntClamp)>; - -def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, - AssemblerPredicate<(all_of FeatureMadMixInsts)>; - -def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">, - AssemblerPredicate<(all_of FeatureScalarStores)>; - -def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, - 
AssemblerPredicate<(all_of FeatureScalarAtomics)>; - -def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">, - AssemblerPredicate<(all_of FeatureNoSdstCMPX)>; - def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">, AssemblerPredicate<(all_of (not FeatureNoSdstCMPX))>; def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; -def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, - AssemblerPredicate<(all_of FeatureVGPRIndexMode)>; -def HasMovrel : Predicate<"Subtarget->hasMovrel()">, - AssemblerPredicate<(all_of FeatureMovrel)>; - -def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, - AssemblerPredicate<(all_of FeatureFmaMixInsts)>; - -def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">, - AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>; - -def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, - AssemblerPredicate<(all_of FeatureDLInsts)>; - -def HasFmacF64Inst : Predicate<"Subtarget->hasFmacF64Inst()">, - AssemblerPredicate<(all_of FeatureFmacF64Inst)>; - -def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">, - AssemblerPredicate<(all_of FeatureDot1Insts)>; - -def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">, - AssemblerPredicate<(all_of FeatureDot2Insts)>; - -def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">, - AssemblerPredicate<(all_of FeatureDot3Insts)>; - -def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">, - AssemblerPredicate<(all_of FeatureDot4Insts)>; - -def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">, - AssemblerPredicate<(all_of FeatureDot5Insts)>; - -def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, - AssemblerPredicate<(all_of FeatureDot6Insts)>; - -def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">, - AssemblerPredicate<(all_of FeatureDot7Insts)>; - -def HasDot8Insts : Predicate<"Subtarget->hasDot8Insts()">, - AssemblerPredicate<(all_of FeatureDot8Insts)>; - 
-def HasDot9Insts : Predicate<"Subtarget->hasDot9Insts()">, - AssemblerPredicate<(all_of FeatureDot9Insts)>; - -def HasDot10Insts : Predicate<"Subtarget->hasDot10Insts()">, - AssemblerPredicate<(all_of FeatureDot10Insts)>; - -def HasDot11Insts : Predicate<"Subtarget->hasDot11Insts()">, - AssemblerPredicate<(all_of FeatureDot11Insts)>; - -def HasDot12Insts : Predicate<"Subtarget->hasDot12Insts()">, - AssemblerPredicate<(all_of FeatureDot12Insts)>; - -def HasDot13Insts : Predicate<"Subtarget->hasDot13Insts()">, - AssemblerPredicate<(all_of FeatureDot13Insts)>; - -def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, - AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; - -def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">, - AssemblerPredicate<(all_of FeatureMAIInsts)>; def NotHasMAIInsts : Predicate<"!Subtarget->hasMAIInsts()">, AssemblerPredicate<(all_of (not FeatureMAIInsts))>; -def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">, - AssemblerPredicate<(all_of FeatureSMemRealTime)>; - -def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">, - AssemblerPredicate<(all_of FeatureSMemTimeInst)>; - -def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">, - AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>; - -def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">; - -def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">, - AssemblerPredicate<(all_of FeatureFP8Insts)>; - -def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">, - AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>; - -def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">, - AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>; - def NotHasFP8E5M3Insts : Predicate<"!Subtarget->hasFP8E5M3Insts()">, AssemblerPredicate<(all_of (not FeatureFP8E5M3Insts))>; -def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, - AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>; 
- -def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">, - AssemblerPredicate<(all_of FeatureMadMacF32Insts)>; - def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; -def HasAtomicDsPkAdd16Insts : Predicate<"Subtarget->hasAtomicDsPkAdd16Insts()">, - AssemblerPredicate<(any_of FeatureAtomicDsPkAdd16Insts)>; +def HasFmacLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts() && Subtarget->getGeneration() < AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX10_3Insts, (not FeatureGFX12Insts))>; -def HasAtomicFlatPkAdd16Insts : Predicate<"Subtarget->hasAtomicFlatPkAdd16Insts()">, - AssemblerPredicate<(any_of FeatureAtomicFlatPkAdd16Insts)>; +def HasAtomicDsCondSubClampInsts : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX12Insts)>; -def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>; -def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>; def HasAtomicBufferGlobalPkAddF16NoRtnInsts : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() || Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">, AssemblerPredicate<(any_of FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts)>; -def HasAtomicBufferGlobalPkAddF16Insts - : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">, - AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>; -def HasAtomicGlobalPkAddBF16Inst - : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">, - AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>; -def HasAtomicBufferPkAddBF16Inst - : Predicate<"Subtarget->hasAtomicBufferPkAddBF16Inst()">, - AssemblerPredicate<(all_of FeatureAtomicBufferPkAddBF16Inst)>; -def HasFlatAtomicFaddF32Inst - : 
Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">, - AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>; - -def HasDefaultComponentZero - : Predicate<"Subtarget->hasDefaultComponentZero()">, - AssemblerPredicate<(all_of FeatureDefaultComponentZero)>; -def HasDefaultComponentBroadcast - : Predicate<"Subtarget->hasDefaultComponentBroadcast()">, - AssemblerPredicate<(all_of FeatureDefaultComponentBroadcast)>; - -def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, - AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; def HasAddPC64Inst : Predicate<"Subtarget->hasAddPC64Inst()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; -def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">; - -def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">; - -def HasUnalignedAccessMode : Predicate<"Subtarget->hasUnalignedAccessMode()">, - AssemblerPredicate<(all_of FeatureUnalignedAccessMode)>; +def HasFlatScratchEnabled : Predicate<"Subtarget->hasFlatScratchEnabled()">; -def HasMADIntraFwdBug : Predicate<"Subtarget->hasMADIntraFwdBug()">; +def NotHasFlatScratchEnabled : Predicate<"!Subtarget->hasFlatScratchEnabled()">; -def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; - -def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">, - AssemblerPredicate<(all_of FeatureSALUFloatInsts)>; +def NotHasMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; def NotHasSALUFloatInsts : Predicate<"!Subtarget->hasSALUFloatInsts()">, AssemblerPredicate<(all_of (not FeatureSALUFloatInsts))>; -def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, - AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>; - -def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, - AssemblerPredicate<(all_of FeatureBitOp3Insts)>; - -def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, - AssemblerPredicate<(all_of FeatureTanhInsts)>; - -def HasTensorCvtLutInsts : 
Predicate<"Subtarget->hasTensorCvtLutInsts()">, - AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>; - -def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, - AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; - -def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">, - AssemblerPredicate<(all_of FeaturePrngInst)>; +def NotHasCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">; -def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">, - AssemblerPredicate<(all_of FeatureBVHDualAndBVH8Insts)>; - -def Has64BitLiterals : Predicate<"Subtarget->has64BitLiterals()">, - AssemblerPredicate<(all_of Feature64BitLiterals)>; - -def Has1024AddressableVGPRs : Predicate<"Subtarget->has1024AddressableVGPRs()">, - AssemblerPredicate<(all_of Feature1024AddressableVGPRs)>; - -def HasWaitXcnt : Predicate<"Subtarget->hasWaitXcnt()">, - AssemblerPredicate<(all_of FeatureWaitXcnt)>; - -def HasFP8ConversionScaleInsts : Predicate<"Subtarget->hasFP8ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureFP8ConversionScaleInsts)>; - -def HasBF8ConversionScaleInsts : Predicate<"Subtarget->hasBF8ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureBF8ConversionScaleInsts)>; - -def HasFP4ConversionScaleInsts : Predicate<"Subtarget->hasFP4ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureFP4ConversionScaleInsts)>; - -def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureFP6BF6ConversionScaleInsts)>; - -def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>; - -def HasCvtPkF16F32Inst : Predicate<"Subtarget->hasCvtPkF16F32Inst()">, - AssemblerPredicate<(all_of FeatureCvtPkF16F32Inst)>; - -def HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">, - 
AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>; - -def HasGDS : Predicate<"Subtarget->hasGDS()">; - -def HasGWS : Predicate<"Subtarget->hasGWS()">; - -def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">; -def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">; - -def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">; - -def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; - -def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, - AssemblerPredicate<(all_of FeatureXF32Insts)>; - -def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">, - AssemblerPredicate<(all_of FeatureVmemPrefInsts)>; - -def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, - AssemblerPredicate<(all_of FeatureAshrPkInsts)>; - -def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, - AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>; - -def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, - AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; - -def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">, - AssemblerPredicate<(all_of FeatureMadU32Inst)>; +def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; -def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, - AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; +def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>; -def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, - AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>; +def isWave32 : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(any_of FeatureWavefrontSize32, + FeatureAssemblerPermissiveWavesize)>; +def isWave64 : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(any_of FeatureWavefrontSize64, + 
FeatureAssemblerPermissiveWavesize)>; -def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, - AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; +def isWave32Strict : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(all_of FeatureWavefrontSize32)>; +def isWave64Strict : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(all_of FeatureWavefrontSize64)>; //===----------------------------------------------------------------------===// // HwModes //===----------------------------------------------------------------------===// -// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +defvar DefaultMode_Wave64 = DefaultMode; +defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>; + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Implied +// wave64. def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; // gfx1250, has alignment requirement but no AGPRs. -def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; +def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>; +def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>; + +// FIXME: This should be able to only define a separate hwmode that +// only depends on wavesize for just ValueTypes. These use different +// HwMode namespaces. 
If we don't define the full set of modes used +// for RegClassByHwMode, tablegen crashes for some reason +def WaveSizeVT : ValueTypeByHwMode<[ + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>; // Include AMDGPU TD files diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index c28c25f..2bdadda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -65,7 +65,7 @@ recursivelyVisitUsers(GlobalValue &GV, continue; if (Instruction *I = dyn_cast<Instruction>(U)) { - Function *F = I->getParent()->getParent(); + Function *F = I->getFunction(); if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) { // FIXME: This is a horrible hack. We should always respect noinline, // and just let us hit the error when we can't handle this. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index dda8033..346e257 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -19,7 +19,7 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-argument-reg-usage-info" -INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE, +INITIALIZE_PASS(AMDGPUArgumentUsageInfoWrapperLegacy, DEBUG_TYPE, "Argument Register Usage Information Storage", false, true) void ArgDescriptor::print(raw_ostream &OS, @@ -42,7 +42,7 @@ void ArgDescriptor::print(raw_ostream &OS, OS << '\n'; } -char AMDGPUArgumentUsageInfo::ID = 0; +char AMDGPUArgumentUsageInfoWrapperLegacy::ID = 0; const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; @@ -50,15 +50,6 @@ const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo = AMDGPUFunctionArgInfo::fixedABILayout(); 
-bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) { - return false; -} - -bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) { - ArgInfoMap.clear(); - return false; -} - // TODO: Print preload kernargs? void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { for (const auto &FI : ArgInfoMap) { @@ -86,6 +77,12 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { } } +bool AMDGPUArgumentUsageInfo::invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &) { + auto PAC = PA.getChecker<AMDGPUArgumentUsageAnalysis>(); + return !PAC.preservedWhenStateless(); +} + std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> AMDGPUFunctionArgInfo::getPreloadedValue( AMDGPUFunctionArgInfo::PreloadedValue Value) const { @@ -191,3 +188,10 @@ AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { return FixedABIFunctionInfo; return I->second; } + +AnalysisKey AMDGPUArgumentUsageAnalysis::Key; + +AMDGPUArgumentUsageInfo +AMDGPUArgumentUsageAnalysis::run(Module &M, ModuleAnalysisManager &) { + return AMDGPUArgumentUsageInfo(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 1064e57..f38e49b9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -12,7 +12,10 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/Register.h" +#include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include <variant> namespace llvm { @@ -27,55 +30,44 @@ private: friend struct AMDGPUFunctionArgInfo; friend class AMDGPUArgumentUsageInfo; - union { - MCRegister Reg; - unsigned StackOffset; - }; + std::variant<std::monostate, MCRegister, unsigned> Val; // Bitmask to locate argument within the register. 
unsigned Mask; - bool IsStack : 1; - bool IsSet : 1; - public: - ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false, - bool IsSet = false) - : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + ArgDescriptor(unsigned Mask = ~0u) : Mask(Mask) {} static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { - return ArgDescriptor(Reg, Mask, false, true); + ArgDescriptor Ret(Mask); + Ret.Val = Reg.asMCReg(); + return Ret; } static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { - return ArgDescriptor(Offset, Mask, true, true); + ArgDescriptor Ret(Mask); + Ret.Val = Offset; + return Ret; } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { - return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); + // Copy the descriptor, then change the mask. + ArgDescriptor Ret(Arg); + Ret.Mask = Mask; + return Ret; } - bool isSet() const { - return IsSet; - } + bool isSet() const { return !std::holds_alternative<std::monostate>(Val); } explicit operator bool() const { return isSet(); } - bool isRegister() const { - return !IsStack; - } + bool isRegister() const { return std::holds_alternative<MCRegister>(Val); } - MCRegister getRegister() const { - assert(!IsStack); - return Reg; - } + MCRegister getRegister() const { return std::get<MCRegister>(Val); } - unsigned getStackOffset() const { - assert(IsStack); - return StackOffset; - } + unsigned getStackOffset() const { return std::get<unsigned>(Val); } unsigned getMask() const { // None of the target SGPRs or VGPRs are expected to have a 'zero' mask. 
@@ -96,7 +88,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { } struct KernArgPreloadDescriptor : public ArgDescriptor { - KernArgPreloadDescriptor() {} + KernArgPreloadDescriptor() = default; SmallVector<MCRegister> Regs; }; @@ -178,32 +170,67 @@ struct AMDGPUFunctionArgInfo { static AMDGPUFunctionArgInfo fixedABILayout(); }; -class AMDGPUArgumentUsageInfo : public ImmutablePass { +class AMDGPUArgumentUsageInfo { private: DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; public: - static char ID; - static const AMDGPUFunctionArgInfo ExternFunctionInfo; static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; - AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } + void print(raw_ostream &OS, const Module *M = nullptr) const; + + void clear() { ArgInfoMap.clear(); } + + void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { + ArgInfoMap[&F] = ArgInfo; + } + + const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; + + bool invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv); +}; + +class AMDGPUArgumentUsageInfoWrapperLegacy : public ImmutablePass { + std::unique_ptr<AMDGPUArgumentUsageInfo> AUIP; + +public: + static char ID; + + AMDGPUArgumentUsageInfoWrapperLegacy() : ImmutablePass(ID) {} + + AMDGPUArgumentUsageInfo &getArgUsageInfo() { return *AUIP; } + const AMDGPUArgumentUsageInfo &getArgUsageInfo() const { return *AUIP; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } - bool doInitialization(Module &M) override; - bool doFinalization(Module &M) override; + bool doInitialization(Module &M) override { + AUIP = std::make_unique<AMDGPUArgumentUsageInfo>(); + return false; + } - void print(raw_ostream &OS, const Module *M = nullptr) const override; + bool doFinalization(Module &M) override { + AUIP->clear(); + return false; + } - void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { - 
ArgInfoMap[&F] = ArgInfo; + void print(raw_ostream &OS, const Module *M = nullptr) const override { + AUIP->print(OS, M); } +}; - const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; +class AMDGPUArgumentUsageAnalysis + : public AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis> { + friend AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis>; + static AnalysisKey Key; + +public: + using Result = AMDGPUArgumentUsageInfo; + + AMDGPUArgumentUsageInfo run(Module &M, ModuleAnalysisManager &); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp index 19e2a6a..9af3b05 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp @@ -208,7 +208,8 @@ void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Value *NumBits = IRB.CreateTypeSize(IntptrTy, TypeStoreSize); Value *Size = IRB.CreateLShr(NumBits, ConstantInt::get(IntptrTy, 3)); Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy); - Value *SizeMinusOne = IRB.CreateAdd(Size, ConstantInt::get(IntptrTy, -1)); + Value *SizeMinusOne = + IRB.CreateAdd(Size, ConstantInt::getAllOnesValue(IntptrTy)); Value *LastByte = IRB.CreateIntToPtr(IRB.CreateAdd(AddrLong, SizeMinusOne), AddrTy); instrumentAddressImpl(M, IRB, OrigIns, InsertBefore, Addr, {}, 8, IsWrite, @@ -244,11 +245,8 @@ void getInterestingMemoryOperands( // Masked store has an initial operand for the value. unsigned OpOffset = IsWrite ? 1 : 0; Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType(); - MaybeAlign Alignment = Align(1); - // Otherwise no alignment guarantees. We probably got Undef. 
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) - Alignment = Op->getMaybeAlignValue(); - Value *Mask = CI->getOperand(2 + OpOffset); + MaybeAlign Alignment = CI->getParamAlign(OpOffset); + Value *Mask = CI->getOperand(1 + OpOffset); Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask); break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 29f8f9b..7d2df427 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -323,7 +323,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { "' is already defined"); const DataLayout &DL = GV->getDataLayout(); - uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); + uint64_t Size = GV->getGlobalSize(DL); Align Alignment = GV->getAlign().value_or(Align(4)); emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); @@ -358,6 +358,32 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) { return AsmPrinter::doInitialization(M); } +/// Mimics GCNSubtarget::computeOccupancy for MCExpr. +/// +/// Remove dependency on GCNSubtarget and depend only only the necessary values +/// for said occupancy computation. Should match computeOccupancy implementation +/// without passing \p STM on. 
+const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + unsigned DynamicVGPRBlockSize, + const GCNSubtarget &STM, MCContext &Ctx) { + unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); + unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); + unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); + unsigned Generation = STM.getGeneration(); + + auto CreateExpr = [&Ctx](unsigned Value) { + return MCConstantExpr::create(Value, Ctx); + }; + + return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy, + {CreateExpr(MaxWaves), CreateExpr(Granule), + CreateExpr(TargetTotalNumVGPRs), + CreateExpr(Generation), CreateExpr(InitOcc), + NumSGPRs, NumVGPRs}, + Ctx); +} + void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())) return; @@ -459,7 +485,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { MaxWaves, MFI.getDynamicVGPRBlockSize())}); uint64_t NumSGPRsForWavesPerEU = std::max( {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); - const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( + const MCExpr *OccupancyExpr = createOccupancy( STM.getOccupancyWithWorkGroupSizes(*MF).second, MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), @@ -508,9 +534,9 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { MCSectionELF *MaxGPRSection = OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0); OutStreamer->switchSection(MaxGPRSection); - getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext), - RI.getMaxAGPRSymbol(OutContext), - RI.getMaxSGPRSymbol(OutContext)); + getTargetStreamer()->EmitMCResourceMaximums( + RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext), + RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext)); OutStreamer->popSection(); for 
(Function &F : M.functions()) @@ -634,7 +660,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, (void)PGRM_Rsrc3; (void)EvaluatableRsrc3; assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 || - STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 || + STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 || static_cast<uint64_t>(PGRM_Rsrc3) == 0); KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3; @@ -805,7 +831,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " AccumOffset: " + getMCExprStr(AdjustedAccum), false); } - if (AMDGPU::isGFX1250(STM)) + if (STM.hasGFX1250Insts()) OutStreamer->emitRawComment( " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt), false); @@ -841,7 +867,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { [[maybe_unused]] int64_t PGMRSrc3; assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 || - STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || + STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) && static_cast<uint64_t>(PGMRSrc3) == 0)); if (STM.hasGFX90AInsts()) { @@ -1160,21 +1186,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Make clamp modifier on NaN input returns 0. ProgInfo.DX10Clamp = Mode.DX10Clamp; - unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { - // LDS is allocated in 256 dword blocks. - LDSAlignShift = 10; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize163840)) { - // LDS is allocated in 320 dword blocks. + unsigned LDSAlignShift = 8; + switch (getLdsDwGranularity(STM)) { + case 512: + case 320: LDSAlignShift = 11; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize65536)) { - // LDS is allocated in 128 dword blocks. 
+ break; + case 128: LDSAlignShift = 9; - } else { - // LDS is allocated in 64 dword blocks. + break; + case 64: LDSAlignShift = 8; + break; + default: + llvm_unreachable("invald LDS block size"); } ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); @@ -1230,8 +1255,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. - ProgInfo.TrapHandlerEnable = - STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled(); + ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler(); ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX(); ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY(); ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ(); @@ -1264,13 +1288,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } - if (AMDGPU::isGFX1250(STM)) + if (STM.hasGFX1250Insts()) ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); - ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( + ProgInfo.Occupancy = createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, MFI->getDynamicVGPRBlockSize(), STM, Ctx); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 0a163f8..784ee36 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -589,7 +589,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively( // return the next active lane auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1); - auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1)); + auto *InverseMask = B.CreateXor(Mask, ConstantInt::getAllOnesValue(WaveTy)); auto *NewActiveBits 
= B.CreateAnd(ActiveBits, InverseMask); ActiveBits->addIncoming(NewActiveBits, ComputeLoop); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9907c88f..b86a4ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -38,9 +38,10 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - NOT_IMPLICIT_INPUT = 0, + UNKNOWN_INTRINSIC = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, + NOT_IMPLICIT_INPUT }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return NOT_IMPLICIT_INPUT; + return UNKNOWN_INTRINSIC; } } @@ -200,16 +201,6 @@ public: /// Get code object version. unsigned getCodeObjectVersion() const { return CodeObjectVersion; } - /// Get the effective value of "amdgpu-waves-per-eu" for the function, - /// accounting for the interaction with the passed value to use for - /// "amdgpu-flat-work-group-size". 
- std::pair<unsigned, unsigned> - getWavesPerEU(const Function &F, - std::pair<unsigned, unsigned> FlatWorkGroupSize) { - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F); - } - std::optional<std::pair<unsigned, unsigned>> getWavesPerEUAttr(const Function &F) { auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", @@ -223,15 +214,6 @@ public: return std::make_pair(Val->first, *(Val->second)); } - std::pair<unsigned, unsigned> - getEffectiveWavesPerEU(const Function &F, - std::pair<unsigned, unsigned> WavesPerEU, - std::pair<unsigned, unsigned> FlatWorkGroupSize) { - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize, - getLDSSize(F)); - } - unsigned getMaxWavesPerEU(const Function &F) { const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); return ST.getMaxWavesPerEU(); @@ -258,14 +240,6 @@ private: return Status; } - /// Returns the minimum amount of LDS space used by a workgroup running - /// function \p F. - static unsigned getLDSSize(const Function &F) { - return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", - {0, UINT32_MAX}, true) - .first; - } - /// Get the constant access bitmap for \p C. uint8_t getConstantAccess(const Constant *C, SmallPtrSetImpl<const Constant *> &Visited) { @@ -534,6 +508,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); + + if (AttrMask == UNKNOWN_INTRINSIC) { + // Assume not-nocallback intrinsics may invoke a function which accesses + // implicit arguments. + // + // FIXME: This isn't really the correct check. We want to ensure it + // isn't calling any function that may use implicit arguments regardless + // of whether it's internal to the module or not. + // + // TODO: Ignoring callsite attributes. 
+ if (!Callee->hasFnAttribute(Attribute::NoCallback)) + return indicatePessimisticFixpoint(); + continue; + } + if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1336,7 +1325,6 @@ struct AAAMDGPUMinAGPRAlloc Maximum.takeAssumedMaximum(NumRegs); return true; } - switch (CB.getIntrinsicID()) { case Intrinsic::not_intrinsic: break; @@ -1354,10 +1342,24 @@ struct AAAMDGPUMinAGPRAlloc return true; } + // Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have + // the nocallback attribute, so the AMDGPU attributor can conservatively + // drop all implicitly-known inputs and AGPR allocation information. Make + // sure we still infer that no implicit inputs are required and that the + // AGPR allocation stays at zero. Trap-like intrinsics may invoke a + // function which requires AGPRs, so we need to check if the called + // function has the "trap-func-name" attribute. + case Intrinsic::trap: + case Intrinsic::debugtrap: + case Intrinsic::ubsantrap: + return CB.hasFnAttr(Attribute::NoCallback) || + !CB.hasFnAttr("trap-func-name"); default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - return true; + // Assume !nocallback intrinsics may call a function which requires + // AGPRs. 
+ return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes @@ -1555,7 +1557,7 @@ private: AMDGPU::ClusterDimsAttr Attr; - static constexpr const char AttrName[] = "amdgpu-cluster-dims"; + static constexpr char AttrName[] = "amdgpu-cluster-dims"; }; AAAMDGPUClusterDims & @@ -1584,7 +1586,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID, - &AAAMDGPUClusterDims::ID}); + &AAAMDGPUClusterDims::ID, &AAAlign::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; @@ -1642,6 +1644,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, if (Ptr) { A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr)); A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr)); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) { + if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc) + A.getOrCreateAAFor<AAAlign>(IRPosition::value(*Ptr)); + } } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp new file mode 100644 index 0000000..c9fcec8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -0,0 +1,120 @@ +//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a DAG scheduling mutation to add latency to: +/// 1. Barrier edges between ATOMIC_FENCE instructions and preceding +/// memory accesses potentially affected by the fence. 
+/// This encourages the scheduling of more instructions before +/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may +/// introduce wait counting or indicate an impending S_BARRIER +/// wait. Having more instructions in-flight across these +/// constructs improves latency hiding. +/// 2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT. +/// This encourages independent work to be scheduled between +/// signal and wait, hiding barrier synchronization latency. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUBarrierLatency.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +static cl::opt<unsigned> BarrierSignalWaitLatencyOpt( + "amdgpu-barrier-signal-wait-latency", + cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " + "to encourage scheduling independent work between them"), + cl::init(16), cl::Hidden); + +namespace { + +class BarrierLatency : public ScheduleDAGMutation { +private: + SmallSet<SyncScope::ID, 4> IgnoredScopes; + +public: + BarrierLatency(MachineFunction *MF) { + LLVMContext &Context = MF->getFunction().getContext(); + IgnoredScopes.insert(SyncScope::SingleThread); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as")); + + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + if (!ST.requiresWaitOnWorkgroupReleaseFence()) { + // Prior to GFX10 workgroup scope does not normally require waitcnts + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup")); + } + } + void apply(ScheduleDAGInstrs *DAG) override; +}; + +void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) { + SUnit *PredSU = PredDep.getSUnit(); + 
SDep ForwardD = PredDep; + ForwardD.setSUnit(&SU); + for (SDep &SuccDep : PredSU->Succs) { + if (SuccDep == ForwardD) { + SuccDep.setLatency(SuccDep.getLatency() + Latency); + break; + } + } + PredDep.setLatency(PredDep.getLatency() + Latency); + PredSU->setDepthDirty(); + SU.setDepthDirty(); +} + +void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII); + constexpr unsigned FenceLatency = 2000; + const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt; + + for (SUnit &SU : DAG->SUnits) { + const MachineInstr *MI = SU.getInstr(); + unsigned Op = MI->getOpcode(); + + if (Op == AMDGPU::ATOMIC_FENCE) { + // Update latency on barrier edges of ATOMIC_FENCE. + // Ignore scopes not expected to have any latency. + SyncScope::ID SSID = + static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); + if (IgnoredScopes.contains(SSID)) + continue; + + for (SDep &PredDep : SU.Preds) { + if (!PredDep.isBarrier()) + continue; + SUnit *PredSU = PredDep.getSUnit(); + MachineInstr *MI = PredSU->getInstr(); + // Only consider memory loads + if (!MI->mayLoad() || MI->mayStore()) + continue; + addLatencyToEdge(PredDep, SU, FenceLatency); + } + } else if (Op == AMDGPU::S_BARRIER_WAIT) { + for (SDep &PredDep : SU.Preds) { + SUnit *PredSU = PredDep.getSUnit(); + const MachineInstr *PredMI = PredSU->getInstr(); + if (TII->isBarrierStart(PredMI->getOpcode())) { + addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency); + } + } + } + } +} + +} // end namespace + +std::unique_ptr<ScheduleDAGMutation> +llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { + return std::make_unique<BarrierLatency>(MF); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h new file mode 100644 index 0000000..547cd2a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h @@ -0,0 +1,24 @@ +//===- AMDGPUBarrierLatency.h - AMDGPU Export Clustering --------*- C++ 
-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +class MachineFunction; + +std::unique_ptr<ScheduleDAGMutation> +createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 7afadde..5c6affd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #define DEBUG_TYPE "amdgpu-call-lowering" @@ -209,7 +210,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { if (!SPReg) { const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>(); - if (ST.enableFlatScratch()) { + if (ST.hasFlatScratchEnabled()) { // The stack is accessed unswizzled, so we can use a regular copy. 
SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0); @@ -414,12 +415,13 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg, MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getDataLayout(); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + MachinePointerInfo PtrInfo = TLI.getKernargSegmentPtrInfo(MF); LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); SmallVector<ArgInfo, 32> SplitArgs; - SmallVector<uint64_t> FieldOffsets; + SmallVector<TypeSize> FieldOffsets; splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets); unsigned Idx = 0; @@ -737,7 +739,7 @@ bool AMDGPUCallLowering::lowerFormalArguments( // For the fixed ABI, pass workitem IDs in the last argument register. TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); - if (!Subtarget.enableFlatScratch()) + if (!Subtarget.hasFlatScratchEnabled()) CCInfo.AllocateReg(Info->getScratchRSrcReg()); TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } @@ -1196,7 +1198,7 @@ void AMDGPUCallLowering::handleImplicitCallArguments( const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo, CallingConv::ID CalleeCC, ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const { - if (!ST.enableFlatScratch()) { + if (!ST.hasFlatScratchEnabled()) { // Insert copies for the SRD. In the HSA case, this should be an identity // copy. 
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index e891fdb..2932bbf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -14,6 +14,10 @@ class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {} class CCIfExtend<CCAction A> : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; +class CCIfOrigTypeShaderCCIsSGPR<CCAction A> + : CCIf<[{(!OrigTy->getScalarType()->isFloatTy() && + !OrigTy->getScalarType()->isHalfTy()) }], A>; + // Calling convention for SI def CC_SI_Gfx : CallingConv<[ @@ -56,14 +60,15 @@ def CC_SI_SHADER : CallingConv<[ >>> ]>; + def RetCC_SI_Shader : CallingConv<[ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, - CCIfType<[i32, i16, v2i16] , CCAssignToReg< + CCIfType<[i32, i16, v2i16], CCIfOrigTypeShaderCCIsSGPR<CCAssignToReg< !foreach(i, !range(0, 44), !cast<Register>("SGPR"#i)) // SGPR0-43 - >>, + >>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
- CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg< + CCIfType<[f32, f16, v2f16, bf16, v2bf16, i32, i16, v2i16] , CCAssignToReg< !foreach(i, !range(0, 136), !cast<Register>("VGPR"#i)) // VGPR0-135 >> ]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8e35ba7..e51d2c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -33,6 +33,7 @@ #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/KnownFPClass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" @@ -100,10 +101,9 @@ public: const GCNSubtarget &ST; const AMDGPUTargetMachine &TM; const TargetLibraryInfo *TLI; - AssumptionCache *AC; - const DominatorTree *DT; const UniformityInfo &UA; const DataLayout &DL; + SimplifyQuery SQ; const bool HasFP32DenormalFlush; bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; @@ -115,8 +115,8 @@ public: AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM, const TargetLibraryInfo *TLI, AssumptionCache *AC, const DominatorTree *DT, const UniformityInfo &UA) - : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC), - DT(DT), UA(UA), DL(F.getDataLayout()), + : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA), + DL(F.getDataLayout()), SQ(DL, TLI, DT, AC), HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals == DenormalMode::getPreserveSign()) {} @@ -143,21 +143,14 @@ public: bool canBreakPHINode(const PHINode &I); - /// \returns True if binary operation \p I is a signed binary operation, false - /// otherwise. - bool isSigned(const BinaryOperator &I) const; - - /// \returns True if the condition of 'select' operation \p I comes from a - /// signed 'icmp' operation, false otherwise. 
- bool isSigned(const SelectInst &I) const; - /// Return true if \p T is a legal scalar floating point type. bool isLegalFloatingTy(const Type *T) const; /// Wrapper to pass all the arguments to computeKnownFPClass KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested, const Instruction *CtxI) const { - return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT); + return llvm::computeKnownFPClass(V, Interested, + SQ.getWithInstruction(CtxI)); } bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const { @@ -168,12 +161,12 @@ public: /// \returns The minimum number of bits needed to store the value of \Op as an /// unsigned integer. Truncating to this size and then zero-extending to /// the original will not change the value. - unsigned numBitsUnsigned(Value *Op) const; + unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const; /// \returns The minimum number of bits needed to store the value of \Op as a /// signed integer. Truncating to this size and then sign-extending to /// the original size will not change the value. - unsigned numBitsSigned(Value *Op) const; + unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const; /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24. 
/// SelectionDAG has an issue where an and asserting the bits are known @@ -218,8 +211,7 @@ public: Value *matchFractPat(IntrinsicInst &I); Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg); - bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF, - FastMathFlags SqrtFMF) const; + bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const; Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, FastMathFlags SqrtFMF, @@ -244,6 +236,14 @@ public: FastMathFlags FMF) const; Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, FastMathFlags FMF) const; + Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF, + FastMathFlags DivFMF, const Instruction *CtxI, + bool IsNegative) const; + + CallInst *createWorkitemIdX(IRBuilder<> &B) const; + void replaceWithWorkitemIdX(Instruction &I) const; + void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const; + bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const; bool tryNarrowMathIfNoOverflow(Instruction *I); @@ -260,6 +260,8 @@ public: bool visitIntrinsicInst(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); + bool visitMbcntLo(IntrinsicInst &I) const; + bool visitMbcntHi(IntrinsicInst &I) const; bool run(); }; @@ -304,16 +306,6 @@ bool AMDGPUCodeGenPrepareImpl::run() { return MadeChange; } -bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { - return I.getOpcode() == Instruction::AShr || - I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; -} - -bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { - return isa<ICmpInst>(I.getOperand(0)) && - cast<ICmpInst>(I.getOperand(0))->isSigned(); -} - bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { return Ty->isFloatTy() || Ty->isDoubleTy() || (Ty->isHalfTy() && ST.has16BitInsts()); @@ -327,12 +319,16 @@ bool 
AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I); } -unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { - return computeKnownBits(Op, DL, AC).countMaxActiveBits(); +unsigned +AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op, + const Instruction *CtxI) const { + return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits(); } -unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const { - return ComputeMaxSignificantBits(Op, DL, AC); +unsigned +AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op, + const Instruction *CtxI) const { + return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT); } static void extractValues(IRBuilder<> &Builder, @@ -383,12 +379,12 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { unsigned LHSBits = 0, RHSBits = 0; bool IsSigned = false; - if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 && - (RHSBits = numBitsUnsigned(RHS)) <= 24) { + if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 && + (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) { IsSigned = false; - } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 && - (RHSBits = numBitsSigned(RHS)) <= 24) { + } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 && + (RHSBits = numBitsSigned(RHS, &I)) <= 24) { IsSigned = true; } else @@ -623,15 +619,101 @@ static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, return Builder.CreateFMul(Rsq, OutputScaleFactor); } -bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, - FastMathFlags DivFMF, - FastMathFlags SqrtFMF) const { - // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. - if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) - return false; +/// Emit inverse sqrt expansion for f64 with a correction sequence on top of +/// v_rsq_f64. This should give a 1ulp result. 
+Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X, + FastMathFlags SqrtFMF, + FastMathFlags DivFMF, + const Instruction *CtxI, + bool IsNegative) const { + // rsq(x): + // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x); + // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0); + // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0); + // + // -rsq(x): + // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x); + // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0); + // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0); + // + // The rsq instruction handles the special cases correctly. We need to check + // for the edge case conditions to ensure the special case propagates through + // the later instructions. + + Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X); + + // Try to elide the edge case check. + // + // Fast math flags imply: + // sqrt ninf => !isinf(x) + // fdiv ninf => x != 0, !isinf(x) + bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs(); + bool MaybeZero = !DivFMF.noInfs(); + + DenormalMode DenormMode; + FPClassTest Interested = fcNone; + if (MaybePosInf) + Interested = fcPosInf; + if (MaybeZero) + Interested |= fcZero; + + if (Interested != fcNone) { + KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI); + if (KnownSrc.isKnownNeverPosInfinity()) + MaybePosInf = false; + + DenormMode = F.getDenormalMode(X->getType()->getFltSemantics()); + if (KnownSrc.isKnownNeverLogicalZero(DenormMode)) + MaybeZero = false; + } + + Value *SpecialOrRsq = X; + if (MaybeZero || MaybePosInf) { + Value *Cond; + if (MaybePosInf && MaybeZero) { + if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) { + FPClassTest TestMask = fcPosInf | fcZero; + if (DenormMode.inputsAreZero()) + TestMask |= fcSubnormal; - // v_rsq_f32 gives 1ulp - return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f; + Cond = Builder.createIsFPClass(X, TestMask); + } else { + // Avoid using 
llvm.is.fpclass for dynamic denormal mode, since it + // doesn't respect the floating-point environment. + Value *IsZero = + Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType())); + Value *IsInf = + Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType())); + Cond = Builder.CreateOr(IsZero, IsInf); + } + } else if (MaybeZero) { + Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType())); + } else { + Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType())); + } + + SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X); + } + + Value *NegY0 = Builder.CreateFNeg(Y0); + Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0); + + // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64. + Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0)); + + Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0); + + Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375), + ConstantFP::get(X->getType(), 0.5)); + + return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0); +} + +bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF, + FastMathFlags SqrtFMF) const { + // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and + // f64. + return DivFMF.allowContract() && SqrtFMF.allowContract(); } Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( @@ -647,8 +729,6 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( if (!CLHS) return nullptr; - assert(Den->getType()->isFloatTy()); - bool IsNegative = false; // TODO: Handle other numerator values with arcp. @@ -657,14 +737,20 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( IRBuilder<>::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(DivFMF | SqrtFMF); - if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || - canIgnoreDenormalInput(Den, CtxI)) { - Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); - // -1.0 / sqrt(x) -> fneg(rsq(x)) - return IsNegative ? 
Builder.CreateFNeg(Result) : Result; + if (Den->getType()->isFloatTy()) { + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || + canIgnoreDenormalInput(Den, CtxI)) { + Value *Result = + Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); + // -1.0 / sqrt(x) -> fneg(rsq(x)) + return IsNegative ? Builder.CreateFNeg(Result) : Result; + } + + return emitRsqIEEE1ULP(Builder, Den, IsNegative); } - return emitRsqIEEE1ULP(Builder, Den, IsNegative); + if (Den->getType()->isDoubleTy()) + return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative); } return nullptr; @@ -776,6 +862,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( return Rsq; } + if (!Num->getType()->isFloatTy()) + return nullptr; + Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst); if (Rcp) return Rcp; @@ -811,7 +900,8 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { return false; Type *Ty = FDiv.getType()->getScalarType(); - if (!Ty->isFloatTy()) + const bool IsFloat = Ty->isFloatTy(); + if (!IsFloat && !Ty->isDoubleTy()) return false; // The f64 rcp/rsq approximations are pretty inaccurate. We can do an @@ -832,10 +922,14 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { DenII->hasOneUse()) { const auto *SqrtOp = cast<FPMathOperator>(DenII); SqrtFMF = SqrtOp->getFastMathFlags(); - if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) + if (canOptimizeWithRsq(DivFMF, SqrtFMF)) RsqOp = SqrtOp->getOperand(0); } + // rcp path not yet implemented for f64. + if (!IsFloat && !RsqOp) + return false; + // Inaccurate rcp is allowed with afn. // // Defer to codegen to handle this. @@ -850,7 +944,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { return false; // Defer the correct implementations to codegen. 
- if (ReqdAccuracy < 1.0f) + if (IsFloat && ReqdAccuracy < 1.0f) return false; IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); @@ -929,13 +1023,13 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, Den->getType()->getScalarSizeInBits()); unsigned SSBits = Num->getType()->getScalarSizeInBits(); if (IsSigned) { - unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I); + unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT); // A sign bit needs to be reserved for shrinking. unsigned DivBits = SSBits - RHSSignBits + 1; if (DivBits > MaxDivBits) return SSBits; - unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I); + unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I); unsigned SignBits = std::min(LHSSignBits, RHSSignBits); DivBits = SSBits - SignBits + 1; @@ -944,7 +1038,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, // All bits are used for unsigned division for Num or Den in range // (SignedMax, UnsignedMax]. - KnownBits Known = computeKnownBits(Den, DL, AC, &I); + KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I)); if (Known.isNegative() || !Known.isNonNegative()) return SSBits; unsigned RHSSignBits = Known.countMinLeadingZeros(); @@ -952,7 +1046,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, if (DivBits > MaxDivBits) return SSBits; - Known = computeKnownBits(Num, DL, AC, &I); + Known = computeKnownBits(Num, SQ.getWithInstruction(&I)); if (Known.isNegative() || !Known.isNonNegative()) return SSBits; unsigned LHSSignBits = Known.countMinLeadingZeros(); @@ -1089,7 +1183,7 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, // If there's no wider mulhi, there's only a better expansion for powers of // two. // TODO: Should really know for each vector element. 
- if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT)) + if (isKnownToBeAPowerOfTwo(C, true, SQ.getWithInstruction(&I))) return true; return false; @@ -1099,7 +1193,8 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (BinOpDen->getOpcode() == Instruction::Shl && isa<Constant>(BinOpDen->getOperand(0)) && - isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) { + isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true, + SQ.getWithInstruction(&I))) { return true; } } @@ -1910,6 +2005,10 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); + case Intrinsic::amdgcn_mbcnt_lo: + return visitMbcntLo(I); + case Intrinsic::amdgcn_mbcnt_hi: + return visitMbcntHi(I); default: return false; } @@ -1984,7 +2083,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { // Match pattern for fract intrinsic in contexts where the nan check has been // optimized out (and hope the knowledge the source can't be nan wasn't lost). - if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI))) + if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I))) return false; IRBuilder<> Builder(&I); @@ -2090,6 +2189,110 @@ INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) +/// Create a workitem.id.x intrinsic call with range metadata. +CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const { + CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); + ST.makeLIDRangeMetadata(Tid); + return Tid; +} + +/// Replace the instruction with a direct workitem.id.x call. 
+void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const { + IRBuilder<> B(&I); + CallInst *Tid = createWorkitemIdX(B); + BasicBlock::iterator BI(&I); + ReplaceInstWithValue(BI, Tid); +} + +/// Replace the instruction with (workitem.id.x & mask). +void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX( + Instruction &I, unsigned WaveSize) const { + IRBuilder<> B(&I); + CallInst *Tid = createWorkitemIdX(B); + Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1); + Value *AndInst = B.CreateAnd(Tid, Mask); + BasicBlock::iterator BI(&I); + ReplaceInstWithValue(BI, AndInst); +} + +/// Try to optimize mbcnt instruction by replacing with workitem.id.x when +/// work group size allows direct computation of lane ID. +/// Returns true if optimization was applied, false otherwise. +bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I, + unsigned Wave) const { + std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0); + if (!MaybeX) + return false; + + // When work group size == wave_size, each work group contains exactly one + // wave, so the instruction can be replaced with workitem.id.x directly. + if (*MaybeX == Wave) { + replaceWithWorkitemIdX(I); + return true; + } + + // When work group evenly splits into waves, compute lane ID within wave + // using bit masking: lane_id = workitem.id.x & (wave_size - 1). + if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) { + replaceWithMaskedWorkitemIdX(I, Wave); + return true; + } + + return false; +} + +/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation. +bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const { + // This optimization only applies to wave32 targets where mbcnt.lo operates on + // the full execution mask. + if (!ST.isWave32()) + return false; + + // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with + // lower IDs. 
+ if (!match(&I, + m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero()))) + return false; + + return tryReplaceWithWorkitemId(I, ST.getWavefrontSize()); +} + +/// Optimize mbcnt.hi calls for lane ID computation. +bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const { + // Abort if wave size is not known at compile time. + if (!ST.isWaveSizeKnown()) + return false; + + unsigned Wave = ST.getWavefrontSize(); + + // On wave32, the upper 32 bits of execution mask are always 0, so + // mbcnt.hi(mask, val) always returns val unchanged. + if (ST.isWave32()) { + if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) { + // Replace mbcnt.hi(mask, val) with val only when work group size matches + // wave size (single wave per work group). + if (*MaybeX == Wave) { + BasicBlock::iterator BI(&I); + ReplaceInstWithValue(BI, I.getArgOperand(1)); + return true; + } + } + } + + // Optimize the complete lane ID computation pattern: + // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs + // across the full execution mask. 
+ using namespace PatternMatch; + + // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0)) + if (!match(&I, m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>( + m_AllOnes(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>( + m_AllOnes(), m_Zero())))) + return false; + + return tryReplaceWithWorkitemId(I, Wave); +} + char AMDGPUCodeGenPrepare::ID = 0; FunctionPass *llvm::createAMDGPUCodeGenPreparePass() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index e8b211f..7f00ead 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[ combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask ]>; +// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32), + [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)), + (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>; + +def combine_or_s64_s32 : GICombineRule< + (defs root:$dst), + (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst), + (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x), + (G_OR $or, $x_lo, $y), + (G_MERGE_VALUES $dst, $or, $x_hi))>; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. 
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner< "AMDGPUPreLegalizerCombinerImpl", [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, foldable_fneg, combine_shuffle_vector_to_build_vector, - binop_s64_with_s32_mask_combines]> { + binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } @@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64, - binop_s64_with_s32_mask_combines]> { + binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index d23521c..77be58c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -143,8 +143,7 @@ static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, } static bool mayIgnoreSignedZero(MachineInstr &MI) { - const TargetOptions &Options = MI.getMF()->getTarget().Options; - return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz); + return MI.getFlag(MachineInstr::MIFlag::FmNsz); } static bool isInv2Pi(const APFloat &APF) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index d14b5ce..f538769 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", + "HasFP64", "true", "Enable double precision operations" >; def FeatureFMA : SubtargetFeature<"fmaf", - "FMA", + "HasFMA", "true", "Enable single precision FMA (not as 
fast as mul+add, but fused)" >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index bb4bf74..cfef046 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -288,6 +288,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>; // FIXME: Check MMO is atomic def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>; def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>; +def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>; +def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>; def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>; def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>; @@ -308,6 +310,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32, SIbuffer_atomic_csub>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>; @@ -326,6 +329,12 @@ def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>; // G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return, // so we don't mark it as equivalent. 
+def : GINodeEquiv<G_AMDGPU_SPONENTRY, sponentry>; + +def : GINodeEquiv<G_AMDGPU_FLAT_LOAD_MONITOR, AMDGPUflat_load_monitor>; +def : GINodeEquiv<G_AMDGPU_GLOBAL_LOAD_MONITOR, AMDGPUglobal_load_monitor>; + + class GISelSop2Pat < SDPatternOperator node, Instruction inst, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 1b4b113..6bad4dbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -131,7 +131,7 @@ protected: public: MetadataStreamerMsgPackV4() = default; - ~MetadataStreamerMsgPackV4() = default; + ~MetadataStreamerMsgPackV4() override = default; bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; @@ -154,7 +154,7 @@ protected: public: MetadataStreamerMsgPackV5() = default; - ~MetadataStreamerMsgPackV5() = default; + ~MetadataStreamerMsgPackV5() override = default; }; class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 { @@ -163,7 +163,7 @@ protected: public: MetadataStreamerMsgPackV6() = default; - ~MetadataStreamerMsgPackV6() = default; + ~MetadataStreamerMsgPackV6() override = default; void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF, msgpack::MapDocNode Kern) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp new file mode 100644 index 0000000..37f8678 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp @@ -0,0 +1,77 @@ +//===--- AMDGPUHazardLatency.cpp - AMDGPU Hazard Latency Adjustment -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a DAG scheduling mutation to adjust the +/// latency of data edges between instructions which use registers +/// potentially subject to additional hazard waits not accounted +/// for in the normal scheduling model. +/// While the scheduling model is typically still accurate in these +/// scenarios, adjusting latency of relevant edges can improve wait +/// merging and reduce pipeline impact of any required waits. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUHazardLatency.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" + +using namespace llvm; + +namespace { + +class HazardLatency : public ScheduleDAGMutation { +private: + const GCNSubtarget &ST; + const SIRegisterInfo &TRI; + const MachineRegisterInfo &MRI; + +public: + HazardLatency(MachineFunction *MF) + : ST(MF->getSubtarget<GCNSubtarget>()), TRI(*ST.getRegisterInfo()), + MRI(MF->getRegInfo()) {} + void apply(ScheduleDAGInstrs *DAG) override; +}; + +void HazardLatency::apply(ScheduleDAGInstrs *DAG) { + constexpr unsigned MaskLatencyBoost = 3; + + // Hazard only manifests in Wave64 + if (!ST.hasVALUMaskWriteHazard() || !ST.isWave64()) + return; + + for (SUnit &SU : DAG->SUnits) { + const MachineInstr *MI = SU.getInstr(); + if (!SIInstrInfo::isVALU(*MI)) + continue; + if (MI->getOpcode() == AMDGPU::V_READLANE_B32 || + MI->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) + continue; + for (SDep &SuccDep : SU.Succs) { + if (SuccDep.isCtrl()) + continue; + // Boost latency on VALU writes to SGPRs used by VALUs. + // Reduce risk of premature VALU pipeline stall on associated reads. 
+ MachineInstr *DestMI = SuccDep.getSUnit()->getInstr(); + if (!SIInstrInfo::isVALU(*DestMI)) + continue; + Register Reg = SuccDep.getReg(); + if (!TRI.isSGPRReg(MRI, Reg)) + continue; + SuccDep.setLatency(SuccDep.getLatency() * MaskLatencyBoost); + } + } +} + +} // end namespace + +std::unique_ptr<ScheduleDAGMutation> +llvm::createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF) { + return std::make_unique<HazardLatency>(MF); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h new file mode 100644 index 0000000..134cc27 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h @@ -0,0 +1,24 @@ +//===- AMDGPUHazardLatency.h - Hazard Latency Adjustment --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +class MachineFunction; + +std::unique_ptr<ScheduleDAGMutation> +createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 5700468..10ffbe2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -24,6 +24,8 @@ #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include <type_traits> + using namespace llvm; #define DEBUG_TYPE "igrouplp" @@ -1044,7 +1046,7 @@ private: if (!SyncPipe.size()) return false; - auto SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) { + unsigned 
SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); if (SuccSize >= Size) @@ -1052,7 +1054,7 @@ private: if (HasIntermediary) { for (auto Succ : SU->Succs) { - auto SuccSize = + unsigned SuccSize = llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) { return SuccSucc.getKind() == SDep::Data; }); @@ -1084,7 +1086,7 @@ private: if (!SyncPipe.size()) return false; - auto SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) { + unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); if (SuccSize >= Size) @@ -1092,7 +1094,7 @@ private: if (HasIntermediary) { for (auto Succ : SU->Succs) { - auto SuccSize = + unsigned SuccSize = llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) { return SuccSucc.getKind() == SDep::Data; }); @@ -1968,7 +1970,7 @@ private: int NumBits = 0; auto TRI = TII->getRegisterInfo(); - auto &MRI = MI->getParent()->getParent()->getRegInfo(); + auto &MRI = MI->getMF()->getRegInfo(); for (auto &Elt : Collection) { auto Op = Elt->getInstr()->getOperand(0); auto Size = @@ -2183,7 +2185,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); // Interleave MFMA with DS_READ prefetch - for (unsigned I = 0; I < DSRCount - 4; ++I) { + for (unsigned I = 4; I < DSRCount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); @@ -2196,7 +2198,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( // Phase 2a: Loop carried dependency with V_PERM // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they // depend on. Interleave MFMA to keep XDL unit busy throughout. 
- for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { + for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); @@ -2233,7 +2235,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( // Phase 2b: Loop carried dependency without V_PERM // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. // Interleave MFMA to keep XDL unit busy throughout. - for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { + for (unsigned I = DSWWithPermCount; I < DSWCount; I++) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); @@ -2391,6 +2393,61 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const { if (MI.isMetaInstruction()) Result = false; + else if (MI.isInlineAsm()) { + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + auto &MRI = MI.getParent()->getParent()->getRegInfo(); + bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false, + VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(), + MayStore = MI.mayStore(); + for (const MachineOperand &Operand : MI.operands()) + if (Operand.isReg()) { + const TargetRegisterClass &RegClass = + *TRI.getRegClassForOperandReg(MRI, Operand); + if (TRI.hasVGPRs(&RegClass)) { + VGPR_used = true; + if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32) + VReg32_used = true; + } + // > 128 bit registers are usually only used by MFMA instructions, so + // we're using that as a heuristic to guess the schedule group mask of + // the inline asm. 
+ if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128) + VMFMA_used = true; + if (TRI.hasSGPRs(&RegClass)) + SGPR_used = true; + if (TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef()) + SGPR_big_def = true; + } + + typedef std::underlying_type_t<SchedGroupMask> SGMask_t; + SGMask_t InlineAsmMask = 0; + if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore) + InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU; + if (SGPR_used && !VGPR_used && !MayLoad && !MayStore) + InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU; + if (VMFMA_used) + InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA; + if (VGPR_used && MayLoad) + InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ + : SchedGroupMask::VMEM_READ); + if (VGPR_used && MayStore) + InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE + : SchedGroupMask::VMEM_WRITE); + if (SGPR_big_def) + InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ; + if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU || + InlineAsmMask & (SGMask_t)SchedGroupMask::SALU) + InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU; + if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ || + InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE) + InlineAsmMask |= (SGMask_t)SchedGroupMask::DS; + if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ || + InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE) + InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM; + + Result = ((SGMask_t)SGMask & InlineAsmMask) != 0; + } + else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || TII->isTRANS(MI))) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index aff7096..0688f07 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -11,7 +11,6 @@ #include "llvm/CodeGen/ScheduleDAGMutation.h" #include <memory> -#include <vector> namespace llvm { diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d..238f06f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -134,7 +134,7 @@ static SDValue stripExtractLoElt(SDValue In) { INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) -INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) +INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfoWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) #ifdef EXPENSIVE_CHECKS @@ -238,7 +238,7 @@ bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) { } void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AMDGPUArgumentUsageInfo>(); + AU.addRequired<AMDGPUArgumentUsageInfoWrapperLegacy>(); AU.addRequired<UniformityInfoWrapperPass>(); #ifdef EXPENSIVE_CHECKS AU.addRequired<DominatorTreeWrapperPass>(); @@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); } +SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N, + SelectionDAG &DAG) const { + // TODO: Handle undef as zero + + assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + SDLoc SL(N); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); + return DAG.getMachineNode( + isVGPRImm(N) ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL, + N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); + } + + return nullptr; +} + void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); @@ -708,10 +726,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); assert(VT.getVectorElementType().bitsEq(MVT::i32)); - unsigned RegClassID = - SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID(); - SelectBuildVector(N, RegClassID); + const TargetRegisterClass *RegClass = + N->isDivergent() + ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32) + : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32); + + SelectBuildVector(N, RegClass->getID()); return; } case ISD::VECTOR_SHUFFLE: @@ -1284,7 +1306,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, // FIXME: Select to VOP3 version for with-carry. 
unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { + if (Subtarget->hasAddNoCarryInsts()) { SubOp = AMDGPU::V_SUB_U32_e64; Opnds.push_back( CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit @@ -1469,7 +1491,7 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base, Opnds.push_back(Zero); Opnds.push_back(Addr.getOperand(1)); unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { + if (Subtarget->hasAddNoCarryInsts()) { SubOp = AMDGPU::V_SUB_U32_e64; Opnds.push_back( CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit @@ -1828,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. - // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0. 
- - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector<SDValue, 3> Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. + bool IsInBounds = + Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? 
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. 
+ + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector<SDValue, 3> Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarryInsts()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable? + SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } @@ -3047,9 +3080,38 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); const unsigned Opc = gwsIntrinToOpcode(IntrID); 
+ + const MCInstrDesc &InstrDesc = TII->get(Opc); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + + const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx); + SmallVector<SDValue, 5> Ops; - if (HasVSrc) - Ops.push_back(N->getOperand(2)); + if (HasVSrc) { + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + + SDValue Data = N->getOperand(2); + MVT DataVT = Data.getValueType().getSimpleVT(); + if (TRI->isTypeLegalForClass(*DataRC, DataVT)) { + // Normal 32-bit case. + Ops.push_back(N->getOperand(2)); + } else { + // Operand is really 32-bits, but requires 64-bit alignment, so use the + // even aligned 64-bit register class. + const SDValue RegSeqOps[] = { + CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data, + CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32), + 0), + CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)}; + + Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + SL, MVT::v2i32, RegSeqOps), + 0)); + } + } + Ops.push_back(OffsetField); Ops.push_back(Chain); @@ -4222,7 +4284,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In, SmallVector<SDValue, 3> Backup(Src.begin(), Src.end()); if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) { - Src = Backup; + Src = std::move(Backup); return std::make_pair(0, 0); } @@ -4387,16 +4449,23 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const { const auto *Ld = cast<LoadSDNode>(N); - const MachineMemOperand *MMO = Ld->getMemOperand(); - if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO)) + + // FIXME: We ought to able able to take the direct isDivergent result. We + // cannot rely on the MMO for a uniformity check, and should stop using + // it. 
This is a hack for 2 ways that the IR divergence analysis is superior + // to the DAG divergence: Recognizing shift-of-workitem-id as always + // uniform, and isSingleLaneExecution. These should be handled in the DAG + // version, and then this can be dropped. + if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO)) return false; return MMO->getSize().hasValue() && Ld->getAlign() >= Align(std::min(MMO->getSize().getValue().getKnownMinValue(), uint64_t(4))) && - ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + (MMO->isInvariant() || + (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 4fa0d3f..a86b754 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H +#include "AMDGPUSelectionDAGInfo.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIModeRegisterDefaults.h" @@ -45,21 +46,6 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { return false; } -// TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); - uint32_t LHSVal, RHSVal; - if (getConstantValue(N->getOperand(0), LHSVal) && - getConstantValue(N->getOperand(1), RHSVal)) { - SDLoc SL(N); - uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); - return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), - DAG.getTargetConstant(K, SL, MVT::i32)); - } - - return nullptr; -} - /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. 
class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -115,6 +101,8 @@ private: MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; + SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; SDNode *glueCopyToM0LDSInit(SDNode *N) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a6..da21033 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -17,6 +17,7 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUMachineFunction.h" #include "AMDGPUMemoryUtils.h" +#include "AMDGPUSelectionDAGInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" @@ -59,8 +60,9 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { } AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, - const AMDGPUSubtarget &STI) - : TargetLowering(TM), Subtarget(&STI) { + const TargetSubtargetInfo &STI, + const AMDGPUSubtarget &AMDGPUSTI) + : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) { // Always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather then generating calls to memset, mempcy or memmove. 
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U; @@ -336,6 +338,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand); @@ -424,22 +427,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, Expand); setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand); - - if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); - } else { - setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); - setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); - } + setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16, Custom); setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal); - if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal); - } // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches // scalarization code. Can be removed when IS_FPCLASS expand isn't called by @@ -451,11 +445,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::v16f64}, Custom); - if (isTypeLegal(MVT::f16)) - setOperationAction(ISD::IS_FPCLASS, - {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16}, - Custom); - // Expand to fneg + fadd. 
setOperationAction(ISD::FSUB, MVT::f64, Expand); @@ -478,7 +467,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, Custom); - setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction({ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, MVT::f64, + Expand); setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; @@ -502,16 +492,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // The hardware supports 32-bit FSHR, but not FSHL. setOperationAction(ISD::FSHR, MVT::i32, Legal); - // The hardware supports 32-bit ROTR, but not ROTL. - setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); + setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand); setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand); setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand); - setOperationAction( - {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, - MVT::i64, Custom); + setOperationAction({ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, + ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT}, + MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, @@ -530,19 +519,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, for (MVT VT : VectorIntTypes) { // Expand the following operations for the current type by default. 
- setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT, - ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU, - ISD::MULHS, ISD::OR, ISD::SHL, - ISD::SRA, ISD::SRL, ISD::ROTL, - ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, - ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV, - ISD::SREM, ISD::UREM, ISD::SMUL_LOHI, - ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM, - ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC, - ISD::XOR, ISD::BSWAP, ISD::CTPOP, - ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE, - ISD::SETCC, ISD::ADDRSPACECAST}, + // clang-format off + setOperationAction({ISD::ADD, ISD::AND, + ISD::FP_TO_SINT, ISD::FP_TO_UINT, + ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, + ISD::MUL, ISD::MULHU, + ISD::MULHS, ISD::OR, + ISD::SHL, ISD::SRA, + ISD::SRL, ISD::ROTL, + ISD::ROTR, ISD::SUB, + ISD::SINT_TO_FP, ISD::UINT_TO_FP, + ISD::SDIV, ISD::UDIV, + ISD::SREM, ISD::UREM, + ISD::SMUL_LOHI, ISD::UMUL_LOHI, + ISD::SDIVREM, ISD::UDIVREM, + ISD::SELECT, ISD::VSELECT, + ISD::SELECT_CC, ISD::XOR, + ISD::BSWAP, ISD::CTPOP, + ISD::CTTZ, ISD::CTLZ, + ISD::VECTOR_SHUFFLE, ISD::SETCC, + ISD::ADDRSPACECAST}, VT, Expand); + // clang-format on } static const MVT::SimpleValueType FloatVectorTypes[] = { @@ -643,9 +641,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, } bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { - if (getTargetMachine().Options.NoSignedZerosFPMath) - return true; - const auto Flags = Op.getNode()->getFlags(); if (Flags.hasNoSignedZeros()) return true; @@ -820,9 +815,7 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { // FIXME: Why are we reporting vectors of FP immediates as legal? bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { - EVT ScalarVT = VT.getScalarType(); - return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || - (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); + return isTypeLegal(VT.getScalarType()); } // We don't want to shrink f64 / f32 constants. 
@@ -966,8 +959,8 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); // Packed operations do not have a fabs modifier. - return VT == MVT::f32 || VT == MVT::f64 || - (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16)); + // Report this based on the end legalized type. + return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16; } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { @@ -1056,8 +1049,9 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: - if (Subtarget->has16BitInsts() && - (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) { + if (isTypeLegal(MVT::i16) && + (!DestVT.isVector() || + !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P // Don't narrow back down to i16 if promoted to i32 already. if (!N->isDivergent() && DestVT.isInteger() && DestVT.getScalarSizeInBits() > 1 && @@ -1216,9 +1210,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( const SmallVectorImpl<ISD::InputArg> &Ins) const { const MachineFunction &MF = State.getMachineFunction(); const Function &Fn = MF.getFunction(); - LLVMContext &Ctx = Fn.getParent()->getContext(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); - const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(); + LLVMContext &Ctx = Fn.getContext(); + const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(); CallingConv::ID CC = Fn.getCallingConv(); Align MaxAlign = Align(1); @@ -1248,7 +1241,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( SmallVector<EVT, 16> ValueVTs; SmallVector<uint64_t, 16> Offsets; - ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr, + &Offsets, ArgOffset); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { @@ -1409,7 +1403,12 @@ SDValue 
AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, InVals.push_back(DAG.getPOISON(Arg.VT)); } - return DAG.getEntryNode(); + // FIXME: Hack because R600 doesn't handle callseq pseudos yet. + if (getTargetMachine().getTargetTriple().getArch() == Triple::r600) + return CLI.Chain; + + SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL); + return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL); } SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, @@ -1465,6 +1464,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: + return LowerFP_TO_INT_SAT(Op, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: @@ -1528,7 +1530,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, if (std::optional<uint32_t> Address = AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { if (IsNamedBarrier) { - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16; MFI->recordNumNamedBarriers(Address.value(), BarCnt); } return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType()); @@ -1885,14 +1887,14 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, Align BaseAlign = Load->getAlign(); Align HiAlign = commonAlignment(BaseAlign, Size); - SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, - Load->getChain(), BasePtr, SrcValue, LoMemVT, - BaseAlign, Load->getMemOperand()->getFlags()); + SDValue LoLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, + LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo()); SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size)); - SDValue HiLoad = - DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), - HiPtr, 
SrcValue.getWithOffset(LoMemVT.getStoreSize()), - HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); + SDValue HiLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign, + Load->getMemOperand()->getFlags(), Load->getAAInfo()); SDValue Join; if (LoVT == HiVT) { @@ -1980,10 +1982,10 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, - Store->getMemOperand()->getFlags()); - SDValue HiStore = - DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), - HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); + Store->getMemOperand()->getFlags(), Store->getAAInfo()); + SDValue HiStore = DAG.getTruncStore( + Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign, + Store->getMemOperand()->getFlags(), Store->getAAInfo()); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } @@ -2628,11 +2630,18 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) { return Src.getOperand(0).getValueType() == MVT::f16; case ISD::FP16_TO_FP: case ISD::FFREXP: + case ISD::FSQRT: + case AMDGPUISD::LOG: + case AMDGPUISD::EXP: return true; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = Src.getConstantOperandVal(0); switch (IntrinsicID) { case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_log_clamp: + case Intrinsic::amdgcn_exp2: + case Intrinsic::amdgcn_sqrt: return true; default: return false; @@ -2731,7 +2740,7 @@ SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f16) { // Nothing in half is a denormal when promoted to f32. 
- assert(!Subtarget->has16BitInsts()); + assert(!isTypeLegal(VT)); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags); return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, @@ -2764,20 +2773,18 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, EVT VT = Op.getValueType(); SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; assert(IsLog10 || Op.getOpcode() == ISD::FLOG); - const auto &Options = getTargetMachine().Options; if (VT == MVT::f16 || Flags.hasApproximateFuncs()) { - if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { + if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) { // Log and multiply in f32 is good enough for f16. X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags); } SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags); - if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { + if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) { return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered, DAG.getTargetConstant(0, DL, MVT::i32), Flags); } @@ -2803,7 +2810,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); - + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. 
+ Flags.setAllowContract(false); R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); @@ -2826,15 +2835,16 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); - + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. + Flags.setAllowContract(false); SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags); R = getMad(DAG, DL, VT, YH, CH, Mad1); } - const bool IsFiniteOnly = - (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs(); + const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs(); // TODO: Check if known finite from source value. if (!IsFiniteOnly) { @@ -2910,7 +2920,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f16) { // Nothing in half is a denormal when promoted to f32. 
- assert(!Subtarget->has16BitInsts()); + assert(!isTypeLegal(MVT::f16)); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags); return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, @@ -2950,19 +2960,28 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); } +SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL, + SelectionDAG &DAG, + SDNodeFlags Flags, + bool IsExp10) const { + // exp(x) -> exp2(M_LOG2E_F * x); + // exp10(x) -> exp2(log2(10) * x); + EVT VT = X.getValueType(); + SDValue Const = + DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags); + return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP + : (unsigned)ISD::FEXP2, + SL, VT, Mul, Flags); +} + SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const { EVT VT = X.getValueType(); - const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); - - if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { - // exp2(M_LOG2E_F * f); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags); - return DAG.getNode(VT == MVT::f32 ? 
(unsigned)AMDGPUISD::EXP - : (unsigned)ISD::FEXP2, - SL, VT, Mul, Flags); - } + if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) + return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false); EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); @@ -2976,6 +2995,7 @@ SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, SDValue AdjustedX = DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); + const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags); SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags); @@ -2994,6 +3014,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const { const EVT VT = X.getValueType(); + const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP) : static_cast<unsigned>(ISD::FEXP2); @@ -3050,33 +3071,32 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { SDNodeFlags Flags = Op->getFlags(); const bool IsExp10 = Op.getOpcode() == ISD::FEXP10; - if (VT.getScalarType() == MVT::f16) { - // v_exp_f16 (fmul x, log2e) - if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast? - return lowerFEXPUnsafe(X, SL, DAG, Flags); + // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying + // library behavior. Also, is known-not-daz source sufficient? + if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast? + return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags) + : lowerFEXPUnsafe(X, SL, DAG, Flags); + } + if (VT.getScalarType() == MVT::f16) { if (VT.isVector()) return SDValue(); + // Nothing in half is a denormal when promoted to f32. + // // exp(f16 x) -> // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) - - // Nothing in half is a denormal when promoted to f32. 
+ // + // exp10(f16 x) -> + // fptrunc (v_exp_f32 (fmul (fpext x), log2(10))) SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags); - SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags); + SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10); return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered, DAG.getTargetConstant(0, SL, MVT::i32), Flags); } assert(VT == MVT::f32); - // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying - // library behavior. Also, is known-not-daz source sufficient? - if (allowApproxFunc(DAG, Flags)) { - return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags) - : lowerFEXPUnsafe(X, SL, DAG, Flags); - } - // Algorithm: // // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) @@ -3369,8 +3389,9 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, // Get the 32-bit normalized integer. Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust); // Convert the normalized 32-bit integer into f32. - unsigned Opc = - (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + + bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32); + unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm); // Finally, need to scale back the converted floating number as the original @@ -3378,7 +3399,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32), ShAmt); // On GCN, use LDEXP directly. 
- if (Subtarget->isGCN()) + if (UseLDEXP) return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent @@ -3445,7 +3466,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, if (SrcVT != MVT::i64) return Op; - if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) { SDLoc DL(Op); SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); @@ -3493,7 +3514,7 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, // TODO: Factor out code common with LowerUINT_TO_FP. - if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) { SDLoc DL(Op); SDValue Src = Op.getOperand(0); @@ -3737,6 +3758,86 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, return SDValue(); } +SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + unsigned OpOpcode = Op.getOpcode(); + EVT SrcVT = Src.getValueType(); + EVT DstVT = Op.getValueType(); + SDValue SatVTOp = Op.getNode()->getOperand(1); + EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT(); + SDLoc DL(Op); + + uint64_t DstWidth = DstVT.getScalarSizeInBits(); + uint64_t SatWidth = SatVT.getScalarSizeInBits(); + assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width"); + + // Will be selected natively + if (DstVT == MVT::i32 && SatWidth == DstWidth && + (SrcVT == MVT::f32 || SrcVT == MVT::f64)) + return Op; + + const SDValue Int32VT = DAG.getValueType(MVT::i32); + + // Perform all saturation at i32 and truncate + if (SatWidth < DstWidth) { + const uint64_t Int32Width = 32; + SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, Int32VT); + SDValue Int32SatVal; + + if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { + SDValue MinConst = DAG.getConstant( + APInt::getSignedMaxValue(SatWidth).sext(Int32Width), DL, MVT::i32); + SDValue MaxConst 
= DAG.getConstant( + APInt::getSignedMinValue(SatWidth).sext(Int32Width), DL, MVT::i32); + SDValue MinVal = + DAG.getNode(ISD::SMIN, DL, MVT::i32, FpToInt32, MinConst); + Int32SatVal = DAG.getNode(ISD::SMAX, DL, MVT::i32, MinVal, MaxConst); + } else { + SDValue MinConst = DAG.getConstant( + APInt::getMaxValue(SatWidth).zext(Int32Width), DL, MVT::i32); + Int32SatVal = DAG.getNode(ISD::UMIN, DL, MVT::i32, FpToInt32, MinConst); + } + + if (DstWidth == Int32Width) + return Int32SatVal; + if (DstWidth < Int32Width) + return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Int32SatVal); + + // DstWidth > Int32Width + const unsigned Ext = + OpOpcode == ISD::FP_TO_SINT_SAT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + return DAG.getNode(Ext, DL, DstVT, FpToInt32); + } + + // SatWidth == DstWidth + + // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below) + if (DstVT == MVT::i64 && + (SrcVT == MVT::f16 || SrcVT == MVT::bf16 || + (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) { + return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VT); + } + + // Promote f16/bf16 src to f32 + if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) { + SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp); + } + + // Promote sub-i32 dst to i32 with sub-i32 saturation + if (DstWidth < 32) { + // Note: this triggers SatWidth < DstWidth above to generate saturated + // truncate by requesting MVT::i32 destination with SatWidth < 32. + SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, SatVTOp); + return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt32); + } + + // TODO: can we implement i64 dst for f32/f64? 
+ + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); @@ -4125,8 +4226,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, EVT ElementType = VT.getScalarType(); EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext()); - EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType) - : TargetScalarType; + EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType); if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits()) return SDValue(); @@ -4190,8 +4290,7 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, EVT ElementType = VT.getScalarType(); EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext()); - EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType) - : TargetScalarType; + EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType); if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits()) return SDValue(); @@ -4312,8 +4411,7 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, EVT ElementType = VT.getScalarType(); EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext()); - EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType) - : TargetScalarType; + EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType); if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits()) return SDValue(); @@ -4547,7 +4645,7 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, } // There are i16 integer mul/mad. 
- if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) + if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16)) return SDValue(); // SimplifyDemandedBits has the annoying habit of turning useful zero_extends @@ -4666,7 +4764,7 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); - if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) + if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24()) return SDValue(); // Don't generate 24-bit multiplies on values that are in SGPRs, since @@ -4675,7 +4773,7 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, // value is in an SGPR. // This doesn't apply if no s_mul_hi is available (since we'll end up with a // valu op anyway) - if (Subtarget->hasSMulHi() && !N->isDivergent()) + if (!N->isDivergent() && Subtarget->hasSMulHi()) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -4700,9 +4798,7 @@ SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, const SDLoc &DL, unsigned Opc) const { EVT VT = Op.getValueType(); - EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); - if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && - LegalVT != MVT::i16)) + if (VT.bitsGT(MVT::i32)) return SDValue(); if (VT != MVT::i32) @@ -4999,7 +5095,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, SDLoc SL(N); switch (Opc) { case ISD::FADD: { - if (!mayIgnoreSignedZero(N0)) + if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros()) return SDValue(); // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) @@ -5047,7 +5143,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, case ISD::FMA: case ISD::FMAD: { // TODO: handle llvm.amdgcn.fma.legacy - if (!mayIgnoreSignedZero(N0)) + if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros()) return SDValue(); // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) @@ -5259,7 +5355,7 @@ SDValue 
AMDGPUTargetLowering::performFAbsCombine(SDNode *N, switch (N0.getOpcode()) { case ISD::FP16_TO_FP: { - assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); + assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal"); SDLoc SL(N); SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); @@ -5459,7 +5555,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, } if ((OffsetVal + WidthVal) >= 32 && - !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { + !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) { SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); @@ -5649,169 +5745,6 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); } -#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; - -const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((AMDGPUISD::NodeType)Opcode) { - case AMDGPUISD::FIRST_NUMBER: break; - // AMDIL DAG nodes - NODE_NAME_CASE(BRANCH_COND); - - // AMDGPU DAG nodes - NODE_NAME_CASE(IF) - NODE_NAME_CASE(ELSE) - NODE_NAME_CASE(LOOP) - NODE_NAME_CASE(CALL) - NODE_NAME_CASE(TC_RETURN) - NODE_NAME_CASE(TC_RETURN_GFX) - NODE_NAME_CASE(TC_RETURN_GFX_WholeWave) - NODE_NAME_CASE(TC_RETURN_CHAIN) - NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR) - NODE_NAME_CASE(TRAP) - NODE_NAME_CASE(RET_GLUE) - NODE_NAME_CASE(WAVE_ADDRESS) - NODE_NAME_CASE(RETURN_TO_EPILOG) - NODE_NAME_CASE(ENDPGM) - NODE_NAME_CASE(ENDPGM_TRAP) - NODE_NAME_CASE(SIMULATED_TRAP) - NODE_NAME_CASE(DWORDADDR) - NODE_NAME_CASE(FRACT) - NODE_NAME_CASE(SETCC) - NODE_NAME_CASE(DENORM_MODE) - NODE_NAME_CASE(FMA_W_CHAIN) - NODE_NAME_CASE(FMUL_W_CHAIN) - NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(COS_HW) - NODE_NAME_CASE(SIN_HW) - NODE_NAME_CASE(FMAX_LEGACY) - NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(FMAX3) - 
NODE_NAME_CASE(SMAX3) - NODE_NAME_CASE(UMAX3) - NODE_NAME_CASE(FMIN3) - NODE_NAME_CASE(SMIN3) - NODE_NAME_CASE(UMIN3) - NODE_NAME_CASE(FMED3) - NODE_NAME_CASE(SMED3) - NODE_NAME_CASE(UMED3) - NODE_NAME_CASE(FMAXIMUM3) - NODE_NAME_CASE(FMINIMUM3) - NODE_NAME_CASE(FDOT2) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DIV_SCALE) - NODE_NAME_CASE(DIV_FMAS) - NODE_NAME_CASE(DIV_FIXUP) - NODE_NAME_CASE(FMAD_FTZ) - NODE_NAME_CASE(RCP) - NODE_NAME_CASE(RSQ) - NODE_NAME_CASE(RCP_LEGACY) - NODE_NAME_CASE(RCP_IFLAG) - NODE_NAME_CASE(LOG) - NODE_NAME_CASE(EXP) - NODE_NAME_CASE(FMUL_LEGACY) - NODE_NAME_CASE(RSQ_CLAMP) - NODE_NAME_CASE(FP_CLASS) - NODE_NAME_CASE(DOT4) - NODE_NAME_CASE(CARRY) - NODE_NAME_CASE(BORROW) - NODE_NAME_CASE(BFE_U32) - NODE_NAME_CASE(BFE_I32) - NODE_NAME_CASE(BFI) - NODE_NAME_CASE(BFM) - NODE_NAME_CASE(FFBH_U32) - NODE_NAME_CASE(FFBH_I32) - NODE_NAME_CASE(FFBL_B32) - NODE_NAME_CASE(MUL_U24) - NODE_NAME_CASE(MUL_I24) - NODE_NAME_CASE(MULHI_U24) - NODE_NAME_CASE(MULHI_I24) - NODE_NAME_CASE(MAD_U24) - NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(MAD_I64_I32) - NODE_NAME_CASE(MAD_U64_U32) - NODE_NAME_CASE(PERM) - NODE_NAME_CASE(TEXTURE_FETCH) - NODE_NAME_CASE(R600_EXPORT) - NODE_NAME_CASE(CONST_ADDRESS) - NODE_NAME_CASE(REGISTER_LOAD) - NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(CVT_F32_UBYTE0) - NODE_NAME_CASE(CVT_F32_UBYTE1) - NODE_NAME_CASE(CVT_F32_UBYTE2) - NODE_NAME_CASE(CVT_F32_UBYTE3) - NODE_NAME_CASE(CVT_PKRTZ_F16_F32) - NODE_NAME_CASE(CVT_PKNORM_I16_F32) - NODE_NAME_CASE(CVT_PKNORM_U16_F32) - NODE_NAME_CASE(CVT_PK_I16_I32) - NODE_NAME_CASE(CVT_PK_U16_U32) - NODE_NAME_CASE(FP_TO_FP16) - NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) - NODE_NAME_CASE(CONST_DATA_PTR) - NODE_NAME_CASE(PC_ADD_REL_OFFSET) - NODE_NAME_CASE(PC_ADD_REL_OFFSET64) - NODE_NAME_CASE(LDS) - NODE_NAME_CASE(DUMMY_CHAIN) - NODE_NAME_CASE(LOAD_D16_HI) - NODE_NAME_CASE(LOAD_D16_LO) - NODE_NAME_CASE(LOAD_D16_HI_I8) - NODE_NAME_CASE(LOAD_D16_HI_U8) - NODE_NAME_CASE(LOAD_D16_LO_I8) - 
NODE_NAME_CASE(LOAD_D16_LO_U8) - NODE_NAME_CASE(STORE_MSKOR) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) - NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) - NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) - NODE_NAME_CASE(DS_ORDERED_COUNT) - NODE_NAME_CASE(ATOMIC_CMP_SWAP) - NODE_NAME_CASE(BUFFER_LOAD) - NODE_NAME_CASE(BUFFER_LOAD_UBYTE) - NODE_NAME_CASE(BUFFER_LOAD_USHORT) - NODE_NAME_CASE(BUFFER_LOAD_BYTE) - NODE_NAME_CASE(BUFFER_LOAD_SHORT) - NODE_NAME_CASE(BUFFER_LOAD_TFE) - NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE) - NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE) - NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) - NODE_NAME_CASE(SBUFFER_LOAD) - NODE_NAME_CASE(SBUFFER_LOAD_BYTE) - NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) - NODE_NAME_CASE(SBUFFER_LOAD_SHORT) - NODE_NAME_CASE(SBUFFER_LOAD_USHORT) - NODE_NAME_CASE(SBUFFER_PREFETCH_DATA) - NODE_NAME_CASE(BUFFER_STORE) - NODE_NAME_CASE(BUFFER_STORE_BYTE) - NODE_NAME_CASE(BUFFER_STORE_SHORT) - NODE_NAME_CASE(BUFFER_STORE_FORMAT) - NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) - NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) - NODE_NAME_CASE(BUFFER_ATOMIC_ADD) - NODE_NAME_CASE(BUFFER_ATOMIC_SUB) - NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_AND) - NODE_NAME_CASE(BUFFER_ATOMIC_OR) - NODE_NAME_CASE(BUFFER_ATOMIC_XOR) - NODE_NAME_CASE(BUFFER_ATOMIC_INC) - NODE_NAME_CASE(BUFFER_ATOMIC_DEC) - NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) - NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) - NODE_NAME_CASE(BUFFER_ATOMIC_FADD) - NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) - NODE_NAME_CASE(WHOLE_WAVE_SETUP) - NODE_NAME_CASE(WHOLE_WAVE_RETURN) - } - return nullptr; -} - SDValue 
AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index bdaf486..adbc2c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -51,7 +51,6 @@ protected: /// Split a vector store into multiple scalar stores. /// \returns The resulting chain. - SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; @@ -78,6 +77,9 @@ protected: bool IsLog10, SDNodeFlags Flags) const; SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, + SDNodeFlags Flags, bool IsExp10) const; + SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const; SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, @@ -96,6 +98,7 @@ protected: SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const; @@ -180,7 +183,8 @@ protected: const SmallVectorImpl<ISD::InputArg> &Ins) const; public: - AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); + AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, + const AMDGPUSubtarget &AMDGPUSTI); bool mayIgnoreSignedZero(SDValue Op) const; @@ -280,8 +284,6 @@ public: SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - const char* getTargetNodeName(unsigned Opcode) const override; - // 
FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for // AMDGPU. Commit r319036, // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6) @@ -406,235 +408,6 @@ public: } }; -namespace AMDGPUISD { - -enum NodeType : unsigned { - // AMDIL ISD Opcodes - FIRST_NUMBER = ISD::BUILTIN_OP_END, - BRANCH_COND, - // End AMDIL ISD Opcodes - - // Function call. - CALL, - TC_RETURN, - TC_RETURN_GFX, - TC_RETURN_GFX_WholeWave, - TC_RETURN_CHAIN, - TC_RETURN_CHAIN_DVGPR, - TRAP, - - // Masked control flow nodes. - IF, - ELSE, - LOOP, - - // A uniform kernel return that terminates the wavefront. - ENDPGM, - - // s_endpgm, but we may want to insert it in the middle of the block. - ENDPGM_TRAP, - - // "s_trap 2" equivalent on hardware that does not support it. - SIMULATED_TRAP, - - // Return to a shader part's epilog code. - RETURN_TO_EPILOG, - - // Return with values from a non-entry function. - RET_GLUE, - - // Convert a unswizzled wave uniform stack address to an address compatible - // with a vector offset for use in stack access. - WAVE_ADDRESS, - - DWORDADDR, - FRACT, - - /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output - /// modifier behavior with dx10_enable. - CLAMP, - - // This is SETCC with the full mask result which is used for a compare with a - // result bit per item in the wavefront. - SETCC, - - DENORM_MODE, - - // FP ops with input and output chain. - FMA_W_CHAIN, - FMUL_W_CHAIN, - - // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. - // Denormals handled on some parts. - COS_HW, - SIN_HW, - FMAX_LEGACY, - FMIN_LEGACY, - - FMAX3, - SMAX3, - UMAX3, - FMIN3, - SMIN3, - UMIN3, - FMED3, - SMED3, - UMED3, - FMAXIMUM3, - FMINIMUM3, - FDOT2, - URECIP, - DIV_SCALE, - DIV_FMAS, - DIV_FIXUP, - // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is - // treated as an illegal operation. 
- FMAD_FTZ, - - // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. - // For f64, max error 2^29 ULP, handles denormals. - RCP, - RSQ, - RCP_LEGACY, - RCP_IFLAG, - - // log2, no denormal handling for f32. - LOG, - - // exp2, no denormal handling for f32. - EXP, - - FMUL_LEGACY, - RSQ_CLAMP, - FP_CLASS, - DOT4, - CARRY, - BORROW, - BFE_U32, // Extract range of bits with zero extension to 32-bits. - BFE_I32, // Extract range of bits with sign extension to 32-bits. - BFI, // (src0 & src1) | (~src0 & src2) - BFM, // Insert a range of bits into a 32-bit word. - FFBH_U32, // ctlz with -1 if input is zero. - FFBH_I32, - FFBL_B32, // cttz with -1 if input is zero. - MUL_U24, - MUL_I24, - MULHI_U24, - MULHI_I24, - MAD_U24, - MAD_I24, - MAD_U64_U32, - MAD_I64_I32, - PERM, - TEXTURE_FETCH, - R600_EXPORT, - CONST_ADDRESS, - REGISTER_LOAD, - REGISTER_STORE, - - // These cvt_f32_ubyte* nodes need to remain consecutive and in order. - CVT_F32_UBYTE0, - CVT_F32_UBYTE1, - CVT_F32_UBYTE2, - CVT_F32_UBYTE3, - - // Convert two float 32 numbers into a single register holding two packed f16 - // with round to zero. - CVT_PKRTZ_F16_F32, - CVT_PKNORM_I16_F32, - CVT_PKNORM_U16_F32, - CVT_PK_I16_I32, - CVT_PK_U16_U32, - - // Same as the standard node, except the high bits of the resulting integer - // are known 0. - FP_TO_FP16, - - /// This node is for VLIW targets and it is used to represent a vector - /// that is stored in consecutive registers with the same channel. - /// For example: - /// |X |Y|Z|W| - /// T0|v.x| | | | - /// T1|v.y| | | | - /// T2|v.z| | | | - /// T3|v.w| | | | - BUILD_VERTICAL_VECTOR, - /// Pointer to the start of the shader's constant data. 
- CONST_DATA_PTR, - PC_ADD_REL_OFFSET, - PC_ADD_REL_OFFSET64, - LDS, - - DUMMY_CHAIN, - - FIRST_MEMORY_OPCODE, - LOAD_D16_HI = FIRST_MEMORY_OPCODE, - LOAD_D16_LO, - LOAD_D16_HI_I8, - LOAD_D16_HI_U8, - LOAD_D16_LO_I8, - LOAD_D16_LO_U8, - - STORE_MSKOR, - TBUFFER_STORE_FORMAT, - TBUFFER_STORE_FORMAT_D16, - TBUFFER_LOAD_FORMAT, - TBUFFER_LOAD_FORMAT_D16, - DS_ORDERED_COUNT, - ATOMIC_CMP_SWAP, - BUFFER_LOAD, - BUFFER_LOAD_UBYTE, - BUFFER_LOAD_USHORT, - BUFFER_LOAD_BYTE, - BUFFER_LOAD_SHORT, - BUFFER_LOAD_TFE, - BUFFER_LOAD_UBYTE_TFE, - BUFFER_LOAD_USHORT_TFE, - BUFFER_LOAD_BYTE_TFE, - BUFFER_LOAD_SHORT_TFE, - BUFFER_LOAD_FORMAT, - BUFFER_LOAD_FORMAT_TFE, - BUFFER_LOAD_FORMAT_D16, - SBUFFER_LOAD, - SBUFFER_LOAD_BYTE, - SBUFFER_LOAD_UBYTE, - SBUFFER_LOAD_SHORT, - SBUFFER_LOAD_USHORT, - SBUFFER_PREFETCH_DATA, - BUFFER_STORE, - BUFFER_STORE_BYTE, - BUFFER_STORE_SHORT, - BUFFER_STORE_FORMAT, - BUFFER_STORE_FORMAT_D16, - BUFFER_ATOMIC_SWAP, - BUFFER_ATOMIC_ADD, - BUFFER_ATOMIC_SUB, - BUFFER_ATOMIC_SMIN, - BUFFER_ATOMIC_UMIN, - BUFFER_ATOMIC_SMAX, - BUFFER_ATOMIC_UMAX, - BUFFER_ATOMIC_AND, - BUFFER_ATOMIC_OR, - BUFFER_ATOMIC_XOR, - BUFFER_ATOMIC_INC, - BUFFER_ATOMIC_DEC, - BUFFER_ATOMIC_CMPSWAP, - BUFFER_ATOMIC_CSUB, - BUFFER_ATOMIC_FADD, - BUFFER_ATOMIC_FMIN, - BUFFER_ATOMIC_FMAX, - BUFFER_ATOMIC_COND_SUB_U32, - LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32, - - // Set up a whole wave function. - WHOLE_WAVE_SETUP, - - // Return from a whole wave function. - WHOLE_WAVE_RETURN, -}; - -} // End namespace AMDGPUISD - } // End namespace llvm #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 9a90787..d1b9fb4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -68,10 +68,12 @@ public: // Get the delay type for a MachineInstr. 
DelayType getDelayType(const MachineInstr &MI) { - if (SIInstrInfo::isTRANS(MI)) + // Non-F64 TRANS instructions use a separate delay type. + if (SIInstrInfo::isTRANS(MI) && + !AMDGPU::isDPMACCInstruction(MI.getOpcode())) return TRANS; // WMMA XDL ops are treated the same as TRANS. - if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI)) + if (ST->hasGFX1250Insts() && SII->isXDLWMMA(MI)) return TRANS; if (SIInstrInfo::isVALU(MI)) return VALU; @@ -221,7 +223,7 @@ public: }; // A map from regunits to the delay info for that regunit. - struct DelayState : DenseMap<unsigned, DelayInfo> { + struct DelayState : DenseMap<MCRegUnit, DelayInfo> { // Merge another DelayState into this one by merging the delay info for each // regunit. void merge(const DelayState &RHS) { @@ -325,6 +327,13 @@ public: for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), E = MachineBasicBlock::instr_iterator(MI); ++I != E;) { + if (I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { + // It is not deterministic whether the skip count counts + // S_SET_VGPR_MSB instructions or not, so do not include them in a + // skip region. + Skip = 6; + break; + } if (!I->isBundle() && !I->isMetaInstruction()) ++Skip; } @@ -359,7 +368,8 @@ public: bool Changed = false; MachineInstr *LastDelayAlu = nullptr; - MCRegUnit LastSGPRFromVALU = 0; + // FIXME: 0 is a valid register unit. + MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0); // Iterate over the contents of bundles, but don't emit any instructions // inside a bundle. for (auto &MI : MBB.instrs()) { @@ -379,7 +389,8 @@ public: if (It != State.end()) { DelayInfo Info = It->getSecond(); State.advanceByVALUNum(Info.VALUNum); - LastSGPRFromVALU = 0; + // FIXME: 0 is a valid register unit. 
+ LastSGPRFromVALU = static_cast<MCRegUnit>(0); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 4fe5d00..376184e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -35,7 +35,7 @@ struct AMDGPUImageDMaskIntrinsic { }; #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL -#include "InstCombineTables.inc" +#include "AMDGPUGenSearchableTables.inc" } // end anonymous namespace @@ -553,6 +553,89 @@ static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old, return NewCall; } +// Return true for sequences of instructions that effectively assign +// each lane to its thread ID +static bool isThreadID(const GCNSubtarget &ST, Value *V) { + // Case 1: + // wave32: mbcnt_lo(-1, 0) + // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0)) + auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_ConstantInt<-1>(), + m_ConstantInt<0>()); + auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>( + m_ConstantInt<-1>(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>( + m_ConstantInt<-1>(), m_ConstantInt<0>())); + if (ST.isWave32() && match(V, W32Pred)) + return true; + if (ST.isWave64() && match(V, W64Pred)) + return true; + + return false; +} + +// Attempt to capture situations where the index argument matches +// a DPP pattern, and convert to a DPP-based mov +static std::optional<Instruction *> +tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) { + Value *Val = II.getArgOperand(0); + Value *Idx = II.getArgOperand(1); + auto &B = IC.Builder; + + // DPP16 Row Share requires known wave size, architecture support + if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare()) + return std::nullopt; + + Value *Tid; + uint64_t Mask; + uint64_t RowIdx; + bool CanDPP16RowShare = false; + + // wave32 requires Mask & 0x1F == 0x10 + // wave64 requires Mask & 0x3F == 0x30 + uint64_t MaskCheck = (1UL << ST.getWavefrontSizeLog2()) - 1; + 
uint64_t MaskTarget = MaskCheck & 0xF0; + + // DPP16 Row Share 0: Idx = Tid & Mask + auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask)); + + // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx + auto RowSharePred = + m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx)); + + // DPP16 Row Share 15: Idx = Tid | 0xF + auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt<0xF>()); + + if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) { + if ((Mask & MaskCheck) != MaskTarget) + return std::nullopt; + + RowIdx = 0; + CanDPP16RowShare = true; + } else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) && RowIdx < 15 && + RowIdx > 0) { + if ((Mask & MaskCheck) != MaskTarget) + return std::nullopt; + + CanDPP16RowShare = true; + } else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid)) { + RowIdx = 15; + CanDPP16RowShare = true; + } + + if (CanDPP16RowShare) { + CallInst *UpdateDPP = + B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Val->getType(), + {PoisonValue::get(Val->getType()), Val, + B.getInt32(AMDGPU::DPP::ROW_SHARE0 | RowIdx), + B.getInt32(0xF), B.getInt32(0xF), B.getFalse()}); + UpdateDPP->takeName(&II); + UpdateDPP->copyMetadata(II); + return IC.replaceInstUsesWith(II, UpdateDPP); + } + + // No valid DPP detected + return std::nullopt; +} + Instruction * GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const { @@ -788,7 +871,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) Exp = 0; - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp)); + return IC.replaceInstUsesWith(II, + ConstantInt::getSigned(II.getType(), Exp)); } if (isa<PoisonValue>(Src)) @@ -1458,30 +1542,30 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment)) return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); - if 
(isa<UndefValue>(Src)) { - auto *QNaN = ConstantFP::get( - II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics())); - return IC.replaceInstUsesWith(II, QNaN); - } + if (isa<UndefValue>(Segment)) + return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType())); - const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src); - if (!Csrc) + if (II.isStrictFP()) break; - if (II.isStrictFP()) + const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src); + if (!CSrc && !isa<UndefValue>(Src)) break; - const APFloat &Fsrc = Csrc->getValueAPF(); - if (Fsrc.isNaN()) { - auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet()); - return IC.replaceInstUsesWith(II, Quieted); - } + // The instruction ignores special cases, and literally just extracts the + // exponents. Fold undef to nan, and index the table as normal. + APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt() + : APFloat::getQNaN(II.getType()->getFltSemantics()) + .bitcastToAPInt(); const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment); - if (!Cseg) + if (!Cseg) { + if (isa<UndefValue>(Src)) + return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType())); break; + } - unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff; + unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52); unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue(); unsigned Shift = SegmentVal * 53; if (Exponent > 1077) @@ -1737,6 +1821,33 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { NewII->takeName(&II); return IC.replaceInstUsesWith(II, NewII); } + case Intrinsic::amdgcn_tensor_load_to_lds: + case Intrinsic::amdgcn_tensor_store_from_lds: { + Value *D2 = II.getArgOperand(2); + Value *D3 = II.getArgOperand(3); + // We know that not passing the second and third tensor DMA groups is + // equivalent to passing zeroes for those registers, so we rewrite to the + // shorter form here. Undef or poison are replaced by 0. 
+ auto Pred = m_CombineOr(m_Zero(), m_Undef()); + if (!match(D2, Pred) || !match(D3, Pred)) + return std::nullopt; + + auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds + ? Intrinsic::amdgcn_tensor_load_to_lds_d2 + : Intrinsic::amdgcn_tensor_store_from_lds_d2; + CallInst *NewII = IC.Builder.CreateIntrinsic( + ShortIntrinsic, + {II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)}); + NewII->takeName(&II); + NewII->copyMetadata(II); + return IC.eraseInstFromFunction(II); + } + case Intrinsic::amdgcn_wave_shuffle: { + if (!ST->hasDPP()) + return std::nullopt; + + return tryWaveShuffleDPP(*ST, IC, II); + } } if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 5085e86..2b1f404 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -29,11 +29,19 @@ Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) { // TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) { const Value *Ptr = MMO->getValue(); + if (!Ptr) { + if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) { + return PSV->isConstantPool() || PSV->isStack() || PSV->isGOT() || + PSV->isJumpTable(); + } + + // Unknown value. + return false; + } + // UndefValue means this is a load of a kernel input. These are uniform. // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. 
- if (!Ptr || isa<UndefValue, Constant, GlobalValue>(Ptr)) + if (isa<UndefValue, Constant, GlobalValue>(Ptr)) return true; if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 0eb00cb..529da8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); struct ImageDimIntrinsicInfo { unsigned Intr; unsigned BaseOpcode; + unsigned AtomicNoRetBaseOpcode; MIMGDim Dim; uint8_t NumOffsetArgs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b8fa6f3..8dc5d45 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -62,6 +62,7 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2, // AMDGPU DAG Nodes // +// Masked control flow nodes. def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; @@ -114,6 +115,7 @@ def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPOptInGlue] >; +// Pointer to the start of the shader's constant data. def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> @@ -122,18 +124,21 @@ def AMDGPUconstdata_ptr : SDNode< // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; +// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. +// Denormals handled on some parts. 
def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + // out = a - floor(a) def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; // out = 1.0 / a def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; -// v_log_f32, which is log2 +// v_log_f32, which is log2, no denormal handling for f32. def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>; -// v_exp_f32, which is exp2 +// v_exp_f32, which is exp2, no denormal handling for f32. def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) @@ -146,11 +151,16 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; +// Convert two float 32 numbers into a single register holding two packed f16 +// with round to zero. def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; + +// Same as the standard node, except the high bits of the resulting integer +// are known 0. def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; @@ -225,14 +235,18 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; +// This is SETCC with the full mask result which is used for a compare with a +// result bit per item in the wavefront. def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; +// FP ops with input and output chain. 
def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +// These cvt_f32_ubyte* nodes need to remain consecutive and in order. def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", @@ -264,6 +278,8 @@ def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp, // Denominator, src2 = Numerator). def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; +// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is +// treated as an illegal operation. def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>; def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", @@ -290,14 +306,23 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP", [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// Extract range of bits with zero extension to 32-bits. def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; + +// Extract range of bits with sign extension to 32-bits. def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; + +// (src0 & src1) | (~src0 & src2) def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; + +// Insert a range of bits into a 32-bit word. def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +// ctlz with -1 if input is zero. def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>; def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>; +// cttz with -1 if input is zero. def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>; // Signed and unsigned 24-bit multiply. 
The highest 8-bits are ignore @@ -377,6 +402,15 @@ def AMDGPUExportOp : SDTypeProfile<0, 8, [ ]>; +def AMDGPUflat_load_monitor : SDNode< + "AMDGPUISD::FLAT_LOAD_MONITOR", SDTLoad, + [SDNPHasChain, SDNPMemOperand] +>; + +def AMDGPUglobal_load_monitor : SDNode< + "AMDGPUISD::GLOBAL_LOAD_MONITOR", SDTLoad, + [SDNPHasChain, SDNPMemOperand] +>; //===----------------------------------------------------------------------===// // Flow Control Profile Types @@ -394,16 +428,24 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai //===----------------------------------------------------------------------===// // Call/Return DAG Nodes //===----------------------------------------------------------------------===// + +// A uniform kernel return that terminates the wavefront. def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; + +// s_endpgm, but we may want to insert it in the middle of the block. def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone, [SDNPHasChain]>; + +// "s_trap 2" equivalent on hardware that does not support it. def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone, [SDNPHasChain]>; +// Return to a shader part's epilog code. def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// Return with values from a non-entry function. 
def AMDGPUret_glue : SDNode<"AMDGPUISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 12915c73..82783dc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -221,14 +227,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); + Register VCCReg = I.getOperand(1).getReg(); + MachineInstr *Cmp; + + // Set SCC as a side effect with S_CMP or S_OR. + if (STI.hasScalarCompareEq64()) { + unsigned CmpOpc = + STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; + Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); + } else { + Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) + .addReg(VCCReg) + .addReg(VCCReg); + } - unsigned CmpOpc = - STI.isWave64() ? 
AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; - MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) - .addReg(I.getOperand(1).getReg()) - .addImm(0); - if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); Register DstReg = I.getOperand(0).getReg(); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC); @@ -269,7 +283,8 @@ bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const { .addImm(0); I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const { @@ -283,7 +298,8 @@ bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const { .addReg(SrcReg); I.eraseFromParent(); - return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI); + constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { @@ -350,7 +366,7 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); Register Reg = MO.getReg(); BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) - .addReg(Reg, 0, ComposedSubIdx); + .addReg(Reg, {}, ComposedSubIdx); return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), MO.isKill(), MO.isDead(), MO.isUndef(), @@ -400,10 +416,11 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { // Dead implicit-def of scc I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef - true, // isImp - false, // isKill - true)); // isDead - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + true, // isImp + false, // isKill + true)); // isDead + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; } bool 
AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { @@ -429,15 +446,17 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(I.getOperand(2)) .setOperandDead(3); // Dead scc I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + return true; } - if (STI.hasAddNoCarry()) { + if (STI.hasAddNoCarryInsts()) { const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; } const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64; @@ -450,7 +469,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(I.getOperand(2)) .addImm(0); I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + return true; } assert(!Sub && "illegal sub should not reach here"); @@ -491,8 +511,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .addReg(CarryReg, RegState::Kill) .addImm(0); - if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI); } BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) @@ -528,7 +547,8 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( I.setDesc(TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc)); I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); I.addOperand(*MF, MachineOperand::CreateImm(0)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; } Register Src0Reg = I.getOperand(2).getReg(); @@ -593,7 +613,9 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + I.getOperand(0).setIsEarlyClobber(true); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; } // TODO: We should probably legalize these to only using 32-bit results. @@ -636,7 +658,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { *SrcRC, I.getOperand(1)); const DebugLoc &DL = I.getDebugLoc(); BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) - .addReg(SrcReg, 0, SubReg); + .addReg(SrcReg, {}, SubReg); I.eraseFromParent(); return true; @@ -709,7 +731,7 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { for (int I = 0, E = NumDst; I != E; ++I) { MachineOperand &Dst = MI.getOperand(I); BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) - .addReg(SrcReg, 0, SubRegs[I]); + .addReg(SrcReg, {}, SubRegs[I]); // Make sure the subregister index is valid for the source register. 
SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]); @@ -809,15 +831,13 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg) .addImm(0xFFFF) .addReg(Src0); - if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst) .addReg(Src1) .addImm(16) .addReg(TmpReg); - if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); MI.eraseFromParent(); return true; @@ -863,7 +883,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { .setOperandDead(3); // Dead scc MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } if (STI.hasSPackHL()) { Opc = AMDGPU::S_PACK_HL_B32_B16; @@ -872,7 +893,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { } MI.setDesc(TII.get(Opc)); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { @@ -970,7 +992,8 @@ bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const { .addReg(OffsetReg) .addReg(WidthReg); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { @@ -1072,7 +1095,8 @@ bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const { MIB.addReg(VDstIn); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return 
true; } // We need to handle this here because tablegen doesn't support matching @@ -1113,7 +1137,8 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { .addImm(0); // $omod MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { @@ -1200,6 +1225,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { case Intrinsic::amdgcn_permlane16_swap: case Intrinsic::amdgcn_permlane32_swap: return selectPermlaneSwapIntrin(I, IntrinsicID); + case Intrinsic::amdgcn_wave_shuffle: + return selectWaveShuffleIntrin(I); default: return selectImpl(I, *CoverageInfo); } @@ -1468,8 +1495,8 @@ bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const { .add(I.getOperand(3)); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) .addReg(AMDGPU::SCC); + constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); bool Ret = - constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); I.eraseFromParent(); return Ret; @@ -1499,9 +1526,9 @@ bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const { RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(), *MRI); - bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); + constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); I.eraseFromParent(); - return Ret; + return true; } bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { @@ -1555,8 +1582,7 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { SelectedMI.addImm(0); // op_sel RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI); - if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI); 
I.eraseFromParent(); return true; @@ -1642,8 +1668,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { .addReg(SrcReg) .addReg(TRI.getExec()) .setOperandDead(3); // Dead scc - if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*And, TII, TRI, RBI); } } @@ -1710,7 +1735,8 @@ bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { } I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { @@ -1834,9 +1860,9 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) return false; - bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); + constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); MI.eraseFromParent(); - return Ret; + return true; } static unsigned gwsIntrinToOpcode(unsigned IntrID) { @@ -1930,20 +1956,52 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, // The resource id offset is computed as (<isa opaque base> + M0[21:16] + // offset field) % 64. Some versions of the programming guide omit the m0 // part, or claim it's from offset 0. 
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); + + unsigned Opc = gwsIntrinToOpcode(IID); + const MCInstrDesc &InstrDesc = TII.get(Opc); if (HasVSrc) { Register VSrc = MI.getOperand(1).getReg(); - MIB.addReg(VSrc); - if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) - return false; - } + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx); + const TargetRegisterClass *SubRC = + TRI.getSubRegisterClass(DataRC, AMDGPU::sub0); - MIB.addImm(ImmOffset) - .cloneMemRefs(MI); + if (!SubRC) { + // 32-bit normal case. + if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI)) + return false; - TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); + BuildMI(*MBB, &MI, DL, InstrDesc) + .addReg(VSrc) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } else { + // Requires even register alignment, so create 64-bit value and pad the + // top half with undef. + Register DataReg = MRI->createVirtualRegister(DataRC); + if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI)) + return false; + + Register UndefReg = MRI->createVirtualRegister(SubRC); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg) + .addReg(VSrc) + .addImm(AMDGPU::sub0) + .addReg(UndefReg) + .addImm(AMDGPU::sub1); + + BuildMI(*MBB, &MI, DL, InstrDesc) + .addReg(DataReg) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } + } else { + BuildMI(*MBB, &MI, DL, InstrDesc) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } MI.eraseFromParent(); return true; @@ -1978,11 +2036,12 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, .addImm(IsGDS ? 
-1 : 0) .cloneMemRefs(MI); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const { - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>(); MFInfo->setInitWholeWave(); @@ -2006,19 +2065,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned IntrOpcode = Intr->BaseOpcode; + + // For image atomic: use no-return opcode if result is unused. + if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) { + Register ResultDef = MI.getOperand(0).getReg(); + if (MRI->use_nodbg_empty(ResultDef)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; + } const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; - Register VDataIn, VDataOut; + Register VDataIn = AMDGPU::NoRegister; + Register VDataOut = AMDGPU::NoRegister; LLT VDataTy; int NumVDataDwords = -1; bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || @@ -2049,7 +2116,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { - VDataOut = MI.getOperand(0).getReg(); + if (!BaseOpcode->NoReturn) + VDataOut = MI.getOperand(0).getReg(); VDataIn = MI.getOperand(2).getReg(); LLT Ty = MRI->getType(VDataIn); @@ -2099,8 +2167,9 @@ 
bool AMDGPUInstructionSelector::selectImageIntrinsic( assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return false; @@ -2280,7 +2349,8 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( .cloneMemRefs(MI); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( @@ -2306,17 +2376,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_init_whole_wave: return selectInitWholeWave(I); case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_buffer_load_async_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_async_lds: case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: return selectBufferLoadLds(I); // Until we can store both the address space of the global and the LDS // arguments by having tto MachineMemOperands on an intrinsic, we just trust // that the argument is a global pointer (buffer pointers have been handled by // a LLVM IR-level lowering). 
case Intrinsic::amdgcn_load_to_lds: + case Intrinsic::amdgcn_load_async_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: return selectGlobalLoadLds(I); + case Intrinsic::amdgcn_asyncmark: + case Intrinsic::amdgcn_wait_asyncmark: + // FIXME: Not supported on GFX12 yet. Will need a new feature when we do. + if (!Subtarget->hasVMemToLDSLoad()) + return false; + break; case Intrinsic::amdgcn_exp_compr: if (!STI.hasCompressedExport()) { Function &F = I.getMF()->getFunction(); @@ -2331,9 +2413,35 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_alloc_vgpr: { + // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets + // SCC. We then need to COPY it into the result vreg. + MachineBasicBlock *MBB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + + Register ResReg = I.getOperand(0).getReg(); + + MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR)) + .add(I.getOperand(2)); + (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg) + .addReg(AMDGPU::SCC); + I.eraseFromParent(); + constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI); + return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI); + } case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); + case Intrinsic::amdgcn_s_wakeup_barrier: { + if (!STI.hasSWakeupBarrier()) { + Function &F = I.getMF()->getFunction(); + F.getContext().diagnose( + DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget", + I.getDebugLoc(), DS_Error)); + return false; + } + return selectNamedBarrierInst(I, IntrinsicID); + } case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_get_named_barrier_state: return selectNamedBarrierInst(I, 
IntrinsicID); @@ -2372,11 +2480,10 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { .add(I.getOperand(2)) .add(I.getOperand(3)); - bool Ret = false; - Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); - Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); I.eraseFromParent(); - return Ret; + return true; } // Wide VGPR select should have been split in RegBankSelect. @@ -2391,9 +2498,9 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { .add(I.getOperand(2)) .add(I.getOperand(1)); - bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); I.eraseFromParent(); - return Ret; + return true; } bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { @@ -2438,7 +2545,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *MBB = I.getParent(); BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg) - .addReg(SrcReg, 0, AMDGPU::lo16); + .addReg(SrcReg, {}, AMDGPU::lo16); I.eraseFromParent(); return true; } @@ -2450,9 +2557,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { Register LoReg = MRI->createVirtualRegister(DstRC); Register HiReg = MRI->createVirtualRegister(DstRC); BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) - .addReg(SrcReg, 0, AMDGPU::sub0); + .addReg(SrcReg, {}, AMDGPU::sub0); BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) - .addReg(SrcReg, 0, AMDGPU::sub1); + .addReg(SrcReg, {}, AMDGPU::sub1); if (IsVALU && STI.hasSDWA()) { // Write the low 16-bits of the high element into the high 16-bits of the @@ -2609,7 +2716,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(Mask) .addReg(SrcReg); I.eraseFromParent(); - return 
constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + return true; } const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; @@ -2619,7 +2727,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(0) // Offset .addImm(SrcSize); // Width I.eraseFromParent(); - return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + return true; } if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { @@ -2644,18 +2753,18 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; if (Signed) { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) - .addReg(SrcReg, 0, SubReg) - .addImm(31) - .setOperandDead(3); // Dead scc + .addReg(SrcReg, {}, SubReg) + .addImm(31) + .setOperandDead(3); // Dead scc } else { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) .addImm(0); } BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(SrcReg, 0, SubReg) - .addImm(AMDGPU::sub0) - .addReg(HiReg) - .addImm(AMDGPU::sub1); + .addReg(SrcReg, {}, SubReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); I.eraseFromParent(); return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); @@ -2673,10 +2782,10 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) - .addReg(SrcReg, 0, SubReg) - .addImm(AMDGPU::sub0) - .addReg(UndefReg) - .addImm(AMDGPU::sub1); + .addReg(SrcReg, {}, SubReg) + .addImm(AMDGPU::sub0) + .addReg(UndefReg) + .addImm(AMDGPU::sub1); BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) .addReg(ExtReg) @@ -2810,9 +2919,9 @@ bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { Register OpReg = 
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) - .addReg(Src, 0, AMDGPU::sub0); + .addReg(Src, {}, AMDGPU::sub0); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) - .addReg(Src, 0, AMDGPU::sub1); + .addReg(Src, {}, AMDGPU::sub1); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) .addImm(0x80000000); @@ -2852,9 +2961,9 @@ bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { return false; BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) - .addReg(Src, 0, AMDGPU::sub0); + .addReg(Src, {}, AMDGPU::sub0); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) - .addReg(Src, 0, AMDGPU::sub1); + .addReg(Src, {}, AMDGPU::sub1); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) .addImm(0x7fffffff); @@ -3093,7 +3202,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { .addReg(MaskReg) .setOperandDead(3); // Dead scc I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; @@ -3129,9 +3239,9 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { // Extract the subregisters from the source pointer. 
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) - .addReg(SrcReg, 0, AMDGPU::sub0); + .addReg(SrcReg, {}, AMDGPU::sub0); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) - .addReg(SrcReg, 0, AMDGPU::sub1); + .addReg(SrcReg, {}, AMDGPU::sub1); Register MaskedLo, MaskedHi; @@ -3144,7 +3254,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { MaskedLo = MRI->createVirtualRegister(&RegRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) - .addReg(MaskReg, 0, AMDGPU::sub0); + .addReg(MaskReg, {}, AMDGPU::sub0); BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) .addReg(LoReg) .addReg(MaskLo); @@ -3158,7 +3268,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { MaskedHi = MRI->createVirtualRegister(&RegRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) - .addReg(MaskReg, 0, AMDGPU::sub1); + .addReg(MaskReg, {}, AMDGPU::sub1); BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) .addReg(HiReg) .addReg(MaskHi); @@ -3246,8 +3356,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) - .addReg(SrcReg, 0, SubReg) - .addReg(SrcReg, RegState::Implicit); + .addReg(SrcReg, {}, SubReg) + .addReg(SrcReg, RegState::Implicit); MI.eraseFromParent(); return true; } @@ -3259,8 +3369,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) .addReg(IdxReg); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) - .addReg(SrcReg, 0, SubReg) - .addReg(SrcReg, RegState::Implicit); + .addReg(SrcReg, {}, SubReg) + .addReg(SrcReg, RegState::Implicit); MI.eraseFromParent(); return true; } @@ -3350,11 +3460,25 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( return true; } +static bool isAsyncLDSDMA(Intrinsic::ID Intr) { + switch (Intr) { + case Intrinsic::amdgcn_raw_buffer_load_async_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_async_lds: + return true; + } + return false; +} + bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { if (!Subtarget->hasVMemToLDSLoad()) return false; unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); + Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); // The struct intrinsic variants add one additional operand over raw. const bool HasVIndex = MI.getNumOperands() == 9; @@ -3444,12 +3568,17 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12) ? 1 : 0); // swz + MIB.addImm(isAsyncLDSDMA(IntrinsicID)); MachineMemOperand *LoadMMO = *MI.memoperands_begin(); + // Don't set the offset value here because the pointer points to the base of + // the buffer. 
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); - LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); + MachinePointerInfo StorePtrI = LoadPtrI; - StorePtrI.V = nullptr; + LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(), + AMDGPUAS::BUFFER_RESOURCE)); + LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & @@ -3464,7 +3593,8 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { MIB.setMemRefs({LoadMMO, StoreMMO}); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } /// Match a zero extend from a 32-bit value to 64-bits. @@ -3561,6 +3691,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); + Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (Size) { default: @@ -3627,13 +3758,18 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ if (isSGPR(Addr)) MIB.addReg(VOffset); - MIB.add(MI.getOperand(4)) // offset - .add(MI.getOperand(5)); // cpol + MIB.add(MI.getOperand(4)); // offset + + unsigned Aux = MI.getOperand(5).getImm(); + MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol + MIB.addImm(isAsyncLDSDMA(IntrinsicID)); MachineMemOperand *LoadMMO = *MI.memoperands_begin(); MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); LoadPtrI.Offset = MI.getOperand(4).getImm(); MachinePointerInfo StorePtrI = LoadPtrI; + LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(), + AMDGPUAS::GLOBAL_ADDRESS)); LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & @@ -3647,7 +3783,8 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ MIB.setMemRefs({LoadMMO, StoreMMO}); 
MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic( @@ -3656,8 +3793,9 @@ bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic( MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3; MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm())); MI.removeOperand(OpcodeOpIdx); - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + MI.addImplicitDefUseOperands(*MI.getMF()); + constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + return true; } // FIXME: This should be removed and let the patterns select. We just need the @@ -3759,7 +3897,11 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(4); // VDst_In MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + MI.addImplicitDefUseOperands(*MI.getMF()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } @@ -3783,7 +3925,8 @@ bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin( MachineOperand &FI = MI.getOperand(4); FI.setImm(FI.getImm() ? 
AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { @@ -3814,6 +3957,133 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { return true; } +bool AMDGPUInstructionSelector::selectWaveShuffleIntrin( + MachineInstr &MI) const { + assert(MI.getNumOperands() == 4); + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + Register DstReg = MI.getOperand(0).getReg(); + Register ValReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + + const LLT DstTy = MRI->getType(DstReg); + unsigned DstSize = DstTy.getSizeInBits(); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstRB); + + if (DstTy != LLT::scalar(32)) + return false; + + if (!Subtarget->supportsBPermute()) + return false; + + // If we can bpermute across the whole wave, then just do that + if (Subtarget->supportsWaveWideBPermute()) { + Register ShiftIdxReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg) + .addImm(2) + .addReg(IdxReg); + + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg) + .addReg(ShiftIdxReg) + .addReg(ValReg) + .addImm(0); + } else { + // Otherwise, we need to make use of whole wave mode + assert(Subtarget->isWave64()); + + // Set inactive lanes to poison + Register UndefValReg = + MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID)); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg); + + Register UndefExecReg = MRI->createVirtualRegister( + TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID)); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg); + + Register PoisonValReg = 
MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg) + .addImm(0) + .addReg(ValReg) + .addImm(0) + .addReg(UndefValReg) + .addReg(UndefExecReg); + + // ds_bpermute requires index to be multiplied by 4 + Register ShiftIdxReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg) + .addImm(2) + .addReg(IdxReg); + + Register PoisonIdxReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg) + .addImm(0) + .addReg(ShiftIdxReg) + .addImm(0) + .addReg(UndefValReg) + .addReg(UndefExecReg); + + // Get permutation of each half, then we'll select which one to use + Register SameSidePermReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg) + .addReg(PoisonIdxReg) + .addReg(PoisonValReg) + .addImm(0); + + Register SwappedValReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg) + .addReg(PoisonValReg); + + Register OppSidePermReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg) + .addReg(PoisonIdxReg) + .addReg(SwappedValReg) + .addImm(0); + + Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg) + .addReg(OppSidePermReg); + + // Select which side to take the permute from + // We can get away with only using mbcnt_lo here since we're only + // trying to detect which side of 32 each lane is on, and mbcnt_lo + // returns 32 for lanes 32-63. 
+ Register ThreadIDReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg) + .addImm(-1) + .addImm(0); + + Register XORReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg) + .addReg(ThreadIDReg) + .addReg(PoisonIdxReg); + + Register ANDReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg) + .addReg(XORReg) + .addImm(32); + + Register CompareReg = MRI->createVirtualRegister( + TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID)); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg) + .addReg(ANDReg) + .addImm(0); + + // Finally do the selection + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) + .addReg(WWMSwapPermReg) + .addImm(0) + .addReg(SameSidePermReg) + .addReg(CompareReg); + } + + MI.eraseFromParent(); + return true; +} + // Match BITOP3 operation and return a number of matched instructions plus // truth table. 
static std::pair<unsigned, uint8_t> BitOp3_Op(Register R, @@ -3891,7 +4161,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(Register R, SmallVector<Register, 3> Backup(Src.begin(), Src.end()); if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) { - Src = Backup; + Src = std::move(Backup); return std::make_pair(0, 0); } @@ -4131,6 +4401,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_UMAX: case TargetOpcode::G_ATOMICRMW_UINC_WRAP: case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: + case TargetOpcode::G_ATOMICRMW_USUB_COND: + case TargetOpcode::G_ATOMICRMW_USUB_SAT: case TargetOpcode::G_ATOMICRMW_FADD: case TargetOpcode::G_ATOMICRMW_FMIN: case TargetOpcode::G_ATOMICRMW_FMAX: @@ -6726,7 +6998,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6760,6 +7032,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { llvm_unreachable("not a named barrier op"); case Intrinsic::amdgcn_s_barrier_join: return AMDGPU::S_BARRIER_JOIN_IMM; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_IMM; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_IMM; }; @@ -6769,6 +7043,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { llvm_unreachable("not a named barrier op"); case Intrinsic::amdgcn_s_barrier_join: return AMDGPU::S_BARRIER_JOIN_M0; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_M0; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_M0; }; @@ -6779,8 +7055,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID 
IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c760fe7..627cce2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -156,6 +156,7 @@ private: bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierLeave(MachineInstr &I) const; + bool selectWaveShuffleIntrin(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index bd443b5..f77b4c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op { } } -defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; defm int_amdgcn_global_atomic_fmax_num : noret_op; -defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op; multiclass noret_binary_atomic_op<SDNode atomic_op> { let HasNoUse = true in @@ -695,6 +691,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>; defm 
atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>; defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>; defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>; +defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>; +defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>; defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>; def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, @@ -745,23 +743,14 @@ int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; int FP64_NEG_ONE = 0xbff0000000000000; +int BF16_ONE = 0x3F80; +int BF16_NEG_ONE = 0xBF80; } def CONST : Constants; -def FP_ZERO : PatLeaf < - (fpimm), - [{return N->getValueAPF().isZero();}] ->; - -def FP_ONE : PatLeaf < - (fpimm), - [{return N->isExactlyValue(1.0);}] ->; - -def FP_HALF : PatLeaf < - (fpimm), - [{return N->isExactlyValue(0.5);}] ->; +def fpimm_zero : FPImmLeaf<fAny, [{ return Imm.isZero(); }]> ; +def fpimm_one : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+1.0); }]> ; +def fpimm_half : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+0.5); }]> ; /* Generic helper patterns for intrinsics */ /* -------------------------------------- */ @@ -806,24 +795,17 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat < (vt rc:$addr) >; -// rotr pattern -class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat < - (rotr i32:$src0, i32:$src1), - (BIT_ALIGN $src0, $src0, $src1) ->; - // Special conversion patterns -def cvt_rpi_i32_f32 : PatFrag < +let GIIgnoreCopies = 1 in +def cvt_rpi_i32_f32 : PatFrag< (ops node:$src), - (fp_to_sint (ffloor (fadd $src, FP_HALF))), - [{ (void) N; return TM.Options.NoNaNsFPMath; }] ->; + (fp_to_sint (ffloor_nnan (fadd $src, fpimm_half))) +>, GISelFlags; -def cvt_flr_i32_f32 : PatFrag < +def cvt_flr_i32_f32 : PatFrag< (ops node:$src), - (fp_to_sint (ffloor $src)), - [{ (void)N; return 
TM.Options.NoNaNsFPMath; }] + (fp_to_sint (ffloor_nnan $src)) >; let AddedComplexity = 2 in { @@ -841,7 +823,7 @@ class UMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat < } // AddedComplexity. class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat < - (fdiv FP_ONE, vt:$src), + (fdiv fpimm_one, vt:$src), (RcpInst $src) >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h index df80196..95d88c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h @@ -31,10 +31,12 @@ public: const unsigned AndSaveExecTermOpc; const unsigned BfmOpc; const unsigned CMovOpc; + const unsigned CmpLGOp; const unsigned CSelectOpc; const unsigned MovOpc; const unsigned MovTermOpc; const unsigned OrOpc; + const unsigned OrN2Op; const unsigned OrTermOpc; const unsigned OrSaveExecOpc; const unsigned XorOpc; @@ -57,10 +59,12 @@ public: : AMDGPU::S_AND_SAVEEXEC_B64_term), BfmOpc(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), CMovOpc(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + CmpLGOp(IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64), CSelectOpc(IsWave32 ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), MovOpc(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), MovTermOpc(IsWave32 ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term), OrOpc(IsWave32 ? AMDGPU::S_OR_B32 : AMDGPU::S_OR_B64), + OrN2Op(IsWave32 ? AMDGPU::S_ORN2_B32 : AMDGPU::S_ORN2_B64), OrTermOpc(IsWave32 ? AMDGPU::S_OR_B32_term : AMDGPU::S_OR_B64_term), OrSaveExecOpc(IsWave32 ? 
AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64), diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 7504f1a..63e2656 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -126,7 +126,38 @@ public: return LK.first != TargetLoweringBase::TypeLegal; } - bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); } + bool isOpLegal(const Instruction *I) { + if (isa<IntrinsicInst>(I)) + return true; + + // Any store is a profitable sink (prevents flip-flopping) + if (isa<StoreInst>(I)) + return true; + + if (auto *BO = dyn_cast<BinaryOperator>(I)) { + if (auto *VT = dyn_cast<FixedVectorType>(BO->getType())) { + if (const auto *IT = dyn_cast<IntegerType>(VT->getElementType())) { + unsigned EB = IT->getBitWidth(); + unsigned EC = VT->getNumElements(); + // Check for SDWA-compatible operation + if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) { + switch (BO->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return true; + default: + break; + } + } + } + } + } + + return false; + } bool isCoercionProfitable(Instruction *II) { SmallPtrSet<Instruction *, 4> CVisited; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 596a895..5a993a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -30,6 +30,8 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -411,7 +413,7 @@ static unsigned 
maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, switch (AS) { case AMDGPUAS::PRIVATE_ADDRESS: // FIXME: Private element size. - return ST.enableFlatScratch() ? 128 : 32; + return ST.hasFlatScratchEnabled() ? 128 : 32; case AMDGPUAS::LOCAL_ADDRESS: return ST.useDS128() ? 128 : 64; case AMDGPUAS::GLOBAL_ADDRESS: @@ -750,7 +752,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .scalarize(0); - if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { + if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) { // Full set of gfx9 features. if (ST.hasScalarAddSub64()) { getActionDefinitionsBuilder({G_ADD, G_SUB}) @@ -976,9 +978,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FPOpActions.clampMaxNumElementsStrict(0, S32, 2); } + auto &MinNumMaxNumIeee = + getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); + + if (ST.hasVOP3PInsts()) { + MinNumMaxNumIeee.legalFor(FPTypesPK16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampMaxNumElements(0, S16, 2) + .clampScalar(0, S16, S64) + .scalarize(0); + } else if (ST.has16BitInsts()) { + MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0); + } else { + MinNumMaxNumIeee.legalFor(FPTypesBase) + .clampScalar(0, S32, S64) + .scalarize(0); + } + auto &MinNumMaxNum = getActionDefinitionsBuilder( - {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE, - G_FMAXNUM_IEEE}); + {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM}); if (ST.hasVOP3PInsts()) { MinNumMaxNum.customFor(FPTypesPK16) @@ -1039,6 +1057,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) .scalarize(0) .lower(); + + getActionDefinitionsBuilder(G_FMODF) + .lowerFor({S16, S32, S64}) + .scalarize(0) + .lower(); } else { getActionDefinitionsBuilder(G_FSQRT) .customFor({S32, S64, S16}) @@ -1072,6 
+1095,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .minScalar(0, S32) .clampScalar(1, S32, S32) .lower(); + + getActionDefinitionsBuilder(G_FMODF) + .lowerFor({S32, S64}) + .scalarize(0) + .lower(); } auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC); @@ -1171,6 +1199,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); + // clang-format off + auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT}) + .legalFor({{S32, S32}, {S32, S64}}) + .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); + FPToISat.minScalar(1, S32); + FPToISat.minScalar(0, S32) + .widenScalarToNextPow2(0, 32) + .scalarize(0) + .lower(); + // clang-format on + getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) .clampScalar(0, S16, S64) .scalarize(0) @@ -1705,6 +1744,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } + auto &Atomics32 = + getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT}) + .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}}); + if (ST.hasFlatAddressSpace()) { + Atomics32.legalFor({{S32, FlatPtr}}); + } + // TODO: v2bf16 operations, and fat buffer pointer support. 
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); if (ST.hasLDSFPAtomicAddF32()) { @@ -2136,9 +2182,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor(FPTypesPK16) .clampMaxNumElements(0, S16, 2) .scalarize(0); + } else if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .lowerFor({V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .lower(); } else { - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .scalarize(0) + .clampScalar(0, S32, S64) + .lower(); } getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) @@ -2195,8 +2249,6 @@ bool AMDGPULegalizerInfo::legalizeCustom( case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINIMUMNUM: case TargetOpcode::G_FMAXIMUMNUM: - case TargetOpcode::G_FMINNUM_IEEE: - case TargetOpcode::G_FMAXNUM_IEEE: return legalizeMinNumMaxNum(Helper, MI); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return legalizeExtractVectorElt(MI, MRI, B); @@ -2299,14 +2351,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return B.buildUnmerge(S32, Dst).getReg(1); } - // TODO: can we be smarter about machine pointer info? - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); Register LoadAddr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); // For code object version 5, private_base and shared_base are passed through // implicit kernargs. if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= AMDGPU::AMDHSA_COV5) { + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF()); + AMDGPUTargetLowering::ImplicitParameter Param = AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE : AMDGPUTargetLowering::PRIVATE_BASE; @@ -2321,7 +2373,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return Register(); MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, + PtrInfo.getWithOffset(Offset), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), Offset)); @@ -2339,6 +2391,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return Register(); + // TODO: Use custom PseudoSourceValue + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; @@ -2538,8 +2593,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); uint32_t AddrHiVal = Info->get32BitAddressHighBits(); auto PtrLo = B.buildPtrToInt(S32, Src); - auto HighAddr = B.buildConstant(S32, AddrHiVal); - B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); + if (AddrHiVal == 0) { + auto Zext = B.buildZExt(LLT::scalar(64), PtrLo); + B.buildIntToPtr(Dst, Zext); + } else { + auto HighAddr = B.buildConstant(S32, AddrHiVal); + B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); + } + MI.eraseFromParent(); return true; } @@ -2817,23 +2878,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineFunction &MF = Helper.MIRBuilder.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || - MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; - - // With ieee_mode disabled, the instructions have the correct behavior - // already for G_FMINIMUMNUM/G_FMAXIMUMNUM. - // - // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode - // enabled. 
- if (!MFI->getMode().IEEE) { - if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM || - MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM) - return true; - - return !IsIEEEOp; - } - - if (IsIEEEOp) + // With ieee_mode disabled, the instructions have the correct behavior. + if (!MFI->getMode().IEEE) return true; return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; @@ -3145,16 +3191,16 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( return true; // Leave in place; } + const GlobalVariable &GVar = *cast<GlobalVariable>(GV); if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { - Type *Ty = GV->getValueType(); // HIP uses an unsized array `extern __shared__ T s[]` or similar // zero-sized type in other languages to declare the dynamic shared // memory which size is not known at the compile time. They will be // allocated by the runtime and placed directly after the static // allocated ones. They all share the same offset. - if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { + if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) { // Adjust alignment for that dynamic shared memory array. 
- MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); + MFI->setDynLDSAlign(MF.getFunction(), GVar); LLT S32 = LLT::scalar(32); auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}); B.buildIntToPtr(DstReg, Sz); @@ -3163,8 +3209,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( } } - B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), - *cast<GlobalVariable>(GV))); + B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar)); MI.eraseFromParent(); return true; } @@ -3383,6 +3428,10 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, case TargetOpcode::G_INTRINSIC: { switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) { case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_log_clamp: + case Intrinsic::amdgcn_exp2: + case Intrinsic::amdgcn_sqrt: return true; default: break; @@ -3390,6 +3439,8 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, break; } + case TargetOpcode::G_FSQRT: + return true; case TargetOpcode::G_FFREXP: { if (DefMI->getOperand(0).getReg() == Src) return true; @@ -3503,14 +3554,10 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, Register X = MI.getOperand(1).getReg(); unsigned Flags = MI.getFlags(); const LLT Ty = MRI.getType(X); - MachineFunction &MF = B.getMF(); const LLT F32 = LLT::scalar(32); const LLT F16 = LLT::scalar(16); - const AMDGPUTargetMachine &TM = - static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); - if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) { if (Ty == F16 && !ST.has16BitInsts()) { Register LogVal = MRI.createGenericVirtualRegister(F32); @@ -3544,12 +3591,14 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); auto CC = B.buildFConstant(Ty, IsLog10 ? 
cc_log10 : cc_log); - - R = B.buildFMul(Ty, Y, C, Flags).getReg(0); - auto NegR = B.buildFNeg(Ty, R, Flags); - auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); - auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); - R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. + auto NewFlags = Flags & ~(MachineInstr::FmContract); + R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0); + auto NegR = B.buildFNeg(Ty, R, NewFlags); + auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags); + auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags); + R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0); } else { // ch+ct is ln(2)/ln(10) to more than 36 bits const float ch_log10 = 0x1.344000p-2f; @@ -3565,17 +3614,19 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto MaskConst = B.buildConstant(Ty, 0xfffff000); auto YH = B.buildAnd(Ty, Y, MaskConst); auto YT = B.buildFSub(Ty, Y, YH, Flags); - auto YTCT = B.buildFMul(Ty, YT, CT, Flags); + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. 
+ auto NewFlags = Flags & ~(MachineInstr::FmContract); + auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags); Register Mad0 = - getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); - Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); - R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); + getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags); + Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags); + R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags); } const bool IsFiniteOnly = - (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && - MI.getFlag(MachineInstr::FmNoInfs); + MI.getFlag(MachineInstr::FmNoNans) && MI.getFlag(MachineInstr::FmNoInfs); if (!IsFiniteOnly) { // Expand isfinite(x) => fabs(x) < inf @@ -3699,24 +3750,39 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, return true; } +static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, + const SrcOp &Src, unsigned Flags) { + LLT Ty = Dst.getLLTTy(*B.getMRI()); + + if (Ty == LLT::scalar(32)) { + return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst}) + .addUse(Src.getReg()) + .setMIFlags(Flags); + } + return B.buildFExp2(Dst, Src, Flags); +} + +bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B, + Register Dst, Register X, + unsigned Flags, + bool IsExp10) const { + LLT Ty = B.getMRI()->getType(X); + + // exp(x) -> exp2(M_LOG2E_F * x); + // exp10(x) -> exp2(log2(10) * x); + auto Const = B.buildFConstant(Ty, IsExp10 ? 
0x1.a934f0p+1f : numbers::log2e); + auto Mul = B.buildFMul(Ty, X, Const, Flags); + buildExp(B, Dst, Mul, Flags); + return true; +} + bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register X, unsigned Flags) const { LLT Ty = B.getMRI()->getType(Dst); LLT F32 = LLT::scalar(32); if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { - auto Log2E = B.buildFConstant(Ty, numbers::log2e); - auto Mul = B.buildFMul(Ty, X, Log2E, Flags); - - if (Ty == F32) { - B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) - .addUse(Mul.getReg(0)) - .setMIFlags(Flags); - } else { - B.buildFExp2(Dst, Mul.getReg(0), Flags); - } - - return true; + return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false); } auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); @@ -3739,6 +3805,55 @@ bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, return true; } +bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B, + Register Dst, Register X, + unsigned Flags) const { + LLT Ty = B.getMRI()->getType(Dst); + LLT F32 = LLT::scalar(32); + + if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { + // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f); + auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f); + auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f); + + auto Mul1 = B.buildFMul(Ty, X, K1, Flags); + auto Exp2_1 = buildExp(B, Ty, Mul1, Flags); + auto Mul0 = B.buildFMul(Ty, X, K0, Flags); + auto Exp2_0 = buildExp(B, Ty, Mul0, Flags); + B.buildFMul(Dst, Exp2_0, Exp2_1, Flags); + return true; + } + + // bool s = x < -0x1.2f7030p+5f; + // x += s ? 0x1.0p+5f : 0.0f; + // exp10 = exp2(x * 0x1.a92000p+1f) * + // exp2(x * 0x1.4f0978p-11f) * + // (s ? 
0x1.9f623ep-107f : 1.0f); + + auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f); + auto NeedsScaling = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold); + + auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f); + auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); + auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X); + + auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f); + auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f); + + auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags); + auto Exp2_1 = buildExp(B, Ty, Mul1, Flags); + auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags); + auto Exp2_0 = buildExp(B, Ty, Mul0, Flags); + + auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags); + auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f); + auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags); + + B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps); + return true; +} + bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const { Register Dst = MI.getOperand(0).getReg(); @@ -3755,18 +3870,22 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, // v_exp_f16 (fmul x, log2e) if (allowApproxFunc(MF, Flags)) { // TODO: Does this really require fast? - legalizeFExpUnsafe(B, Dst, X, Flags); + IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags) + : legalizeFExpUnsafe(B, Dst, X, Flags); MI.eraseFromParent(); return true; } + // Nothing in half is a denormal when promoted to f32. + // // exp(f16 x) -> // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) - - // Nothing in half is a denormal when promoted to f32. 
+ // + // exp10(f16 x) -> + // fptrunc (v_exp_f32 (fmul (fpext x), log2(10))) auto Ext = B.buildFPExt(F32, X, Flags); Register Lowered = MRI.createGenericVirtualRegister(F32); - legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); + legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10); B.buildFPTrunc(Dst, Lowered, Flags); MI.eraseFromParent(); return true; @@ -3777,7 +3896,8 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying // library behavior. Also, is known-not-daz source sufficient? if (allowApproxFunc(MF, Flags)) { - legalizeFExpUnsafe(B, Dst, X, Flags); + IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags) + : legalizeFExpUnsafe(B, Dst, X, Flags); MI.eraseFromParent(); return true; } @@ -4702,6 +4822,14 @@ bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( return true; } +MachinePointerInfo +AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const { + // This isn't really a constant pool but close enough. 
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool()); + PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS; + return PtrInfo; +} + Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const { LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); @@ -4729,8 +4857,8 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, "unexpected kernarg parameter type"); Register Ptr = getKernargParameterPtr(B, Offset); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); - B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF()); + B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); MI.eraseFromParent(); @@ -6042,7 +6170,7 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before // being added, so we can only safely match a 32-bit addition with no unsigned // overflow. 
- bool CheckNUW = AMDGPU::isGFX1250(ST); + bool CheckNUW = ST.hasGFX1250Insts(); std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset( MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW); @@ -6531,8 +6659,15 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32; case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32; default: llvm_unreachable("unhandled atomic opcode"); @@ -6766,7 +6901,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } Observer.changingInstr(MI); - auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); + scope_exit ChangedInstr([&] { Observer.changedInstr(MI); }); const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; @@ -7194,7 +7329,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper, bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - if (!ST.isTrapHandlerEnabled() || + if (!ST.hasTrapHandler() || ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return legalizeTrapEndpgm(MI, MRI, B); @@ -7253,9 +7388,9 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( return false; // TODO: can we be smarter about machine pointer info? 
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF); MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, + PtrInfo.getWithOffset(Offset), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, LLT::scalar(64), commonAlignment(Align(64), Offset)); @@ -7314,7 +7449,7 @@ bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI, MachineIRBuilder &B) const { // Is non-HSA path or trap-handler disabled? Then, report a warning // accordingly - if (!ST.isTrapHandlerEnabled() || + if (!ST.hasTrapHandler() || ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { Function &Fn = B.getMF().getFunction(); Fn.getContext().diagnose(DiagnosticInfoUnsupported( @@ -7630,6 +7765,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, // Replace the use G_BRCOND with the exec manipulate and branch pseudos. auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrID) { + case Intrinsic::sponentry: + if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) { + // FIXME: The imported pattern checks for i32 instead of p5; if we fix + // that we can remove this cast. 
+ const LLT S32 = LLT::scalar(32); + Register TmpReg = MRI.createGenericVirtualRegister(S32); + B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg); + + Register DstReg = MI.getOperand(0).getReg(); + B.buildIntToPtr(DstReg, TmpReg); + MI.eraseFromParent(); + } else { + int FI = B.getMF().getFrameInfo().CreateFixedObject( + 1, 0, /*IsImmutable=*/false); + B.buildFrameIndex(MI.getOperand(0), FI); + MI.eraseFromParent(); + } + return true; case Intrinsic::amdgcn_if: case Intrinsic::amdgcn_else: { MachineInstr *Br = nullptr; @@ -7717,7 +7870,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_make_buffer_rsrc: return legalizePointerAsRsrcIntrin(MI, MRI, B); case Intrinsic::amdgcn_kernarg_segment_ptr: - if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { + if (!AMDGPU::isKernel(B.getMF().getFunction())) { // This only makes sense to call in a kernel, so just lower to null. B.buildConstant(MI.getOperand(0).getReg(), 0); MI.eraseFromParent(); @@ -7940,6 +8093,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: @@ -8043,6 +8204,26 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 
B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin()); MI.eraseFromParent(); return true; + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!"); + B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .addMemOperand(*MI.memoperands_begin()); + MI.eraseFromParent(); + return true; + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: + assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!"); + B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .addMemOperand(*MI.memoperands_begin()); + MI.eraseFromParent(); + return true; default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index cd44a9b..1224ee7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -91,8 +91,12 @@ public: bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const; bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, + unsigned Flags, bool IsExp10) const; bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const; + bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, + unsigned Flags) const; bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -132,6 +136,7 @@ 
public: MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const; Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const; bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index aa75534..4de9349 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -47,9 +47,7 @@ namespace llvm { class AMDGPULibCalls { private: - const TargetLibraryInfo *TLInfo = nullptr; - AssumptionCache *AC = nullptr; - DominatorTree *DT = nullptr; + SimplifyQuery SQ; using FuncInfo = llvm::AMDGPULibFunc; @@ -129,11 +127,10 @@ protected: } public: - AMDGPULibCalls() = default; + AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM); bool fold(CallInst *CI); - void initFunction(Function &F, FunctionAnalysisManager &FAM); void initNativeFuncs(); // Replace a normal math function call with that native version @@ -422,11 +419,11 @@ bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold( return FPOp->isFast(); } -void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { - AC = &FAM.getResult<AssumptionAnalysis>(F); - TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); - DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); -} +AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM) + : SQ(F.getParent()->getDataLayout(), + &FAM.getResult<TargetLibraryAnalysis>(F), + FAM.getCachedResult<DominatorTreeAnalysis>(F), + &FAM.getResult<AssumptionAnalysis>(F)) {} bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { return AllNative || llvm::is_contained(UseNative, F); @@ -563,74 +560,6 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, return true; } -static bool isKnownIntegral(const 
Value *V, const DataLayout &DL, - FastMathFlags FMF) { - if (isa<PoisonValue>(V)) - return true; - if (isa<UndefValue>(V)) - return false; - - if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) - return CF->getValueAPF().isInteger(); - - auto *VFVTy = dyn_cast<FixedVectorType>(V->getType()); - const Constant *CV = dyn_cast<Constant>(V); - if (VFVTy && CV) { - unsigned NumElts = VFVTy->getNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = CV->getAggregateElement(i); - if (!Elt) - return false; - if (isa<PoisonValue>(Elt)) - continue; - - const ConstantFP *CFP = dyn_cast<ConstantFP>(Elt); - if (!CFP || !CFP->getValue().isInteger()) - return false; - } - - return true; - } - - const Instruction *I = dyn_cast<Instruction>(V); - if (!I) - return false; - - switch (I->getOpcode()) { - case Instruction::SIToFP: - case Instruction::UIToFP: - // TODO: Could check nofpclass(inf) on incoming argument - if (FMF.noInfs()) - return true; - - // Need to check int size cannot produce infinity, which computeKnownFPClass - // knows how to do already. - return isKnownNeverInfinity(I, SimplifyQuery(DL)); - case Instruction::Call: { - const CallInst *CI = cast<CallInst>(I); - switch (CI->getIntrinsicID()) { - case Intrinsic::trunc: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - return (FMF.noInfs() && FMF.noNaNs()) || - isKnownNeverInfOrNaN(I, SimplifyQuery(DL)); - default: - break; - } - - break; - } - default: - break; - } - - return false; -} - // This function returns false if no change; return true otherwise. 
bool AMDGPULibCalls::fold(CallInst *CI) { Function *Callee = CI->getCalledFunction(); @@ -753,16 +682,14 @@ bool AMDGPULibCalls::fold(CallInst *CI) { // pow(x, y) -> powr(x, y) for x >= -0.0 // TODO: Account for flags on current call - if (PowrFunc && - cannotBeOrderedLessThanZero( - FPOp->getOperand(0), - SimplifyQuery(M->getDataLayout(), TLInfo, DT, AC, Call))) { + if (PowrFunc && cannotBeOrderedLessThanZero( + FPOp->getOperand(0), SQ.getWithInstruction(Call))) { Call->setCalledFunction(PowrFunc); return fold_pow(FPOp, B, PowrInfo) || true; } // pow(x, y) -> pown(x, y) for known integral y - if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(), + if (isKnownIntegral(FPOp->getOperand(1), SQ.getWithInstruction(CI), FPOp->getFastMathFlags())) { FunctionType *PownType = getPownType(CI->getFunctionType()); AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true); @@ -845,7 +772,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { return false; } } - LLVMContext &context = CI->getParent()->getParent()->getContext(); + LLVMContext &context = CI->getContext(); Constant *nval; if (getArgType(FInfo) == AMDGPULibFunc::F32) { SmallVector<float, 0> FVal; @@ -1084,7 +1011,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) { // We cannot handle corner cases for a general pow() function, give up // unless y is a constant integral value. Then proceed as if it were pown. 
- if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags())) + if (!isKnownIntegral(opr1, SQ.getWithInstruction(cast<Instruction>(FPOp)), + FPOp->getFastMathFlags())) return false; } @@ -1113,22 +1041,33 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F"); } nval = B.CreateFMul(opr1, nval, "__ylogx"); - nval = CreateCallEx(B,ExpExpr, nval, "__exp2"); + + CallInst *Exp2Call = CreateCallEx(B, ExpExpr, nval, "__exp2"); + + // TODO: Generalized fpclass logic for pow + FPClassTest KnownNot = FPClassTest::fcNegative; + if (FPOp->hasNoNaNs()) + KnownNot |= FPClassTest::fcNan; + + Exp2Call->addRetAttr( + Attribute::getWithNoFPClass(Exp2Call->getContext(), KnownNot)); + nval = Exp2Call; if (needcopysign) { Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits()); Type *nTy = FPOp->getType()->getWithNewType(nTyS); - unsigned size = nTy->getScalarSizeInBits(); Value *opr_n = FPOp->getOperand(1); if (opr_n->getType()->getScalarType()->isIntegerTy()) opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou"); else opr_n = B.CreateFPToSI(opr1, nTy, "__ytou"); + unsigned size = nTy->getScalarSizeInBits(); Value *sign = B.CreateShl(opr_n, size-1, "__yeven"); sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign"); - nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign); - nval = B.CreateBitCast(nval, opr0->getType()); + + nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()), + nullptr, "__pow_sign"); } LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " @@ -1333,7 +1272,7 @@ AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B, // TODO: Is it worth trying to preserve the location for the cos calls for the // load? 
- LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc); + LoadInst *LoadCos = B.CreateLoad(Arg->getType(), Alloc); return {SinCos, LoadCos, SinCos}; } @@ -1699,9 +1638,8 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) { - AMDGPULibCalls Simplifier; + AMDGPULibCalls Simplifier(F, AM); Simplifier.initNativeFuncs(); - Simplifier.initFunction(F, AM); bool Changed = false; @@ -1728,9 +1666,8 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, if (UseNative.empty()) return PreservedAnalyses::all(); - AMDGPULibCalls Simplifier; + AMDGPULibCalls Simplifier(F, AM); Simplifier.initNativeFuncs(); - Simplifier.initFunction(F, AM); bool Changed = false; for (auto &BB : F) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 0a59132..05e97d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst( MemSetInst &MSI) { if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) return false; - llvm::expandMemSetAsLoop(&MSI); + llvm::expandMemSetAsLoop(&MSI, + TM->getTargetTransformInfo(*MSI.getFunction())); MSI.eraseFromParent(); return true; } @@ -1565,8 +1566,11 @@ void SplitPtrStructs::processConditionals() { } else if (isa<SelectInst>(I)) { if (MaybeRsrc) { if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) { - ConditionalTemps.push_back(RsrcInst); - RsrcInst->replaceAllUsesWith(*MaybeRsrc); + // Guard against conditionals that were already folded away. 
+ if (RsrcInst != *MaybeRsrc) { + ConditionalTemps.push_back(RsrcInst); + RsrcInst->replaceAllUsesWith(*MaybeRsrc); + } } for (Value *V : Seen) FoundRsrcs[V] = *MaybeRsrc; @@ -1745,6 +1749,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, case AtomicRMWInst::FMin: IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin; break; + case AtomicRMWInst::USubCond: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32; + break; + case AtomicRMWInst::USubSat: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32; + break; case AtomicRMWInst::FSub: { reportFatalUsageError( "atomic floating point subtraction not supported for " @@ -1770,14 +1780,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, break; case AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: - reportFatalUsageError("wrapping increment/decrement not supported for " - "buffer resources and should've ben expanded away"); + reportFatalUsageError( + "wrapping increment/decrement not supported for " + "buffer resources and should've been expanded away"); break; case AtomicRMWInst::BAD_BINOP: llvm_unreachable("Not sure how we got a bad binop"); - case AtomicRMWInst::USubCond: - case AtomicRMWInst::USubSat: - break; } } @@ -2059,17 +2067,7 @@ PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) { "Pointer comparison is only equal or unequal"); auto [LhsRsrc, LhsOff] = getPtrParts(Lhs); auto [RhsRsrc, RhsOff] = getPtrParts(Rhs); - Value *RsrcCmp = - IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc"); - copyMetadata(RsrcCmp, &Cmp); - Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off"); - copyMetadata(OffCmp, &Cmp); - - Value *Res = nullptr; - if (Pred == ICmpInst::ICMP_EQ) - Res = IRB.CreateAnd(RsrcCmp, OffCmp); - else if (Pred == ICmpInst::ICMP_NE) - Res = IRB.CreateOr(RsrcCmp, OffCmp); + Value *Res = IRB.CreateICmp(Pred, LhsOff, RhsOff); copyMetadata(Res, &Cmp); Res->takeName(&Cmp); 
SplitUsers.insert(&Cmp); @@ -2210,6 +2208,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { case Intrinsic::memset_inline: case Intrinsic::experimental_memset_pattern: case Intrinsic::amdgcn_load_to_lds: + case Intrinsic::amdgcn_load_async_to_lds: return true; } } @@ -2298,7 +2297,8 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { SplitUsers.insert(&I); return {NewRsrc, Off}; } - case Intrinsic::amdgcn_load_to_lds: { + case Intrinsic::amdgcn_load_to_lds: + case Intrinsic::amdgcn_load_async_to_lds: { Value *Ptr = I.getArgOperand(0); if (!isSplitFatPtr(Ptr->getType())) return {nullptr, nullptr}; @@ -2309,9 +2309,12 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { Value *ImmOff = I.getArgOperand(3); Value *Aux = I.getArgOperand(4); Value *SOffset = IRB.getInt32(0); + Intrinsic::ID NewIntr = + IID == Intrinsic::amdgcn_load_to_lds + ? Intrinsic::amdgcn_raw_ptr_buffer_load_lds + : Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds; Instruction *NewLoad = IRB.CreateIntrinsic( - Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {}, - {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); + NewIntr, {}, {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); copyMetadata(NewLoad, &I); SplitUsers.insert(&I); I.replaceAllUsesWith(NewLoad); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp new file mode 100644 index 0000000..c26e973 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -0,0 +1,240 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower LDS global variables with target extension type "amdgpu.named.barrier" +// that require specialized address assignment. It assigns a unique +// barrier identifier to each named-barrier LDS variable and encodes +// this identifier within the !absolute_symbol metadata of that global. +// This encoding ensures that subsequent LDS lowering passes can process these +// barriers correctly without conflicts. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMemoryUtils.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +#include <algorithm> + +#define DEBUG_TYPE "amdgpu-lower-exec-sync" + +using namespace llvm; +using namespace AMDGPU; + +namespace { + +// If GV is also used directly by other kernels, create a new GV +// used only by this kernel and its function. +static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, + Function *KF) { + bool NeedsReplacement = false; + for (Use &U : GV->uses()) { + if (auto *I = dyn_cast<Instruction>(U.getUser())) { + Function *F = I->getFunction(); + if (isKernel(*F) && F != KF) { + NeedsReplacement = true; + break; + } + } + } + if (!NeedsReplacement) + return GV; + // Create a new GV used only by this kernel and its function + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + GV->getInitializer(), GV->getName() + "." 
+ KF->getName(), nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + for (Use &U : make_early_inc_range(GV->uses())) { + if (auto *I = dyn_cast<Instruction>(U.getUser())) { + Function *F = I->getFunction(); + if (!isKernel(*F) || F == KF) { + U.getUser()->replaceUsesOfWith(GV, NewGV); + } + } + } + return NewGV; +} + +// Write the specified address into metadata where it can be retrieved by +// the assembler. Format is a half open range, [Address Address+1) +static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, + uint32_t Address) { + LLVMContext &Ctx = M->getContext(); + auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(Ctx, {MinC, MaxC})); +} + +template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) { + sort(V, [](const auto *L, const auto *R) { + return L->getName() < R->getName(); + }); + return {std::move(V)}; +} + +// Main utility function for special LDS variables lowering. +static bool lowerExecSyncGlobalVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + // The 1st round: give module-absolute assignments + int NumAbsolutes = 0; + SmallVector<GlobalVariable *> OrderedGVs; + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + GlobalVariable *GV = K.first; + if (!isNamedBarrier(*GV)) + continue; + // give a module-absolute assignment if it is indirectly accessed by + // multiple kernels. This is not precise, but we don't want to duplicate + // a function when it is called by multiple kernels. 
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { + OrderedGVs.push_back(GV); + } else { + // leave it to the 2nd round, which will give a kernel-relative + // assignment if it is only indirectly accessed by one kernel + LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); + } + LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = NumAbsolutes + 1; + unsigned BarCnt = GV->getGlobalSize(DL) / 16; + NumAbsolutes += BarCnt; + + // 4 bits for alignment, 5 bits for the barrier num, + // 3 bits for the barrier scope + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, GV, Offset); + } + OrderedGVs.clear(); + + // The 2nd round: give a kernel-relative assignment for GV that + // either only indirectly accessed by single kernel or only directly + // accessed by multiple kernels. + SmallVector<Function *> OrderedKernels; + for (auto &K : LDSUsesInfo.direct_access) { + Function *F = K.first; + assert(isKernel(*F)); + OrderedKernels.push_back(F); + } + OrderedKernels = sortByName(std::move(OrderedKernels)); + + DenseMap<Function *, uint32_t> Kernel2BarId; + for (Function *F : OrderedKernels) { + for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { + if (!isNamedBarrier(*GV)) + continue; + + LDSUsesInfo.direct_access[F].erase(GV); + if (GV->isAbsoluteSymbolRef()) { + // already assigned + continue; + } + OrderedGVs.push_back(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + // GV could also be used directly by other kernels. If so, we need to + // create a new GV used only by this kernel and its function. 
+ auto NewGV = uniquifyGVPerKernel(M, GV, F); + Changed |= (NewGV != GV); + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = Kernel2BarId[F]; + BarId += NumAbsolutes + 1; + unsigned BarCnt = GV->getGlobalSize(DL) / 16; + Kernel2BarId[F] += BarCnt; + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, NewGV, Offset); + } + OrderedGVs.clear(); + } + // Also erase those special LDS variables from indirect_access. + for (auto &K : LDSUsesInfo.indirect_access) { + assert(isKernel(*K.first)); + for (GlobalVariable *GV : K.second) { + if (isNamedBarrier(*GV)) + K.second.erase(GV); + } + } + return Changed; +} + +static bool runLowerExecSyncGlobals(Module &M) { + CallGraph CG = CallGraph(M); + bool Changed = false; + Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); + + // For each kernel, what variables does it access directly or through + // callees + LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); + + // For each variable accessed through callees, which kernels access it + VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; + for (auto &K : LDSUsesInfo.indirect_access) { + Function *F = K.first; + assert(isKernel(*F)); + for (GlobalVariable *GV : K.second) { + LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); + } + } + + if (LDSUsesInfo.HasSpecialGVs) { + // Special LDS variables need special address assignment + Changed |= lowerExecSyncGlobalVariables( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); + } + return Changed; +} + +class AMDGPULowerExecSyncLegacy : public ModulePass { +public: + static char ID; + AMDGPULowerExecSyncLegacy() : ModulePass(ID) {} + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPULowerExecSyncLegacy::ID = 0; +char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; + +INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, + "AMDGPU lowering of execution 
synchronization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, + "AMDGPU lowering of execution synchronization", false, + false) + +bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { + return runLowerExecSyncGlobals(M); +} + +ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() { + return new AMDGPULowerExecSyncLegacy(); +} + +PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M, + ModuleAnalysisManager &AM) { + return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index dec781d..f93b0b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -12,14 +12,26 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUAsanInstrumentation.h" #include "GCNSubtarget.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetMachine.h" +#include <optional> +#include <string> #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" @@ -37,6 +49,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetPassConfig>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesAll(); } }; @@ -58,13 +71,131 @@ static BasicBlock::iterator 
getInsertPt(BasicBlock &BB) { return InsPt; } -static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { +static void addAliasScopeMetadata(Function &F, const DataLayout &DL, + DominatorTree &DT) { + // Collect noalias arguments. + SmallVector<const Argument *, 4u> NoAliasArgs; + + for (Argument &Arg : F.args()) + if (Arg.hasNoAliasAttr() && !Arg.use_empty()) + NoAliasArgs.push_back(&Arg); + + if (NoAliasArgs.empty()) + return; + + // Add alias scopes for each noalias argument. + MDBuilder MDB(F.getContext()); + DenseMap<const Argument *, MDNode *> NewScopes; + MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(F.getName()); + + for (unsigned I = 0u; I < NoAliasArgs.size(); ++I) { + const Argument *Arg = NoAliasArgs[I]; + MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Arg->getName()); + NewScopes.insert({Arg, NewScope}); + } + + // Iterate over all instructions. + for (inst_iterator Inst = inst_begin(F), InstEnd = inst_end(F); + Inst != InstEnd; ++Inst) { + // If instruction accesses memory, collect its pointer arguments. + Instruction *I = &(*Inst); + SmallVector<const Value *, 2u> PtrArgs; + + if (std::optional<MemoryLocation> MO = MemoryLocation::getOrNone(I)) + PtrArgs.push_back(MO->Ptr); + else if (const CallBase *Call = dyn_cast<CallBase>(I)) { + if (Call->doesNotAccessMemory()) + continue; + + for (Value *Arg : Call->args()) { + if (!Arg->getType()->isPointerTy()) + continue; + + PtrArgs.push_back(Arg); + } + } + + if (PtrArgs.empty()) + continue; + + // Collect underlying objects of pointer arguments. 
+ SmallVector<Metadata *, 4u> Scopes; + SmallPtrSet<const Value *, 4u> ObjSet; + SmallVector<Metadata *, 4u> NoAliases; + + for (const Value *Val : PtrArgs) { + SmallVector<const Value *, 4u> Objects; + getUnderlyingObjects(Val, Objects); + ObjSet.insert_range(Objects); + } + + bool RequiresNoCaptureBefore = false; + bool UsesUnknownObject = false; + bool UsesAliasingPtr = false; + + for (const Value *Val : ObjSet) { + if (isa<ConstantData>(Val)) + continue; + + if (const Argument *Arg = dyn_cast<Argument>(Val)) { + if (!Arg->hasAttribute(Attribute::NoAlias)) + UsesAliasingPtr = true; + } else + UsesAliasingPtr = true; + + if (isEscapeSource(Val)) + RequiresNoCaptureBefore = true; + else if (!isa<Argument>(Val) && isIdentifiedObject(Val)) + UsesUnknownObject = true; + } + + if (UsesUnknownObject) + continue; + + // Collect noalias scopes for instruction. + for (const Argument *Arg : NoAliasArgs) { + if (ObjSet.contains(Arg)) + continue; + + if (!RequiresNoCaptureBefore || + !capturesAnything(PointerMayBeCapturedBefore( + Arg, false, I, &DT, false, CaptureComponents::Provenance))) + NoAliases.push_back(NewScopes[Arg]); + } + + // Add noalias metadata to instruction. + if (!NoAliases.empty()) { + MDNode *NewMD = + MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_noalias), + MDNode::get(F.getContext(), NoAliases)); + Inst->setMetadata(LLVMContext::MD_noalias, NewMD); + } + + // Collect scopes for alias.scope metadata. + if (!UsesAliasingPtr) + for (const Argument *Arg : NoAliasArgs) { + if (ObjSet.count(Arg)) + Scopes.push_back(NewScopes[Arg]); + } + + // Add alias.scope metadata to instruction. 
+ if (!Scopes.empty()) { + MDNode *NewMD = + MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(F.getContext(), Scopes)); + Inst->setMetadata(LLVMContext::MD_alias_scope, NewMD); + } + } +} + +static bool lowerKernelArguments(Function &F, const TargetMachine &TM, + DominatorTree &DT) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty()) return false; const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); const DataLayout &DL = F.getDataLayout(); BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock)); @@ -86,6 +217,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; + + addAliasScopeMetadata(F, F.getParent()->getDataLayout(), DT); + for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); @@ -124,11 +258,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) && !ST.hasUsableDSOffset()) continue; - - // FIXME: We can replace this with equivalent alias.scope/noalias - // metadata, but this appears to be a lot of work. - if (Arg.hasNoAliasAttr()) - continue; } auto *VT = dyn_cast<FixedVectorType>(ArgTy); @@ -215,8 +344,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { } } - // TODO: Convert noalias arg to !noalias - if (DoShiftOpt) { Value *ExtractBits = OffsetDiff == 0 ? 
Load : Builder.CreateLShr(Load, OffsetDiff * 8); @@ -245,7 +372,8 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { auto &TPC = getAnalysis<TargetPassConfig>(); const TargetMachine &TM = TPC.getTM<TargetMachine>(); - return lowerKernelArguments(F, TM); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return lowerKernelArguments(F, TM, DT); } INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, @@ -261,7 +389,8 @@ FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() { PreservedAnalyses AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) { - bool Changed = lowerKernelArguments(F, TM); + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + bool Changed = lowerKernelArguments(F, TM, DT); if (Changed) { // TODO: Preserves a lot more. PreservedAnalyses PA; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index fed7a13..fbfb710 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===// +//===-- AMDGPULowerKernelAttributes.cpp------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -19,6 +19,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -66,13 +67,11 @@ public: bool runOnModule(Module &M) override; - StringRef getPassName() const override { - return "AMDGPU Kernel Attributes"; - } + StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - } + } }; Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { @@ -98,26 +97,28 @@ static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, } static bool processUse(CallInst *CI, bool IsV5OrAbove) { - Function *F = CI->getParent()->getParent(); + Function *F = CI->getFunction(); auto *MD = F->getMetadata("reqd_work_group_size"); const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; const bool HasUniformWorkGroupSize = - F->getFnAttribute("uniform-work-group-size").getValueAsBool(); + F->getFnAttribute("uniform-work-group-size").getValueAsBool(); SmallVector<unsigned> MaxNumWorkgroups = AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", /*Size=*/3, /*DefaultVal=*/0); if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize && + !Intrinsic::getDeclarationIfExists(CI->getModule(), + Intrinsic::amdgcn_dispatch_ptr) && none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; })) return false; Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; - Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; - Value *Remainders[3] = {nullptr, nullptr, nullptr}; - Value *GridSizes[3] = {nullptr, nullptr, nullptr}; + Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; + Value *Remainders[3] = {nullptr, nullptr, nullptr}; + Value *GridSizes[3] = {nullptr, nullptr, nullptr}; const DataLayout &DL = F->getDataLayout(); @@ -230,13 +231,15 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { bool 
MadeChange = false; if (IsV5OrAbove && HasUniformWorkGroupSize) { - // Under v5 __ockl_get_local_size returns the value computed by the expression: + // Under v5 __ockl_get_local_size returns the value computed by the + // expression: // - // workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder + // workgroup_id < hidden_block_count ? hidden_group_size : + // hidden_remainder // - // For functions with the attribute uniform-work-group-size=true. we can evaluate - // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned - // for __ockl_get_local_size. + // For functions with the attribute uniform-work-group-size=true. we can + // evaluate workgroup_id < hidden_block_count as true, and thus + // hidden_group_size is returned for __ockl_get_local_size. for (int I = 0; I < 3; ++I) { Value *BlockCount = BlockCounts[I]; if (!BlockCount) @@ -261,7 +264,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { for (Value *Remainder : Remainders) { if (!Remainder) continue; - Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType())); + Remainder->replaceAllUsesWith( + Constant::getNullValue(Remainder->getType())); MadeChange = true; } } else if (HasUniformWorkGroupSize) { // Pre-V5. 
@@ -302,13 +306,13 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { continue; for (User *UMin : ZextGroupSize->users()) { - if (match(UMin, - m_UMin(m_Sub(m_Specific(GridSize), - m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))), - m_Specific(ZextGroupSize)))) { + if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize), + m_Mul(GroupIDIntrin, + m_Specific(ZextGroupSize))), + m_Specific(ZextGroupSize)))) { if (HasReqdWorkGroupSize) { - ConstantInt *KnownSize - = mdconst::extract<ConstantInt>(MD->getOperand(I)); + ConstantInt *KnownSize = + mdconst::extract<ConstantInt>(MD->getOperand(I)); UMin->replaceAllUsesWith(ConstantFoldIntegerCast( KnownSize, UMin->getType(), false, DL)); } else { @@ -322,6 +326,49 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { } } + // Upgrade the old method of calculating the block size using the grid size. + // We pattern match any case where the implicit argument group size is the + // divisor to a dispatch packet grid size read of the same dimension. 
+ if (IsV5OrAbove) { + for (int I = 0; I < 3; I++) { + Value *GroupSize = GroupSizes[I]; + if (!GroupSize || !GroupSize->getType()->isIntegerTy(16)) + continue; + + for (User *U : GroupSize->users()) { + Instruction *Inst = cast<Instruction>(U); + if (isa<ZExtInst>(Inst) && !Inst->use_empty()) + Inst = cast<Instruction>(*Inst->user_begin()); + + using namespace llvm::PatternMatch; + if (!match( + Inst, + m_UDiv(m_ZExtOrSelf(m_Load(m_GEP( + m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(), + m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))), + m_Value()))) + continue; + + IRBuilder<> Builder(Inst); + + Value *GEP = Builder.CreateInBoundsGEP( + Builder.getInt8Ty(), CI, + {ConstantInt::get(Type::getInt64Ty(CI->getContext()), + HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))}); + Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP); + BlockCount->setMetadata(LLVMContext::MD_invariant_load, + MDNode::get(CI->getContext(), {})); + BlockCount->setMetadata(LLVMContext::MD_noundef, + MDNode::get(CI->getContext(), {})); + + Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType()); + Inst->replaceAllUsesWith(BlockCountExt); + Inst->eraseFromParent(); + MadeChange = true; + } + } + } + // If reqd_work_group_size is set, we can replace work group size with it. if (!HasReqdWorkGroupSize) return MadeChange; @@ -340,7 +387,6 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { return MadeChange; } - // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get // TargetPassConfig for subtarget. 
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { @@ -364,7 +410,6 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { return MadeChange; } - INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU Kernel Attributes", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, @@ -385,12 +430,14 @@ AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) { if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. return PreservedAnalyses::all(); + bool Changed = false; for (Instruction &I : instructions(F)) { if (CallInst *CI = dyn_cast<CallInst>(&I)) { if (CI->getCalledFunction() == BasePtr) - processUse(CI, IsV5OrAbove); + Changed |= processUse(CI, IsV5OrAbove); } } - return PreservedAnalyses::all(); + return !Changed ? PreservedAnalyses::all() + : PreservedAnalyses::none().preserveSet<CFGAnalyses>(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a4ef524..588eee0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -441,7 +441,7 @@ public: return KernelSet; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) { if (VariableSet.contains(GV)) { @@ -501,9 +501,7 @@ public: // strategy continue; } - CandidateTy Candidate( - GV, K.second.size(), - DL.getTypeAllocSize(GV->getValueType()).getFixedValue()); + CandidateTy Candidate(GV, K.second.size(), GV->getGlobalSize(DL)); if (MostUsed < Candidate) MostUsed = Candidate; } @@ -555,7 +553,7 @@ public: for (Function &Func : M->functions()) { if (Func.isDeclaration()) continue; - if (!isKernelLDS(&Func)) + if (!isKernel(Func)) continue; if (KernelsThatAllocateTableLDS.contains(&Func) || @@ -703,7 +701,7 @@ public: return false; } Function *F = 
I->getFunction(); - return !isKernelLDS(F); + return !isKernel(*F); }); // Replace uses of module scope variable from kernel functions that @@ -711,7 +709,7 @@ public: // Record on each kernel whether the module scope global is used by it for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; if (KernelsThatAllocateModuleLDS.contains(&Func)) { @@ -743,7 +741,7 @@ public: DenseMap<Function *, LDSVariableReplacement> KernelToReplacement; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; DenseSet<GlobalVariable *> KernelUsedVariables; @@ -828,7 +826,7 @@ public: // semantics. Setting the alignment here allows this IR pass to accurately // predict the exact constant at which it will be allocated. - assert(isKernelLDS(func)); + assert(isKernel(*func)); LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -878,7 +876,7 @@ public: for (auto &func : OrderedKernels) { if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) { - assert(isKernelLDS(func)); + assert(isKernel(*func)); if (!func->hasName()) { reportFatalUsageError("anonymous kernels cannot use LDS variables"); } @@ -912,7 +910,7 @@ public: auto *I = dyn_cast<Instruction>(U.getUser()); if (!I) continue; - if (isKernelLDS(I->getFunction())) + if (isKernel(*I->getFunction())) continue; replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr); @@ -922,126 +920,6 @@ public: return KernelToCreatedDynamicLDS; } - static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, - Function *KF) { - bool NeedsReplacement = false; - for (Use &U : GV->uses()) { - if (auto *I = dyn_cast<Instruction>(U.getUser())) { - Function *F = I->getFunction(); - if (isKernelLDS(F) && F != KF) { - NeedsReplacement = true; - break; - } - } - } - if (!NeedsReplacement) - return GV; - // Create a new GV used only 
by this kernel and its function - GlobalVariable *NewGV = new GlobalVariable( - M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), - GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, - GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - NewGV->copyAttributesFrom(GV); - for (Use &U : make_early_inc_range(GV->uses())) { - if (auto *I = dyn_cast<Instruction>(U.getUser())) { - Function *F = I->getFunction(); - if (!isKernelLDS(F) || F == KF) { - U.getUser()->replaceUsesOfWith(GV, NewGV); - } - } - } - return NewGV; - } - - bool lowerSpecialLDSVariables( - Module &M, LDSUsesInfoTy &LDSUsesInfo, - VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { - bool Changed = false; - const DataLayout &DL = M.getDataLayout(); - // The 1st round: give module-absolute assignments - int NumAbsolutes = 0; - std::vector<GlobalVariable *> OrderedGVs; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - GlobalVariable *GV = K.first; - if (!isNamedBarrier(*GV)) - continue; - // give a module-absolute assignment if it is indirectly accessed by - // multiple kernels. This is not precise, but we don't want to duplicate - // a function when it is called by multiple kernels. 
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { - OrderedGVs.push_back(GV); - } else { - // leave it to the 2nd round, which will give a kernel-relative - // assignment if it is only indirectly accessed by one kernel - LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); - } - LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - NumAbsolutes += BarCnt; - - // 4 bits for alignment, 5 bits for the barrier num, - // 3 bits for the barrier scope - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, GV, Offset); - } - OrderedGVs.clear(); - - // The 2nd round: give a kernel-relative assignment for GV that - // either only indirectly accessed by single kernel or only directly - // accessed by multiple kernels. - std::vector<Function *> OrderedKernels; - for (auto &K : LDSUsesInfo.direct_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - OrderedKernels.push_back(F); - } - OrderedKernels = sortByName(std::move(OrderedKernels)); - - llvm::DenseMap<Function *, uint32_t> Kernel2BarId; - for (Function *F : OrderedKernels) { - for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { - if (!isNamedBarrier(*GV)) - continue; - - LDSUsesInfo.direct_access[F].erase(GV); - if (GV->isAbsoluteSymbolRef()) { - // already assigned - continue; - } - OrderedGVs.push_back(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - // GV could also be used directly by other kernels. If so, we need to - // create a new GV used only by this kernel and its function. 
- auto NewGV = uniquifyGVPerKernel(M, GV, F); - Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = Kernel2BarId[F]; - BarId += NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - Kernel2BarId[F] += BarCnt; - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, NewGV, Offset); - } - OrderedGVs.clear(); - } - // Also erase those special LDS variables from indirect_access. - for (auto &K : LDSUsesInfo.indirect_access) { - assert(isKernelLDS(K.first)); - for (GlobalVariable *GV : K.second) { - if (isNamedBarrier(*GV)) - K.second.erase(GV); - } - } - return Changed; - } - bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1058,18 +936,12 @@ public: VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; for (auto &K : LDSUsesInfo.indirect_access) { Function *F = K.first; - assert(isKernelLDS(F)); + assert(isKernel(*F)); for (GlobalVariable *GV : K.second) { LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); } } - if (LDSUsesInfo.HasSpecialGVs) { - // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( - M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); - } - // Partition variables accessed indirectly into the different strategies DenseSet<GlobalVariable *> ModuleScopeVariables; DenseSet<GlobalVariable *> TableLookupVariables; @@ -1157,7 +1029,7 @@ public: const DataLayout &DL = M.getDataLayout(); for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; // All three of these are optional. 
The first variable is allocated at @@ -1187,14 +1059,14 @@ public: if (AllocateModuleScopeStruct) { // Allocated at zero, recorded once on construction, not once per // kernel - Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType()); + Offset += MaybeModuleScopeStruct->getGlobalSize(DL); } if (AllocateKernelScopeStruct) { GlobalVariable *KernelStruct = Replacement->second.SGV; Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct)); recordLDSAbsoluteAddress(&M, KernelStruct, Offset); - Offset += DL.getTypeAllocSize(KernelStruct->getValueType()); + Offset += KernelStruct->getGlobalSize(DL); } // If there is dynamic allocation, the alignment needed is included in @@ -1264,7 +1136,7 @@ private: } Align Alignment = AMDGPU::getAlign(DL, &GV); - TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); + uint64_t GVSize = GV.getGlobalSize(DL); if (GVSize > 8) { // We might want to use a b96 or b128 load/store @@ -1310,8 +1182,7 @@ private: LDSVarsToTransform.begin(), LDSVarsToTransform.end())); for (GlobalVariable *GV : Sorted) { - OptimizedStructLayoutField F(GV, - DL.getTypeAllocSize(GV->getValueType()), + OptimizedStructLayoutField F(GV, GV->getGlobalSize(DL), AMDGPU::getAlign(DL, GV)); LayoutFields.emplace_back(F); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 1e6589e..f4872ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -43,9 +43,10 @@ #include "AMDGPULowerVGPREncoding.h" #include "AMDGPU.h" #include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "SIInstrInfo.h" -#include "llvm/ADT/PackedVector.h" +#include "llvm/ADT/bit.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; @@ -57,21 +58,44 @@ class AMDGPULowerVGPREncoding { static constexpr unsigned OpNum = 4; static constexpr unsigned BitsPerField = 2; static constexpr unsigned 
NumFields = 4; - static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; - using ModeType = PackedVector<unsigned, BitsPerField, - std::bitset<BitsPerField * NumFields>>; + static constexpr unsigned ModeWidth = NumFields * BitsPerField; + static constexpr unsigned ModeMask = (1 << ModeWidth) - 1; + static constexpr unsigned VGPRMSBShift = + llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB); - class ModeTy : public ModeType { - public: - // bitset constructor will set all bits to zero - ModeTy() : ModeType(0) {} + struct OpMode { + // No MSBs set means they are not required to be of a particular value. + std::optional<unsigned> MSBits; + + bool update(const OpMode &New, bool &Rewritten) { + bool Updated = false; + if (New.MSBits) { + if (*New.MSBits != MSBits.value_or(0)) { + Updated = true; + Rewritten |= MSBits.has_value(); + } + MSBits = New.MSBits; + } + return Updated; + } + }; + + struct ModeTy { + OpMode Ops[OpNum]; - operator int64_t() const { return raw_bits().to_ulong(); } + bool update(const ModeTy &New, bool &Rewritten) { + bool Updated = false; + for (unsigned I : seq(OpNum)) + Updated |= Ops[I].update(New.Ops[I], Rewritten); + return Updated; + } - static ModeTy fullMask() { - ModeTy M; - M.raw_bits().flip(); - return M; + unsigned encode() const { + // Layout: [src0 msb, src1 msb, src2 msb, dst msb]. + unsigned V = 0; + for (const auto &[I, Op] : enumerate(Ops)) + V |= Op.MSBits.value_or(0) << (I * 2); + return V; } }; @@ -82,19 +106,15 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; + // Current basic block. + MachineBasicBlock *MBB; + /// Most recent s_set_* instruction. MachineInstr *MostRecentModeSet; - /// Whether the current mode is known. - bool CurrentModeKnown; - /// Current mode bits. ModeTy CurrentMode; - /// Current mask of mode bits that instructions since MostRecentModeSet care - /// about. - ModeTy CurrentMask; - /// Number of current hard clause instructions. 
unsigned ClauseLen; @@ -108,10 +128,15 @@ private: MachineInstr *Clause; /// Insert mode change before \p I. \returns true if mode was changed. - bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I); /// Reset mode to default. - void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + void resetMode(MachineBasicBlock::instr_iterator I) { + ModeTy Mode; + for (OpMode &Op : Mode.Ops) + Op.MSBits = 0; + setMode(Mode, I); + } /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. std::optional<unsigned> getMSBs(const MachineOperand &MO) const; @@ -119,49 +144,69 @@ private: /// Handle single \p MI. \return true if changed. bool runOnMachineInstr(MachineInstr &MI); - /// Compute the mode and mode mask for a single \p MI given \p Ops operands + /// Compute the mode for a single \p MI given \p Ops operands /// bit mapping. Optionally takes second array \p Ops2 for VOPD. /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2 /// is checked. - void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI, + void computeMode(ModeTy &NewMode, MachineInstr &MI, const AMDGPU::OpName Ops[OpNum], const AMDGPU::OpName *Ops2 = nullptr); /// Check if an instruction \p I is within a clause and returns a suitable /// iterator to insert mode change. It may also modify the S_CLAUSE /// instruction to extend it or drop the clause if it cannot be adjusted. - MachineInstr *handleClause(MachineInstr *I); + MachineBasicBlock::instr_iterator + handleClause(MachineBasicBlock::instr_iterator I); + + /// Check if an instruction \p I is immediately after another program state + /// instruction which it cannot coissue with. If so, insert before that + /// instruction to encourage more coissuing. + MachineBasicBlock::instr_iterator + handleCoissue(MachineBasicBlock::instr_iterator I); + + /// Handle S_SETREG_IMM32_B32 targeting MODE register. 
On certain hardware, + /// this instruction clobbers VGPR MSB bits[12:19], so we need to restore + /// the current mode. \returns true if the instruction was modified or a + /// new one was inserted. + bool handleSetregMode(MachineInstr &MI); + + /// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain + /// the VGPR MSB mode value. \returns true if the immediate was changed. + bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue); }; -bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, - MachineInstr *I) { - assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); +bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, + MachineBasicBlock::instr_iterator I) { + // Record previous mode into high 8 bits of the immediate. + int64_t OldModeBits = CurrentMode.encode() << ModeWidth; - if (CurrentModeKnown) { - auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + bool Rewritten = false; + if (!CurrentMode.update(NewMode, Rewritten)) + return false; - if ((Delta & Mask.raw_bits()).none()) { - CurrentMask |= Mask; - return false; + if (MostRecentModeSet && !Rewritten) { + // Update MostRecentModeSet with the new mode. It can be either + // S_SET_VGPR_MSB or S_SETREG_IMM32_B32 (with Size <= 12). + if (MostRecentModeSet->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { + MachineOperand &Op = MostRecentModeSet->getOperand(0); + // Carry old mode bits from the existing instruction. 
+ int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth); + Op.setImm(CurrentMode.encode() | OldModeBits); + } else { + assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 && + "unexpected MostRecentModeSet opcode"); + updateSetregModeImm(*MostRecentModeSet, CurrentMode.encode()); } - if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { - CurrentMode |= NewMode; - CurrentMask |= Mask; - - MostRecentModeSet->getOperand(0).setImm(CurrentMode); - return true; - } + return true; } I = handleClause(I); - MostRecentModeSet = - BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) - .addImm(NewMode); + I = handleCoissue(I); + MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode.encode() | OldModeBits); CurrentMode = NewMode; - CurrentMask = Mask; - CurrentModeKnown = true; return true; } @@ -179,12 +224,10 @@ AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const { return Idx >> 8; } -void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask, - MachineInstr &MI, +void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, MachineInstr &MI, const AMDGPU::OpName Ops[OpNum], const AMDGPU::OpName *Ops2) { NewMode = {}; - Mask = {}; for (unsigned I = 0; I < OpNum; ++I) { MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]); @@ -223,31 +266,31 @@ void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask, TII->hasVALU32BitEncoding(MI.getOpcode())))) continue; - NewMode[I] = MSBits.value(); - Mask[I] = FieldMask; + NewMode.Ops[I].MSBits = MSBits.value(); } } bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc()); if (Ops.first) { - ModeTy NewMode, Mask; - computeMode(NewMode, Mask, MI, Ops.first, Ops.second); - return setMode(NewMode, Mask, &MI); + ModeTy NewMode; + computeMode(NewMode, MI, Ops.first, Ops.second); + return setMode(NewMode, MI.getIterator()); } 
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); return false; } -MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) { if (!ClauseRemaining) return I; // A clause cannot start with a special instruction, place it right before // the clause. if (ClauseRemaining == ClauseLen) { - I = Clause->getPrevNode(); + I = Clause->getPrevNode()->getIterator(); assert(I->isBundle()); return I; } @@ -272,6 +315,106 @@ MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { return I; } +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) { + if (I.isEnd()) + return I; + + // "Program State instructions" are instructions which are used to control + // operation of the GPU rather than performing arithmetic. Such instructions + // have different coissuing rules w.r.t s_set_vgpr_msb. + auto isProgramStateInstr = [this](MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + return TII->isBarrier(Opc) || TII->isWaitcnt(Opc) || + Opc == AMDGPU::S_DELAY_ALU; + }; + + while (!I.isEnd() && I != I->getParent()->begin()) { + auto Prev = std::prev(I); + if (!isProgramStateInstr(&*Prev)) + return I; + I = Prev; + } + + return I; +} + +/// Convert mode value from S_SET_VGPR_MSB format to MODE register format. +/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7]) +/// MODE register uses: (dst[0-1], src0[2-3], src1[4-5], src2[6-7]) +/// This is a left rotation by 2 bits on an 8-bit value. 
+static int64_t convertModeToSetregFormat(int64_t Mode) { + assert(isUInt<8>(Mode) && "Mode expected to be 8-bit"); + return llvm::rotl<uint8_t>(static_cast<uint8_t>(Mode), /*R=*/2); +} + +bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI, + int64_t ModeValue) { + assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32); + + // Convert from S_SET_VGPR_MSB format to MODE register format + int64_t SetregMode = convertModeToSetregFormat(ModeValue); + + MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm); + int64_t OldImm = ImmOp->getImm(); + int64_t NewImm = + (OldImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | (SetregMode << VGPRMSBShift); + ImmOp->setImm(NewImm); + return NewImm != OldImm; +} + +bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) { + using namespace AMDGPU::Hwreg; + + assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 && + "only S_SETREG_IMM32_B32 needs to be handled"); + + MachineOperand *SIMM16Op = TII->getNamedOperand(MI, AMDGPU::OpName::simm16); + assert(SIMM16Op && "SIMM16Op must be present"); + + auto [HwRegId, Offset, Size] = HwregEncoding::decode(SIMM16Op->getImm()); + (void)Offset; + if (HwRegId != ID_MODE) + return false; + + int64_t ModeValue = CurrentMode.encode(); + + // Case 1: Size <= 12 - the original instruction uses imm32[0:Size-1], so + // imm32[12:19] is unused. Safe to set imm32[12:19] to the correct VGPR + // MSBs. + if (Size <= VGPRMSBShift) { + // This instruction now acts as MostRecentModeSet so it can be updated if + // CurrentMode changes via piggybacking. + MostRecentModeSet = &MI; + return updateSetregModeImm(MI, ModeValue); + } + + // Case 2: Size > 12 - the original instruction uses bits beyond 11, so we + // cannot arbitrarily modify imm32[12:19]. Check if it already matches VGPR + // MSBs. Note: imm32[12:19] is in MODE register format, while ModeValue is + // in S_SET_VGPR_MSB format, so we need to convert before comparing. 
+ MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm); + assert(ImmOp && "ImmOp must be present"); + int64_t ImmBits12To19 = (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift; + int64_t SetregModeValue = convertModeToSetregFormat(ModeValue); + if (ImmBits12To19 == SetregModeValue) { + // Already correct, but we must invalidate MostRecentModeSet because this + // instruction will overwrite mode[12:19]. We can't update this instruction + // via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes, + // a new s_set_vgpr_msb will be inserted after this instruction. + MostRecentModeSet = nullptr; + return false; + } + + // imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after + // the original instruction to restore the correct value. + MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator()); + MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(), + TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(ModeValue); + return true; +} + bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (!ST.has1024AddressableVGPRs()) @@ -282,11 +425,10 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { bool Changed = false; ClauseLen = ClauseRemaining = 0; - CurrentMode.reset(); - CurrentMask.reset(); - CurrentModeKnown = true; + CurrentMode = {}; for (auto &MBB : MF) { MostRecentModeSet = nullptr; + this->MBB = &MBB; for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { if (MI.isMetaInstruction()) @@ -294,17 +436,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { if (MI.isTerminator() || MI.isCall()) { if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { - CurrentMode.reset(); - CurrentModeKnown = true; - } else - resetMode(&MI); + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) + CurrentMode = {}; + else + resetMode(MI.getIterator()); continue; } if (MI.isInlineAsm()) { if (TII->hasVGPRUses(MI)) - 
resetMode(&MI); + resetMode(MI.getIterator()); continue; } @@ -317,20 +458,20 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { continue; } + if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 && + ST.hasSetregVGPRMSBFixup()) { + Changed |= handleSetregMode(MI); + continue; + } + Changed |= runOnMachineInstr(MI); if (ClauseRemaining) --ClauseRemaining; } - // If we're falling through to a block that has at least one other - // predecessor, we no longer know the mode. - MachineBasicBlock *Next = MBB.getNextNode(); - if (Next && Next->pred_size() >= 2 && - llvm::is_contained(Next->predecessors(), &MBB)) { - if (CurrentMode.raw_bits().any()) - CurrentModeKnown = false; - } + // Reset the mode if we are falling through. + resetMode(MBB.instr_end()); } return Changed; @@ -367,7 +508,5 @@ AMDGPULowerVGPREncodingPass::run(MachineFunction &MF, if (!AMDGPULowerVGPREncoding().run(MF)) return PreservedAnalyses::all(); - PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); - return PA; + return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 680e7eb..fc408aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -229,7 +229,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(Src); return; } else if (Opcode == AMDGPU::SI_TCRETURN || - Opcode == AMDGPU::SI_TCRETURN_GFX) { + Opcode == AMDGPU::SI_TCRETURN_GFX || + Opcode == AMDGPU::SI_TCRETURN_CHAIN) { // TODO: How to use branch immediate and avoid register+add? 
Opcode = AMDGPU::S_SETPC_B64; } else if (AMDGPU::getT16D16Helper(Opcode)) { @@ -243,7 +244,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { int MCOpcode = TII->pseudoToMCOpcode(Opcode); if (MCOpcode == -1) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " "a target-specific version: " + Twine(MI->getOpcode())); } @@ -332,7 +333,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { StringRef Err; if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("Illegal instruction detected: " + Err); MI->print(errs()); } @@ -346,7 +347,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { } } else { // We don't want these pseudo instructions encoded. They are - // placeholder terminator instructions and should only be printed as + // placeholder instructions and should only be printed as // comments. 
if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { if (isVerbose()) @@ -360,6 +361,20 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::ASYNCMARK) { + if (isVerbose()) + OutStreamer->emitRawComment(" asyncmark"); + return; + } + + if (MI->getOpcode() == AMDGPU::WAIT_ASYNCMARK) { + if (isVerbose()) { + OutStreamer->emitRawComment(" wait_asyncmark(" + + Twine(MI->getOperand(0).getImm()) + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) { if (isVerbose()) { std::string HexString; @@ -405,6 +420,16 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + unsigned Opc = MI->getOpcode(); + if (LLVM_UNLIKELY(Opc == TargetOpcode::STATEPOINT || + Opc == TargetOpcode::STACKMAP || + Opc == TargetOpcode::PATCHPOINT)) { + LLVMContext &Ctx = MI->getMF()->getFunction().getContext(); + Ctx.emitError("unhandled statepoint-like instruction"); + OutStreamer->emitRawComment("unsupported statepoint/stackmap/patchpoint"); + return; + } + if (isVerbose()) if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode())) emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(), @@ -412,7 +437,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { *OutStreamer); if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { - unsigned V = MI->getOperand(0).getImm(); + unsigned V = MI->getOperand(0).getImm() & 0xff; OutStreamer->AddComment( " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp index 75e3d8c..a541a26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp @@ -13,13 +13,61 @@ #include "AMDGPUMIRFormatter.h" #include "SIMachineFunctionInfo.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; +const char 
SWaitAluImmPrefix = '.'; +StringLiteral SWaitAluDelim = "_"; + +StringLiteral VaVdstName = "VaVdst"; +StringLiteral VaSdstName = "VaSdst"; +StringLiteral VaSsrcName = "VaSsrc"; +StringLiteral HoldCntName = "HoldCnt"; +StringLiteral VmVsrcName = "VmVsrc"; +StringLiteral VaVccName = "VaVcc"; +StringLiteral SaSdstName = "SaSdst"; + +StringLiteral AllOff = "AllOff"; + +void AMDGPUMIRFormatter::printSWaitAluImm(uint64_t Imm, raw_ostream &OS) const { + bool NonePrinted = true; + ListSeparator Delim(SWaitAluDelim); + auto PrintFieldIfNotMax = [&](StringRef Descr, uint64_t Num, unsigned Max) { + if (Num != Max) { + OS << Delim << Descr << SWaitAluDelim << Num; + NonePrinted = false; + } + }; + OS << SWaitAluImmPrefix; + PrintFieldIfNotMax(VaVdstName, AMDGPU::DepCtr::decodeFieldVaVdst(Imm), + AMDGPU::DepCtr::getVaVdstBitMask()); + PrintFieldIfNotMax(VaSdstName, AMDGPU::DepCtr::decodeFieldVaSdst(Imm), + AMDGPU::DepCtr::getVaSdstBitMask()); + PrintFieldIfNotMax(VaSsrcName, AMDGPU::DepCtr::decodeFieldVaSsrc(Imm), + AMDGPU::DepCtr::getVaSsrcBitMask()); + PrintFieldIfNotMax( + HoldCntName, + AMDGPU::DepCtr::decodeFieldHoldCnt(Imm, + AMDGPU::getIsaVersion(STI.getCPU())), + AMDGPU::DepCtr::getHoldCntBitMask(AMDGPU::getIsaVersion(STI.getCPU()))); + PrintFieldIfNotMax(VmVsrcName, AMDGPU::DepCtr::decodeFieldVmVsrc(Imm), + AMDGPU::DepCtr::getVmVsrcBitMask()); + PrintFieldIfNotMax(VaVccName, AMDGPU::DepCtr::decodeFieldVaVcc(Imm), + AMDGPU::DepCtr::getVaVccBitMask()); + PrintFieldIfNotMax(SaSdstName, AMDGPU::DepCtr::decodeFieldSaSdst(Imm), + AMDGPU::DepCtr::getSaSdstBitMask()); + if (NonePrinted) + OS << AllOff; +} + void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI, std::optional<unsigned int> OpIdx, int64_t Imm) const { switch (MI.getOpcode()) { + case AMDGPU::S_WAITCNT_DEPCTR: + printSWaitAluImm(Imm, OS); + break; case AMDGPU::S_DELAY_ALU: assert(OpIdx == 0); printSDelayAluImm(Imm, OS); @@ -39,6 +87,8 @@ bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned 
OpCode, { switch (OpCode) { + case AMDGPU::S_WAITCNT_DEPCTR: + return parseSWaitAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback); case AMDGPU::S_DELAY_ALU: return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback); default: @@ -90,6 +140,89 @@ void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm, Outdep(Id1); } +bool AMDGPUMIRFormatter::parseSWaitAluImmMnemonic( + const unsigned int OpIdx, int64_t &Imm, StringRef &Src, + MIRFormatter::ErrorCallbackType &ErrorCallback) const { + // TODO: For now accept integer masks for compatibility with old MIR. + if (!Src.consumeInteger(10, Imm)) + return false; + + // Initialize with all checks off. + Imm = AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI); + // The input is in the form: .Name1_Num1_Name2_Num2 + // Drop the '.' prefix. + bool ConsumePrefix = Src.consume_front(SWaitAluImmPrefix); + if (!ConsumePrefix) + return ErrorCallback(Src.begin(), "expected prefix"); + if (Src.empty()) + return ErrorCallback(Src.begin(), "expected <CounterName>_<CounterNum>"); + + // Special case for all off. + if (Src == AllOff) + return false; + + // Parse a counter name, number pair in each iteration. + while (!Src.empty()) { + // Src: Name1_Num1_Name2_Num2 + // ^ + size_t DelimIdx = Src.find(SWaitAluDelim); + if (DelimIdx == StringRef::npos) + return ErrorCallback(Src.begin(), "expected <CounterName>_<CounterNum>"); + // Src: Name1_Num1_Name2_Num2 + // ^^^^^ + StringRef Name = Src.substr(0, DelimIdx); + // Save the position of the name for accurate error reporting. + StringRef::iterator NamePos = Src.begin(); + [[maybe_unused]] bool ConsumeName = Src.consume_front(Name); + assert(ConsumeName && "Expected name"); + [[maybe_unused]] bool ConsumeDelim = Src.consume_front(SWaitAluDelim); + assert(ConsumeDelim && "Expected delimiter"); + // Src: Num1_Name2_Num2 + // ^ + DelimIdx = Src.find(SWaitAluDelim); + // Src: Num1_Name2_Num2 + // ^^^^ + int64_t Num; + // Save the position of the number for accurate error reporting. 
+ StringRef::iterator NumPos = Src.begin(); + if (Src.consumeInteger(10, Num) || Num < 0) + return ErrorCallback(NumPos, + "expected non-negative integer counter number"); + unsigned Max; + if (Name == VaVdstName) { + Max = AMDGPU::DepCtr::getVaVdstBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVaVdst(Imm, Num); + } else if (Name == VmVsrcName) { + Max = AMDGPU::DepCtr::getVmVsrcBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVmVsrc(Imm, Num); + } else if (Name == VaSdstName) { + Max = AMDGPU::DepCtr::getVaSdstBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVaSdst(Imm, Num); + } else if (Name == VaSsrcName) { + Max = AMDGPU::DepCtr::getVaSsrcBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVaSsrc(Imm, Num); + } else if (Name == HoldCntName) { + const AMDGPU::IsaVersion &Version = AMDGPU::getIsaVersion(STI.getCPU()); + Max = AMDGPU::DepCtr::getHoldCntBitMask(Version); + Imm = AMDGPU::DepCtr::encodeFieldHoldCnt(Imm, Num, Version); + } else if (Name == VaVccName) { + Max = AMDGPU::DepCtr::getVaVccBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVaVcc(Imm, Num); + } else if (Name == SaSdstName) { + Max = AMDGPU::DepCtr::getSaSdstBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldSaSdst(Imm, Num); + } else { + return ErrorCallback(NamePos, "invalid counter name"); + } + // Don't allow the values to reach their maximum value. 
+ if (Num >= Max) + return ErrorCallback(NumPos, "counter value too large"); + // Src: Name2_Num2 + Src.consume_front(SWaitAluDelim); + } + return false; +} + bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic( const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src, llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index c5c9473..dbfc645 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -16,6 +16,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H #define LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/MIRFormatter.h" namespace llvm { @@ -25,21 +26,20 @@ struct PerFunctionMIParsingState; class AMDGPUMIRFormatter final : public MIRFormatter { public: - AMDGPUMIRFormatter() = default; - virtual ~AMDGPUMIRFormatter() = default; + explicit AMDGPUMIRFormatter(const MCSubtargetInfo &STI) : STI(STI) {} + ~AMDGPUMIRFormatter() override = default; /// Implement target specific printing for machine operand immediate value, so /// that we can have more meaningful mnemonic than a 64-bit integer. Passing /// None to OpIdx means the index is unknown. - virtual void printImm(raw_ostream &OS, const MachineInstr &MI, - std::optional<unsigned> OpIdx, - int64_t Imm) const override; + void printImm(raw_ostream &OS, const MachineInstr &MI, + std::optional<unsigned> OpIdx, int64_t Imm) const override; /// Implement target specific parsing of immediate mnemonics. The mnemonic is /// a string with a leading dot. 
- virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, - StringRef Src, int64_t &Imm, - ErrorCallbackType ErrorCallback) const override; + bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const override; /// Implement target specific parsing of target custom pseudo source value. bool @@ -49,9 +49,17 @@ public: ErrorCallbackType ErrorCallback) const override; private: + const MCSubtargetInfo &STI; + /// Prints the string to represent s_wait_alu immediate value. + void printSWaitAluImm(uint64_t Imm, raw_ostream &OS) const; /// Print the string to represent s_delay_alu immediate value void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const; + /// Parse the immediate pseudo literal for s_wait_alu + bool parseSWaitAluImmMnemonic( + const unsigned int OpIdx, int64_t &Imm, StringRef &Src, + MIRFormatter::ErrorCallbackType &ErrorCallback) const; + /// Parse the immediate pseudo literal for s_delay_alu bool parseSDelayAluImmMnemonic( const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 664a15c..1730757 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -80,11 +80,6 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); - // FIXME: Shouldn't be target specific - Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math"); - NoSignedZerosFPMath = - NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true"; - const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F); if (DynLdsGlobal || hasLDSKernelArgument(F)) UsesDynamicLDS = true; @@ -107,7 +102,7 @@ unsigned 
AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, if (!BarAddr) llvm_unreachable("named barrier should have an assigned address"); Entry.first->second = BarAddr.value(); - unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16; + unsigned BarCnt = GV.getGlobalSize(DL) / 16; recordNumNamedBarriers(BarAddr.value(), BarCnt); return BarAddr.value(); } @@ -135,8 +130,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, // section, and not within some other non-absolute-address object // allocated here, but the extra error detection is minimal and we would // have to pass the Function around or cache the attribute value. - uint32_t ObjectEnd = - ObjectStart + DL.getTypeAllocSize(GV.getValueType()); + uint32_t ObjectEnd = ObjectStart + GV.getGlobalSize(DL); if (ObjectEnd > StaticLDSSize) { report_fatal_error( "Absolute address LDS variable outside of static frame"); @@ -152,7 +146,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, /// during lowering. Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); - StaticLDSSize += DL.getTypeAllocSize(GV.getValueType()); + StaticLDSSize += GV.getGlobalSize(DL); // Align LDS size to trailing, e.g. 
for aligning dynamic shared memory LDSSize = alignTo(StaticLDSSize, Trailing); @@ -161,7 +155,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, "expected region address space"); Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment); - StaticGDSSize += DL.getTypeAllocSize(GV.getValueType()); + StaticGDSSize += GV.getGlobalSize(DL); // FIXME: Apply alignment of dynamic GDS GDSSize = StaticGDSSize; @@ -210,7 +204,7 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F, const GlobalVariable &GV) { const Module *M = F.getParent(); const DataLayout &DL = M->getDataLayout(); - assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); + assert(GV.getGlobalSize(DL) == 0); Align Alignment = DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index fc64e16..1317210 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -61,8 +61,6 @@ protected: // Functions with the amdgpu_cs_chain or amdgpu_cs_chain_preserve CC. bool IsChainFunction = false; - bool NoSignedZerosFPMath = false; - // Function may be memory bound. 
bool MemoryBound = false; @@ -107,10 +105,6 @@ public: return isEntryFunction() || isChainFunction(); } - bool hasNoSignedZerosFPMath() const { - return NoSignedZerosFPMath; - } - bool isMemoryBound() const { return MemoryBound; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index e17c211..9fbb19d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -70,7 +70,7 @@ bool isDynamicLDS(const GlobalVariable &GV) { const DataLayout &DL = M->getDataLayout(); if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) return false; - return DL.getTypeAllocSize(GV.getValueType()) == 0; + return GV.getGlobalSize(DL) == 0; } bool isLDSVariableToLower(const GlobalVariable &GV) { @@ -126,7 +126,7 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, for (User *V : GV.users()) { if (auto *I = dyn_cast<Instruction>(V)) { Function *F = I->getFunction(); - if (isKernelLDS(F)) + if (isKernel(*F)) kernels[F].insert(&GV); else Functions[F].insert(&GV); @@ -135,10 +135,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, } } -bool isKernelLDS(const Function *F) { - return AMDGPU::isKernel(F->getCallingConv()); -} - LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { FunctionVariableMap DirectMapKernel; @@ -148,7 +144,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // Collect functions whose address has escaped DenseSet<Function *> AddressTakenFuncs; for (Function &F : M.functions()) { - if (!isKernelLDS(&F)) + if (!isKernel(F)) if (F.hasAddressTaken(nullptr, /* IgnoreCallbackUses */ false, /* IgnoreAssumeLikeCalls */ false, @@ -180,7 +176,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // access all variables accessed by functions whose address escaped for (Function &F : M.functions()) { if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) { - if 
(!isKernelLDS(&F)) { + if (!isKernel(F)) { set_union(TransitiveMapFunction[&F], VariablesReachableThroughFunctionPointer); } @@ -190,7 +186,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // Direct implementation of collecting all variables reachable from each // function for (Function &Func : M.functions()) { - if (Func.isDeclaration() || isKernelLDS(&Func)) + if (Func.isDeclaration() || isKernel(Func)) continue; DenseSet<Function *> seen; // catches cycles @@ -227,7 +223,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { FunctionVariableMap IndirectMapKernel; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; for (const CallGraphNode::CallRecord &R : *CG[&Func]) { @@ -273,6 +269,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // this is a re-run of the pass // so we don't have anything to do. // - No variables are absolute. + // Named-barriers which are absolute symbols are removed + // from the maps. 
std::optional<bool> HasAbsoluteGVs; bool HasSpecialGVs = false; for (auto &Map : {DirectMapKernel, IndirectMapKernel}) { @@ -284,6 +282,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { if (IsDirectMapDynLDSGV) continue; if (isNamedBarrier(*GV)) { + if (IsAbsolute) { + DirectMapKernel[Fn].erase(GV); + IndirectMapKernel[Fn].erase(GV); + } HasSpecialGVs = true; continue; } @@ -335,7 +337,7 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, Function *PotentialCallee = ExternalCallRecord.second->getFunction(); assert(PotentialCallee); - if (!isKernelLDS(PotentialCallee)) { + if (!isKernel(*PotentialCallee)) { for (StringRef Attr : FnAttrs) PotentialCallee->removeFnAttr(Attr); } @@ -369,6 +371,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_s_barrier_wait: case Intrinsic::amdgcn_s_barrier_leave: case Intrinsic::amdgcn_s_get_barrier_state: + case Intrinsic::amdgcn_s_wakeup_barrier: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: case Intrinsic::amdgcn_sched_group_barrier: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h index 058e744..8868b93 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h @@ -53,8 +53,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, FunctionVariableMap &kernels, FunctionVariableMap &functions); -bool isKernelLDS(const Function *F); - LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M); /// Strip FnAttr attribute from any functions where we may have diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a6074ea..f464fbf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -13,6 +13,12 @@ // NOTE: NO INCLUDE GUARD DESIRED! 
+#ifndef MODULE_ANALYSIS +#define MODULE_ANALYSIS(NAME, CREATE_PASS) +#endif +MODULE_ANALYSIS("amdgpu-argument-usage", AMDGPUArgumentUsageAnalysis()) +#undef MODULE_ANALYSIS + #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif @@ -29,8 +35,8 @@ MODULE_PASS("amdgpu-perf-hint", MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this)) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) +MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS @@ -69,6 +75,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this))) +FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index cf2ab825..a3be0f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -48,7 +48,7 @@ private: FuncInfoMap FIM; public: - AMDGPUPerfHintAnalysis() {} + AMDGPUPerfHintAnalysis() = default; // OldPM bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index e86b473..0264d88 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -367,10 +367,10 @@ bool 
AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
   return TLI->isCanonicalized(Reg, MF);
 }
 
-// The buffer_load_{i8, i16} intrinsics are intially lowered as buffer_load_{u8,
-// u16} instructions. Here, the buffer_load_{u8, u16} instructions are combined
-// with sign extension instrucions in order to generate buffer_load_{i8, i16}
-// instructions.
+// The buffer_load_{i8, i16} intrinsics are initially lowered as
+// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
+// instructions are combined with sign extension instructions in order to
+// generate buffer_load_{i8, i16} instructions.
 
 // Identify buffer_load_{u8, u16}.
 bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 6e54737..4a70c5d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -177,8 +177,7 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
   Register Src = MatchInfo.Origin;
 
-  assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
-         LLT::scalar(64));
+  assert(MI.getMF()->getRegInfo().getType(Src) == LLT::scalar(64));
 
   const LLT S32 = LLT::scalar(32);
   auto Unmerge = B.buildUnmerge(S32, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
index ffbbf63..7d6e3ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -127,7 +127,7 @@ private:
   // will also be preloaded even if that data is unused.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { FunctionType *FT = F.getFunctionType(); - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end()); for (unsigned I = 0; I <= LastPreloadIndex; ++I) FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); @@ -196,7 +196,7 @@ public: SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads; for (auto *U : ImplicitArgPtr->users()) { Instruction *CI = dyn_cast<Instruction>(U); - if (!CI || CI->getParent()->getParent() != &F) + if (!CI || CI->getFunction() != &F) continue; for (auto *U : CI->users()) { @@ -213,7 +213,7 @@ public: continue; // FIXME: Expand handle merged loads. - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); Type *LoadTy = Load->getType(); HiddenArg HA = getHiddenArgFromOffset(Offset); if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp index 0137b3f..a43600a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -46,10 +46,7 @@ class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass { public: static char ID; - AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) { - initializeAMDGPUPrepareAGPRAllocLegacyPass( - *PassRegistry::getPassRegistry()); - } + AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -62,10 +59,8 @@ public: }; } // End anonymous namespace. 
-INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
-                      "AMDGPU Prepare AGPR Alloc", false, false)
-INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
-                    "AMDGPU Prepare AGPR Alloc", false, false)
+INITIALIZE_PASS(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+                "AMDGPU Prepare AGPR Alloc", false, false)
 
 char AMDGPUPrepareAGPRAllocLegacy::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index f5e14c7..d3fa423 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -129,7 +129,7 @@ static StringRef getAsConstantStr(Value *V) {
 
 static void diagnoseInvalidFormatString(const CallBase *CI) {
   CI->getContext().diagnose(DiagnosticInfoUnsupported(
-      *CI->getParent()->getParent(),
+      *CI->getFunction(),
       "printf format string must be a trivially resolved constant string "
       "global variable",
       CI->getDebugLoc()));
@@ -416,9 +416,13 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
     }
   }
 
-  // erase the printf calls
-  for (auto *CI : Printfs)
+  // Erase the printf calls and replace all uses with 0, signaling success.
+  // Since OpenCL only specifies undefined behaviors and not success criteria,
+  // returning 0 to signal success is always valid.
+  for (auto *CI : Printfs) {
+    CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0));
     CI->eraseFromParent();
+  }
   Printfs.clear();
 
   return true;
@@ -434,6 +438,17 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
       M.getModuleFlag("openmp"))
     return false;
 
+  // Verify the signature of the printf function and skip if it isn't correct.
+ const FunctionType *PrintfFunctionTy = PrintfFunction->getFunctionType(); + if (PrintfFunctionTy->getNumParams() != 1 || !PrintfFunctionTy->isVarArg() || + !PrintfFunctionTy->getReturnType()->isIntegerTy(32)) + return false; + Type *PrintfFormatArgTy = PrintfFunctionTy->getParamType(0); + if (!PrintfFormatArgTy->isPointerTy() || + !AMDGPU::isFlatGlobalAddrSpace( + PrintfFormatArgTy->getPointerAddressSpace())) + return false; + for (auto &U : PrintfFunction->uses()) { if (auto *CI = dyn_cast<CallInst>(U.getUser())) { if (CI->isCallee(&U) && !CI->isNoBuiltin()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index ddabd25..ed676c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -85,6 +86,42 @@ static cl::opt<unsigned> "when sorting profitable allocas"), cl::init(4)); +// We support vector indices of the form (A * stride) + B +// All parts are optional. +struct GEPToVectorIndex { + Value *VarIndex = nullptr; // defaults to 0 + ConstantInt *VarMul = nullptr; // defaults to 1 + ConstantInt *ConstIndex = nullptr; // defaults to 0 + Value *Full = nullptr; +}; + +struct MemTransferInfo { + ConstantInt *SrcIndex = nullptr; + ConstantInt *DestIndex = nullptr; +}; + +// Analysis for planning the different strategies of alloca promotion. 
+struct AllocaAnalysis { + AllocaInst *Alloca = nullptr; + DenseSet<Value *> Pointers; + SmallVector<Use *> Uses; + unsigned Score = 0; + bool HaveSelectOrPHI = false; + struct { + FixedVectorType *Ty = nullptr; + SmallVector<Instruction *> Worklist; + SmallVector<Instruction *> UsersToRemove; + MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx; + MapVector<MemTransferInst *, MemTransferInfo> TransferInfo; + } Vector; + struct { + bool Enable = false; + SmallVector<User *> Worklist; + } LDS; + + explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {} +}; + // Shared implementation which can do both promotion to vector and to LDS. class AMDGPUPromoteAllocaImpl { private: @@ -106,10 +143,7 @@ private: std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); - /// BaseAlloca is the alloca root the search started from. - /// Val may be that alloca or a recursive user of it. - bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val, - std::vector<Value *> &WorkList) const; + bool collectAllocaUses(AllocaAnalysis &AA) const; /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). @@ -122,10 +156,16 @@ private: /// Check whether we have enough local memory for promotion. 
bool hasSufficientLocalMem(const Function &F); - bool tryPromoteAllocaToVector(AllocaInst &I); - bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS); + FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const; + void analyzePromoteToVector(AllocaAnalysis &AA) const; + void promoteAllocaToVector(AllocaAnalysis &AA); + void analyzePromoteToLDS(AllocaAnalysis &AA) const; + bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS, + SetVector<IntrinsicInst *> &DeferredIntrs); + void + finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs); - void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas); + void scoreAlloca(AllocaAnalysis &AA) const; void setFunctionLimits(const Function &F); @@ -236,53 +276,77 @@ FunctionPass *llvm::createAMDGPUPromoteAlloca() { return new AMDGPUPromoteAlloca(); } -static void collectAllocaUses(AllocaInst &Alloca, - SmallVectorImpl<Use *> &Uses) { - SmallVector<Instruction *, 4> WorkList({&Alloca}); +bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const { + const auto RejectUser = [&](Instruction *Inst, Twine Msg) { + LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n" + << " " << *Inst << "\n"); + return false; + }; + + SmallVector<Instruction *, 4> WorkList({AA.Alloca}); while (!WorkList.empty()) { auto *Cur = WorkList.pop_back_val(); + if (find(AA.Pointers, Cur) != AA.Pointers.end()) + continue; + AA.Pointers.insert(Cur); for (auto &U : Cur->uses()) { - Uses.push_back(&U); + auto *Inst = cast<Instruction>(U.getUser()); + if (isa<StoreInst>(Inst)) { + if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) { + return RejectUser(Inst, "pointer escapes via store"); + } + } + AA.Uses.push_back(&U); + + if (isa<GetElementPtrInst>(U.getUser())) { + WorkList.push_back(Inst); + } else if (auto *SI = dyn_cast<SelectInst>(Inst)) { + // Only promote a select if we know that the other select operand is + // from another pointer that will also be promoted. 
+ if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2)) + return RejectUser(Inst, "select from mixed objects"); + WorkList.push_back(Inst); + AA.HaveSelectOrPHI = true; + } else if (auto *Phi = dyn_cast<PHINode>(Inst)) { + // Repeat for phis. + + // TODO: Handle more complex cases. We should be able to replace loops + // over arrays. + switch (Phi->getNumIncomingValues()) { + case 1: + break; + case 2: + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1)) + return RejectUser(Inst, "phi from mixed objects"); + break; + default: + return RejectUser(Inst, "phi with too many operands"); + } - if (isa<GetElementPtrInst>(U.getUser())) - WorkList.push_back(cast<Instruction>(U.getUser())); + WorkList.push_back(Inst); + AA.HaveSelectOrPHI = true; + } } } + return true; } -void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( - SmallVectorImpl<AllocaInst *> &Allocas) { - DenseMap<AllocaInst *, unsigned> Scores; - - for (auto *Alloca : Allocas) { - LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n"); - unsigned &Score = Scores[Alloca]; - // Increment score by one for each user + a bonus for users within loops. - SmallVector<Use *, 8> Uses; - collectAllocaUses(*Alloca, Uses); - for (auto *U : Uses) { - Instruction *Inst = cast<Instruction>(U->getUser()); - if (isa<GetElementPtrInst>(Inst)) - continue; - unsigned UserScore = - 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); - LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); - Score += UserScore; - } - LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); +void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const { + LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n"); + unsigned Score = 0; + // Increment score by one for each user + a bonus for users within loops. 
+ for (auto *U : AA.Uses) { + Instruction *Inst = cast<Instruction>(U->getUser()); + if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) || + isa<PHINode>(Inst)) + continue; + unsigned UserScore = + 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); + LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); + Score += UserScore; } - - stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) { - return Scores.at(A) > Scores.at(B); - }); - - // clang-format off - LLVM_DEBUG( - dbgs() << "Sorted Worklist:\n"; - for (auto *A: Allocas) - dbgs() << " " << *A << "\n"; - ); - // clang-format on + LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); + AA.Score = Score; } void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { @@ -307,7 +371,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { DL = &Mod->getDataLayout(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); - if (!ST.isPromoteAllocaEnabled()) + if (!ST.enablePromoteAlloca()) return false; bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F); @@ -319,27 +383,49 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { : (MaxVGPRs * 32)) / VGPRBudgetRatio; - SmallVector<AllocaInst *, 16> Allocas; + std::vector<AllocaAnalysis> Allocas; for (Instruction &I : F.getEntryBlock()) { if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { // Array allocations are probably not worth handling, since an allocation // of the array type is the canonical form. 
if (!AI->isStaticAlloca() || AI->isArrayAllocation()) continue; - Allocas.push_back(AI); + + LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n'); + + AllocaAnalysis AA{AI}; + if (collectAllocaUses(AA)) { + analyzePromoteToVector(AA); + if (PromoteToLDS) + analyzePromoteToLDS(AA); + if (AA.Vector.Ty || AA.LDS.Enable) { + scoreAlloca(AA); + Allocas.push_back(std::move(AA)); + } + } } } - sortAllocasToPromote(Allocas); + stable_sort(Allocas, + [](const auto &A, const auto &B) { return A.Score > B.Score; }); + + // clang-format off + LLVM_DEBUG( + dbgs() << "Sorted Worklist:\n"; + for (const auto &AA : Allocas) + dbgs() << " " << *AA.Alloca << "\n"; + ); + // clang-format on bool Changed = false; - for (AllocaInst *AI : Allocas) { - const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType()); - // First, check if we have enough budget to vectorize this alloca. - if (AllocaCost <= VectorizationBudget) { - // If we do, attempt vectorization, otherwise, fall through and try - // promoting to LDS instead. - if (tryPromoteAllocaToVector(*AI)) { + SetVector<IntrinsicInst *> DeferredIntrs; + for (AllocaAnalysis &AA : Allocas) { + if (AA.Vector.Ty) { + const unsigned AllocaCost = + DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()); + // First, check if we have enough budget to vectorize this alloca. 
+ if (AllocaCost <= VectorizationBudget) { + promoteAllocaToVector(AA); Changed = true; assert((VectorizationBudget - AllocaCost) < VectorizationBudget && "Underflow!"); @@ -347,16 +433,18 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" << VectorizationBudget << "\n"); continue; + } else { + LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" + << AllocaCost << ", budget:" << VectorizationBudget + << "): " << *AA.Alloca << "\n"); } - } else { - LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" - << AllocaCost << ", budget:" << VectorizationBudget - << "): " << *AI << "\n"); } - if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS)) + if (AA.LDS.Enable && + tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs)) Changed = true; } + finishDeferredAllocaToLDSPromotion(DeferredIntrs); // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains // dangling pointers. If we want to reuse it past this point, the loop above @@ -365,11 +453,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { return Changed; } -struct MemTransferInfo { - ConstantInt *SrcIndex = nullptr; - ConstantInt *DestIndex = nullptr; -}; - // Checks if the instruction I is a memset user of the alloca AI that we can // deal with. Currently, only non-volatile memsets that affect the whole alloca // are handled. 
@@ -387,23 +470,48 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile(); } -static Value *calculateVectorIndex( - Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) { - auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()); - if (!GEP) - return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext())); +static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) { + IRBuilder<> B(Ptr->getContext()); + + Ptr = Ptr->stripPointerCasts(); + if (Ptr == AA.Alloca) + return B.getInt32(0); + + auto *GEP = cast<GetElementPtrInst>(Ptr); + auto I = AA.Vector.GEPVectorIdx.find(GEP); + assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!"); + + if (!I->second.Full) { + Value *Result = nullptr; + B.SetInsertPoint(GEP); + + if (I->second.VarIndex) { + Result = I->second.VarIndex; + Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty()); - auto I = GEPIdx.find(GEP); - assert(I != GEPIdx.end() && "Must have entry for GEP!"); + if (I->second.VarMul) + Result = B.CreateMul(Result, I->second.VarMul); + } + + if (I->second.ConstIndex) { + if (Result) + Result = B.CreateAdd(Result, I->second.ConstIndex); + else + Result = I->second.ConstIndex; + } + + if (!Result) + Result = B.getInt32(0); + + I->second.Full = Result; + } - Value *IndexValue = I->second; - assert(IndexValue && "index value missing from GEP index map"); - return IndexValue; + return I->second.Full; } -static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, - Type *VecElemTy, const DataLayout &DL, - SmallVector<Instruction *> &NewInsts) { +static std::optional<GEPToVectorIndex> +computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, + Type *VecElemTy, const DataLayout &DL) { // TODO: Extracting a "multiple of X" from a GEP might be a useful generic // helper. 
LLVMContext &Ctx = GEP->getContext(); @@ -431,7 +539,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, Value *CurPtr = GEP; while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) { if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) - return nullptr; + return {}; // Move to the next outer pointer. CurPtr = CurGEP->getPointerOperand(); @@ -441,126 +549,78 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy); if (VarOffsets.size() > 1) - return nullptr; + return {}; APInt IndexQuot; int64_t Rem; APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem); if (Rem != 0) - return nullptr; - if (VarOffsets.size() == 0) - return ConstantInt::get(Ctx, IndexQuot); + return {}; - IRBuilder<> Builder(GEP); + GEPToVectorIndex Result; + + if (!ConstOffset.isZero()) + Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW)); + + if (VarOffsets.empty()) + return Result; const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); if (Rem != 0 || OffsetQuot.isZero()) - return nullptr; + return {}; - Value *Offset = VarOffset.first; - auto *OffsetType = dyn_cast<IntegerType>(Offset->getType()); + Result.VarIndex = VarOffset.first; + auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType()); if (!OffsetType) - return nullptr; + return {}; - if (!OffsetQuot.isOne()) { - ConstantInt *ConstMul = - ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth())); - Offset = Builder.CreateMul(Offset, ConstMul); - if (Instruction *NewInst = dyn_cast<Instruction>(Offset)) - NewInsts.push_back(NewInst); - } - if (ConstOffset.isZero()) - return Offset; - - ConstantInt *ConstIndex = - ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth())); - Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex); - if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd)) - 
NewInsts.push_back(NewInst); - return IndexAdd; + if (!OffsetQuot.isOne()) + Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW)); + + return Result; } /// Promotes a single user of the alloca to a vector form. /// /// \param Inst Instruction to be promoted. /// \param DL Module Data Layout. -/// \param VectorTy Vectorized Type. +/// \param AA Alloca Analysis. /// \param VecStoreSize Size of \p VectorTy in bytes. /// \param ElementSize Size of \p VectorTy element type in bytes. -/// \param TransferInfo MemTransferInst info map. -/// \param GEPVectorIdx GEP -> VectorIdx cache. /// \param CurVal Current value of the vector (e.g. last stored value) /// \param[out] DeferredLoads \p Inst is added to this vector if it can't /// be promoted now. This happens when promoting requires \p /// CurVal, but \p CurVal is nullptr. /// \return the stored value if \p Inst would have written to the alloca, or /// nullptr otherwise. -static Value *promoteAllocaUserToVector( - Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, - unsigned VecStoreSize, unsigned ElementSize, - DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, - std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal, - SmallVectorImpl<LoadInst *> &DeferredLoads) { +static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, + AllocaAnalysis &AA, + unsigned VecStoreSize, + unsigned ElementSize, + function_ref<Value *()> GetCurVal) { // Note: we use InstSimplifyFolder because it can leverage the DataLayout // to do more folding, especially in the case of vector splats. IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), InstSimplifyFolder(DL)); Builder.SetInsertPoint(Inst); - const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { - if (CurVal) - return CurVal; - - // If the current value is not known, insert a dummy load and lower it on - // the second pass. 
- LoadInst *Dummy = - Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()), - "promotealloca.dummyload"); - DeferredLoads.push_back(Dummy); - return Dummy; - }; - - const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, - Type *PtrTy) -> Value * { - assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); - const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy); - if (!PtrTy->isVectorTy()) - return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size)); - const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements(); - // If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to - // first cast the ptr vector to <2 x i64>. - assert((Size % NumPtrElts == 0) && "Vector size not divisble"); - Type *EltTy = Builder.getIntNTy(Size / NumPtrElts); - return Builder.CreateBitOrPointerCast( - Val, FixedVectorType::get(EltTy, NumPtrElts)); - }; - - Type *VecEltTy = VectorTy->getElementType(); + Type *VecEltTy = AA.Vector.Ty->getElementType(); switch (Inst->getOpcode()) { case Instruction::Load: { - // Loads can only be lowered if the value is known. - if (!CurVal) { - DeferredLoads.push_back(cast<LoadInst>(Inst)); - return nullptr; - } - - Value *Index = calculateVectorIndex( - cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx); + Value *CurVal = GetCurVal(); + Value *Index = + calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA); // We're loading the full vector. 
Type *AccessTy = Inst->getType(); TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); if (Constant *CI = dyn_cast<Constant>(Index)) { if (CI->isZeroValue() && AccessSize == VecStoreSize) { - if (AccessTy->isPtrOrPtrVectorTy()) - CurVal = CreateTempPtrIntCast(CurVal, AccessTy); - else if (CurVal->getType()->isPtrOrPtrVectorTy()) - CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType()); - Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy); - Inst->replaceAllUsesWith(NewVal); + Inst->replaceAllUsesWith( + Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy)); return nullptr; } } @@ -572,6 +632,36 @@ static Value *promoteAllocaUserToVector( auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); + // If idx is dynamic, then sandwich load with bitcasts. + // ie. VectorTy SubVecTy AccessTy + // <64 x i8> -> <16 x i8> <8 x i16> + // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16> + // Extracting subvector with dynamic index has very large expansion in + // the amdgpu backend. Limit to pow2. 
+ FixedVectorType *VectorTy = AA.Vector.Ty; + TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u; + uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value(); + bool IsAlignedLoad = NumBits <= (LoadAlign * 8u); + unsigned TotalNumElts = VectorTy->getNumElements(); + bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0; + if (!isa<ConstantInt>(Index) && + llvm::isPowerOf2_32(SubVecTy->getNumElements()) && + IsProperlyDivisible && IsAlignedLoad) { + IntegerType *NewElemTy = Builder.getIntNTy(NumBits); + const unsigned NewNumElts = + DL.getTypeStoreSize(VectorTy) * 8u / NumBits; + const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements()); + FixedVectorType *BitCastTy = + FixedVectorType::get(NewElemTy, NewNumElts); + Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy); + Value *NewIdx = Builder.CreateLShr( + Index, ConstantInt::get(Index->getType(), LShrAmt)); + Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx); + Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy); + Inst->replaceAllUsesWith(BCOut); + return nullptr; + } + Value *SubVec = PoisonValue::get(SubVecTy); for (unsigned K = 0; K < NumLoadedElts; ++K) { Value *CurIdx = @@ -580,13 +670,8 @@ static Value *promoteAllocaUserToVector( SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K); } - if (AccessTy->isPtrOrPtrVectorTy()) - SubVec = CreateTempPtrIntCast(SubVec, AccessTy); - else if (SubVecTy->isPtrOrPtrVectorTy()) - SubVec = CreateTempPtrIntCast(SubVec, SubVecTy); - - SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy); - Inst->replaceAllUsesWith(SubVec); + Inst->replaceAllUsesWith( + Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy)); return nullptr; } @@ -604,39 +689,27 @@ static Value *promoteAllocaUserToVector( // to know the current value. If this is a store of a single element, we // need to know the value. 
StoreInst *SI = cast<StoreInst>(Inst); - Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx); + Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA); Value *Val = SI->getValueOperand(); // We're storing the full vector, we can handle this without knowing CurVal. Type *AccessTy = Val->getType(); TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); - if (Constant *CI = dyn_cast<Constant>(Index)) { - if (CI->isZeroValue() && AccessSize == VecStoreSize) { - if (AccessTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, AccessTy); - else if (VectorTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, VectorTy); - return Builder.CreateBitOrPointerCast(Val, VectorTy); - } - } + if (Constant *CI = dyn_cast<Constant>(Index)) + if (CI->isZeroValue() && AccessSize == VecStoreSize) + return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty); // Storing a subvector. if (isa<FixedVectorType>(AccessTy)) { assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); const unsigned NumWrittenElts = AccessSize / DL.getTypeStoreSize(VecEltTy); - const unsigned NumVecElts = VectorTy->getNumElements(); + const unsigned NumVecElts = AA.Vector.Ty->getNumElements(); auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); - if (SubVecTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, SubVecTy); - else if (AccessTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, AccessTy); - - Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); - - Value *CurVec = GetOrLoadCurrentVectorValue(); + Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy); + Value *CurVec = GetCurVal(); for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts); K < NumElts; ++K) { Value *CurIdx = @@ -649,22 +722,21 @@ static Value *promoteAllocaUserToVector( if (Val->getType() != VecEltTy) Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); - return 
Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, - Index); + return Builder.CreateInsertElement(GetCurVal(), Val, Index); } case Instruction::Call: { if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) { // For memcpy, we need to know curval. ConstantInt *Length = cast<ConstantInt>(MTI->getLength()); unsigned NumCopied = Length->getZExtValue() / ElementSize; - MemTransferInfo *TI = &TransferInfo[MTI]; + MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI]; unsigned SrcBegin = TI->SrcIndex->getZExtValue(); unsigned DestBegin = TI->DestIndex->getZExtValue(); SmallVector<int> Mask; - for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { + for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) { if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { - Mask.push_back(SrcBegin < VectorTy->getNumElements() + Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements() ? SrcBegin++ : PoisonMaskElem); } else { @@ -672,7 +744,7 @@ static Value *promoteAllocaUserToVector( } } - return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); + return Builder.CreateShuffleVector(GetCurVal(), Mask); } if (auto *MSI = dyn_cast<MemSetInst>(Inst)) { @@ -693,14 +765,14 @@ static Value *promoteAllocaUserToVector( Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } - return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); + return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt); } if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { Intr->replaceAllUsesWith( Builder.getIntN(Intr->getType()->getIntegerBitWidth(), - DL.getTypeAllocSize(VectorTy))); + DL.getTypeAllocSize(AA.Vector.Ty))); return nullptr; } } @@ -791,16 +863,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, return I; } -// FIXME: Should try to pick the most likely to be profitable allocas first. 
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { - LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); - +FixedVectorType * +AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const { if (DisablePromoteAllocaToVector) { - LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n"); - return false; + LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n"); + return nullptr; } - Type *AllocaTy = Alloca.getAllocatedType(); auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy); if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) { uint64_t NumElems = 1; @@ -832,10 +901,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } } } - if (!VectorTy) { LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); - return false; + return nullptr; } const unsigned MaxElements = @@ -845,46 +913,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { VectorTy->getNumElements() < 2) { LLVM_DEBUG(dbgs() << " " << *VectorTy << " has an unsupported number of elements\n"); - return false; + return nullptr; } - std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx; - SmallVector<Instruction *> WorkList; - SmallVector<Instruction *> UsersToRemove; - SmallVector<Instruction *> DeferredInsts; - SmallVector<Instruction *> NewGEPInsts; - DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo; - - const auto RejectUser = [&](Instruction *Inst, Twine Msg) { - LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" - << " " << *Inst << "\n"); - for (auto *Inst : reverse(NewGEPInsts)) - Inst->eraseFromParent(); - return false; - }; - - SmallVector<Use *, 8> Uses; - collectAllocaUses(Alloca, Uses); - - LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); - Type *VecEltTy = VectorTy->getElementType(); unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { 
LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " "does not match the type's size\n"); - return false; + return nullptr; } - unsigned ElementSize = ElementSizeInBits / 8; + + return VectorTy; +} + +void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const { + if (AA.HaveSelectOrPHI) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n"); + return; + } + + Type *AllocaTy = AA.Alloca->getAllocatedType(); + AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy); + if (!AA.Vector.Ty) + return; + + const auto RejectUser = [&](Instruction *Inst, Twine Msg) { + LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" + << " " << *Inst << "\n"); + AA.Vector.Ty = nullptr; + }; + + Type *VecEltTy = AA.Vector.Ty->getElementType(); + unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; assert(ElementSize > 0); - for (auto *U : Uses) { + for (auto *U : AA.Uses) { Instruction *Inst = cast<Instruction>(U->getUser()); if (Value *Ptr = getLoadStorePointerOperand(Inst)) { - // This is a store of the pointer, not to the pointer. - if (isa<StoreInst>(Inst) && - U->getOperandNo() != StoreInst::getPointerOperandIndex()) - return RejectUser(Inst, "pointer is being stored"); + assert(!isa<StoreInst>(Inst) || + U->getOperandNo() == StoreInst::getPointerOperandIndex()); Type *AccessTy = getLoadStoreType(Inst); if (AccessTy->isAggregateType()) @@ -900,34 +968,35 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Ptr = Ptr->stripPointerCasts(); // Alloca already accessed as vector. 
- if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == - DL->getTypeStoreSize(AccessTy)) { - WorkList.push_back(Inst); + if (Ptr == AA.Alloca && + DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) == + DL->getTypeStoreSize(AccessTy)) { + AA.Vector.Worklist.push_back(Inst); continue; } - if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) + if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL)) return RejectUser(Inst, "not a supported access type"); - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. - Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); + auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); - GEPVectorIdx[GEP] = Index; - UsersToRemove.push_back(Inst); + AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value()); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst); - MSI && isSupportedMemset(MSI, &Alloca, *DL)) { - WorkList.push_back(Inst); + MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) { + AA.Vector.Worklist.push_back(Inst); continue; } @@ -940,31 +1009,32 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "mem transfer inst length is non-constant or " "not a multiple of the vector element size"); - if (TransferInfo.try_emplace(TransferInst).second) { - DeferredInsts.push_back(Inst); - WorkList.push_back(Inst); - } + auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * { + if (Ptr == AA.Alloca) + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); - auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); - if (Ptr != &Alloca && 
!GEPVectorIdx.count(GEP)) + GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); + const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second; + if (GEPI.VarIndex) return nullptr; - - return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx)); + if (GEPI.ConstIndex) + return GEPI.ConstIndex; + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); }; + MemTransferInfo *TI = + &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second; unsigned OpNum = U->getOperandNo(); - MemTransferInfo *TI = &TransferInfo[TransferInst]; if (OpNum == 0) { Value *Dest = TransferInst->getDest(); - ConstantInt *Index = getPointerIndexOfAlloca(Dest); + ConstantInt *Index = getConstIndexIntoAlloca(Dest); if (!Index) return RejectUser(Inst, "could not calculate constant dest index"); TI->DestIndex = Index; } else { assert(OpNum == 1); Value *Src = TransferInst->getSource(); - ConstantInt *Index = getPointerIndexOfAlloca(Src); + ConstantInt *Index = getConstIndexIntoAlloca(Src); if (!Index) return RejectUser(Inst, "could not calculate constant src index"); TI->SrcIndex = Index; @@ -974,7 +1044,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } } @@ -983,97 +1053,114 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (isAssumeLikeIntrinsic(Inst)) { if (!Inst->use_empty()) return RejectUser(Inst, "assume-like intrinsic cannot have any users"); - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast<Instruction>(U)); })) { - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } return RejectUser(Inst, "unhandled alloca user"); } - while (!DeferredInsts.empty()) { - 
Instruction *Inst = DeferredInsts.pop_back_val(); - MemTransferInst *TransferInst = cast<MemTransferInst>(Inst); - // TODO: Support the case if the pointers are from different alloca or - // from different address spaces. - MemTransferInfo &Info = TransferInfo[TransferInst]; - if (!Info.SrcIndex || !Info.DestIndex) - return RejectUser( - Inst, "mem transfer inst is missing constant src and/or dst index"); + // Follow-up check to ensure we've seen both sides of all transfer insts. + for (const auto &Entry : AA.Vector.TransferInfo) { + const MemTransferInfo &TI = Entry.second; + if (!TI.SrcIndex || !TI.DestIndex) + return RejectUser(Entry.first, + "mem transfer inst between different objects"); + AA.Vector.Worklist.push_back(Entry.first); } +} - LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " - << *VectorTy << '\n'); - const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy); +void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) { + LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n'); + LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType() + << " -> " << *AA.Vector.Ty << '\n'); + const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty); + + Type *VecEltTy = AA.Vector.Ty->getElementType(); + const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; // Alloca is uninitialized memory. Imitate that by making the first value // undef. SSAUpdater Updater; - Updater.Initialize(VectorTy, "promotealloca"); + Updater.Initialize(AA.Vector.Ty, "promotealloca"); - BasicBlock *EntryBB = Alloca.getParent(); + BasicBlock *EntryBB = AA.Alloca->getParent(); BasicBlock::iterator InitInsertPos = - skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator()); - // Alloca memory is undefined to begin, not poison. 
- Value *AllocaInitValue = - new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos); - AllocaInitValue->takeName(&Alloca); + skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator()); + IRBuilder<> Builder(&*InitInsertPos); + Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty)); + AllocaInitValue->takeName(AA.Alloca); - Updater.AddAvailableValue(EntryBB, AllocaInitValue); + Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue); - // First handle the initial worklist. - SmallVector<LoadInst *, 4> DeferredLoads; - forEachWorkListItem(WorkList, [&](Instruction *I) { + // First handle the initial worklist, in basic block order. + // + // Insert a placeholder whenever we need the vector value at the top of a + // basic block. + SmallVector<Instruction *> Placeholders; + forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) { BasicBlock *BB = I->getParent(); - // On the first pass, we only take values that are trivially known, i.e. - // where AddAvailableValue was already called in this block. - Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.FindValueForBlock(BB), DeferredLoads); + auto GetCurVal = [&]() -> Value * { + if (Value *CurVal = Updater.FindValueForBlock(BB)) + return CurVal; + + if (!Placeholders.empty() && Placeholders.back()->getParent() == BB) + return Placeholders.back(); + + // If the current value in the basic block is not yet known, insert a + // placeholder that we will replace later. + IRBuilder<> Builder(I); + auto *Placeholder = cast<Instruction>(Builder.CreateFreeze( + PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder")); + Placeholders.push_back(Placeholder); + return Placeholders.back(); + }; + + Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize, + ElementSize, GetCurVal); if (Result) Updater.AddAvailableValue(BB, Result); }); - // Then handle deferred loads. 
- forEachWorkListItem(DeferredLoads, [&](Instruction *I) { - SmallVector<LoadInst *, 0> NewDLs; - BasicBlock *BB = I->getParent(); - // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always - // get a value, inserting PHIs as needed. - Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs); - if (Result) - Updater.AddAvailableValue(BB, Result); - assert(NewDLs.empty() && "No more deferred loads should be queued!"); - }); + // Now fixup the placeholders. + SmallVector<Value *> PlaceholderToNewVal(Placeholders.size()); + for (auto [Index, Placeholder] : enumerate(Placeholders)) { + Value *NewVal = Updater.GetValueInMiddleOfBlock(Placeholder->getParent()); + PlaceholderToNewVal[Index] = NewVal; + Placeholder->replaceAllUsesWith(NewVal); + } + // Note: we cannot merge this loop with the previous one because it is + // possible that the placeholder itself can be used in the SSAUpdater. The + // replaceAllUsesWith doesn't replace those uses. + for (auto [Index, Placeholder] : enumerate(Placeholders)) { + if (!Placeholder->use_empty()) + Placeholder->replaceAllUsesWith(PlaceholderToNewVal[Index]); + Placeholder->eraseFromParent(); + } - // Delete all instructions. On the first pass, new dummy loads may have been - // added so we need to collect them too. - DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); - InstsToDelete.insert_range(DeferredLoads); - for (Instruction *I : InstsToDelete) { + // Delete all instructions. + for (Instruction *I : AA.Vector.Worklist) { assert(I->use_empty()); I->eraseFromParent(); } // Delete all the users that are known to be removeable. - for (Instruction *I : reverse(UsersToRemove)) { + for (Instruction *I : reverse(AA.Vector.UsersToRemove)) { I->dropDroppableUses(); assert(I->use_empty()); I->eraseFromParent(); } // Alloca should now be dead too. 
- assert(Alloca.use_empty()); - Alloca.eraseFromParent(); - return true; + assert(AA.Alloca->use_empty()); + AA.Alloca->eraseFromParent(); } std::pair<Value *, Value *> @@ -1247,61 +1334,78 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( return true; } -bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( - Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { +void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const { + if (DisablePromoteAllocaToLDS) { + LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); + return; + } - for (User *User : Val->users()) { - if (is_contained(WorkList, User)) - continue; + // Don't promote the alloca to LDS for shader calling conventions as the work + // item ID intrinsics are not supported for these calling conventions. + // Furthermore not all LDS is available for some of the stages. + const Function &ContainingFunction = *AA.Alloca->getFunction(); + CallingConv::ID CC = ContainingFunction.getCallingConv(); + + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + default: + LLVM_DEBUG( + dbgs() + << " promote alloca to LDS not supported with calling convention.\n"); + return; + } + + for (Use *Use : AA.Uses) { + auto *User = Use->getUser(); if (CallInst *CI = dyn_cast<CallInst>(User)) { if (!isCallPromotable(CI)) - return false; + return; - WorkList.push_back(User); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); continue; } Instruction *UseInst = cast<Instruction>(User); if (UseInst->getOpcode() == Instruction::PtrToInt) - return false; + return; if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) { if (LI->isVolatile()) - return false; + return; continue; } if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) { if (SI->isVolatile()) - return false; - - // Reject if the stored value is not the pointer operand. 
- if (SI->getPointerOperand() != Val) - return false; + return; continue; } if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) { if (RMW->isVolatile()) - return false; + return; continue; } if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) { if (CAS->isVolatile()) - return false; + return; continue; } // Only promote a select if we know that the other select operand // is from another pointer that will also be promoted. if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) - return false; + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1)) + return; // May need to rewrite constant operands. - WorkList.push_back(ICmp); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(ICmp); continue; } @@ -1309,28 +1413,8 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // Be conservative if an address could be computed outside the bounds of // the alloca. if (!GEP->isInBounds()) - return false; - } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) { - // Only promote a select if we know that the other select operand is from - // another pointer that will also be promoted. - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) - return false; - } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) { - // Repeat for phis. - - // TODO: Handle more complex cases. We should be able to replace loops - // over arrays. - switch (Phi->getNumIncomingValues()) { - case 1: - break; - case 2: - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) - return false; - break; - default: - return false; - } - } else if (!isa<ExtractElementInst>(User)) { + return; + } else if (!isa<ExtractElementInst, SelectInst, PHINode>(User)) { // Do not promote vector/aggregate type instructions. It is hard to track // their users. 
@@ -1338,15 +1422,14 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // // TODO: If we know the address is only observed through flat pointers, we // could still promote. - return false; + return; } - WorkList.push_back(User); - if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) - return false; + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); } - return true; + AA.LDS.Enable = true; } bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { @@ -1378,7 +1461,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { for (const User *U : Val->users()) { if (const Instruction *Use = dyn_cast<Instruction>(U)) { - if (Use->getParent()->getParent() == &F) + if (Use->getFunction() == &F) return true; } else { const Constant *C = cast<Constant>(U); @@ -1419,7 +1502,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { for (const GlobalVariable *GV : UsedLDS) { Align Alignment = DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); - uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType()); + uint64_t AllocSize = GV->getGlobalSize(DL); // HIP uses an extern unsized array in local address space for dynamically // allocated shared memory. In that case, we have to disable the promotion. @@ -1477,44 +1560,24 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } // FIXME: Should try to pick the most likely to be profitable allocas first. 
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, - bool SufficientLDS) { - LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); - - if (DisablePromoteAllocaToLDS) { - LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); - return false; - } - - const DataLayout &DL = Mod->getDataLayout(); - IRBuilder<> Builder(&I); - - const Function &ContainingFunction = *I.getParent()->getParent(); - CallingConv::ID CC = ContainingFunction.getCallingConv(); - - // Don't promote the alloca to LDS for shader calling conventions as the work - // item ID intrinsics are not supported for these calling conventions. - // Furthermore not all LDS is available for some of the stages. - switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - break; - default: - LLVM_DEBUG( - dbgs() - << " promote alloca to LDS not supported with calling convention.\n"); - return false; - } +bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS( + AllocaAnalysis &AA, bool SufficientLDS, + SetVector<IntrinsicInst *> &DeferredIntrs) { + LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n'); // Not likely to have sufficient local memory for promotion. if (!SufficientLDS) return false; + const DataLayout &DL = Mod->getDataLayout(); + IRBuilder<> Builder(AA.Alloca); + + const Function &ContainingFunction = *AA.Alloca->getParent()->getParent(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; - Align Alignment = - DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType()); + Align Alignment = DL.getValueOrABITypeAlignment( + AA.Alloca->getAlign(), AA.Alloca->getAllocatedType()); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. 
@@ -1524,7 +1587,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = - WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType()); + WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType()); NewSize += AllocSize; if (NewSize > LocalMemLimit) { @@ -1535,24 +1598,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, CurrentLocalMemUsage = NewSize; - std::vector<Value *> WorkList; - - if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { - LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); - Function *F = I.getParent()->getParent(); + Function *F = AA.Alloca->getFunction(); - Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); + Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy), - Twine(F->getName()) + Twine('.') + I.getName(), nullptr, + Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(I.getAlign()); + GV->setAlignment(AA.Alloca->getAlign()); Value *TCntY, *TCntZ; @@ -1571,15 +1627,13 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID}; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); - I.mutateType(Offset->getType()); - I.replaceAllUsesWith(Offset); - I.eraseFromParent(); - - SmallVector<IntrinsicInst *> DeferredIntrs; + AA.Alloca->mutateType(Offset->getType()); + AA.Alloca->replaceAllUsesWith(Offset); + AA.Alloca->eraseFromParent(); PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS); - for (Value *V : WorkList) { + for (Value 
*V : AA.LDS.Worklist) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { @@ -1637,7 +1691,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, // These have 2 pointer operands. In case if second pointer also needs // to be replaced we defer processing of these intrinsics until all // other values are processed. - DeferredIntrs.push_back(Intr); + DeferredIntrs.insert(Intr); continue; case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); @@ -1685,7 +1739,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, } } + return true; +} + +void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion( + SetVector<IntrinsicInst *> &DeferredIntrs) { + for (IntrinsicInst *Intr : DeferredIntrs) { + IRBuilder<> Builder(Intr); Builder.SetInsertPoint(Intr); Intrinsic::ID ID = Intr->getIntrinsicID(); assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); @@ -1703,6 +1764,4 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, Intr->eraseFromParent(); } - - return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index fd604e1..e2e84ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -333,7 +333,7 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI, Register Val = Src0->getOperand(0).getReg(); auto isOp3Zero = [&]() { - MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e187959..888717f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -119,8 +128,9 @@ public: bool isLaneMask(Register Reg); std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode); - std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src); - Register getReadAnyLaneSrc(Register Src); + Register tryMatchUnmergeDefs(SmallVectorImpl<Register> &DefRegs); + SmallVector<Register> tryMatchMergeReadAnyLane(GMergeLikeInstr *Merge); + SmallVector<Register> getReadAnyLaneSrcs(Register Src); void replaceRegWithOrBuildCopy(Register Dst, Register Src); bool tryEliminateReadAnyLane(MachineInstr &Copy); @@ -145,43 +155,74 @@ AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { return {MatchMI, MatchMI->getOperand(1).getReg()}; } -std::pair<GUnmerge *, int> -AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { - MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); - if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) - return {nullptr, -1}; - - Register RALSrc = ReadAnyLane->getOperand(1).getReg(); - if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI)) - return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; +// Check if 
all registers are from same unmerge and there is no shuffling. +// Returns the unmerge source if both conditions are met. +Register AMDGPURegBankLegalizeCombiner::tryMatchUnmergeDefs( + SmallVectorImpl<Register> &DefRegs) { + auto *UnMerge = getOpcodeDef<GUnmerge>(DefRegs[0], MRI); + if (!UnMerge || UnMerge->getNumDefs() != DefRegs.size()) + return {}; + for (unsigned I = 1; I < DefRegs.size(); ++I) { + if (UnMerge->getReg(I) != DefRegs[I]) + return {}; + } + return UnMerge->getSourceReg(); +} - return {nullptr, -1}; +// Check if all merge sources are readanylanes and return the readanylane +// sources if they are. +SmallVector<Register> AMDGPURegBankLegalizeCombiner::tryMatchMergeReadAnyLane( + GMergeLikeInstr *Merge) { + SmallVector<Register> ReadAnyLaneSrcs; + for (unsigned i = 0; i < Merge->getNumSources(); ++i) { + Register Src; + if (!mi_match(Merge->getSourceReg(i), MRI, + m_GAMDGPUReadAnyLane(m_Reg(Src)))) + return {}; + ReadAnyLaneSrcs.push_back(Src); + } + return ReadAnyLaneSrcs; } -Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { +SmallVector<Register> +AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrcs(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) - return RALSrc; - - // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc - // LoSgpr = G_AMDGPU_READANYLANE LoVgpr - // HiSgpr = G_AMDGPU_READANYLANE HiVgpr - // Src G_MERGE_VALUES LoSgpr, HiSgpr - auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI); - if (Merge) { - unsigned NumElts = Merge->getNumSources(); - auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); - if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) + return {RALSrc}; + + // RALSrc = G_ANYEXT S16Src + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // Src = G_TRUNC TruncSrc + if (mi_match(Src, MRI, + 
m_GTrunc(m_GAMDGPUReadAnyLane(m_GAnyExt(m_Reg(RALSrc)))))) { + return {RALSrc}; + } + + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return {RALSrc}; + } + + // Sgpr0 = G_AMDGPU_READANYLANE Vgpr0 + // Sgpr1 = G_AMDGPU_READANYLANE Vgpr1 + // ... + // Src = G_MERGE_LIKE Sgpr0, Sgpr1, ... + // Dst = COPY Src + if (auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI)) { + SmallVector<Register> ReadAnyLaneSrcs = tryMatchMergeReadAnyLane(Merge); + if (ReadAnyLaneSrcs.empty()) return {}; - // Check if all elements are from same unmerge and there is no shuffling. - for (unsigned i = 1; i < NumElts; ++i) { - auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); - if (UnmergeI != Unmerge || (unsigned)IdxI != i) - return {}; - } - return Unmerge->getSourceReg(); + // Vgpr0, Vgpr1, ... = G_UNMERGE_VALUES UnmergeSrc + if (Register UnmergeSrc = tryMatchUnmergeDefs(ReadAnyLaneSrcs)) + return {UnmergeSrc}; + + // Multiple ReadAnyLane vgpr sources, need to merge Vgpr0, Vgpr1, ... 
+ return ReadAnyLaneSrcs; } // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc @@ -192,7 +233,7 @@ Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { return {}; int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); - Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI); + auto *Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI); if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources()) return {}; @@ -202,7 +243,7 @@ Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE); if (RALEl) - return RALElSrc; + return {RALElSrc}; return {}; } @@ -234,17 +275,27 @@ bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) RALDst = SrcMI.getOperand(1).getReg(); - Register RALSrc = getReadAnyLaneSrc(RALDst); - if (!RALSrc) + B.setInstrAndDebugLoc(Copy); + SmallVector<Register> ReadAnyLaneSrcRegs = getReadAnyLaneSrcs(RALDst); + if (ReadAnyLaneSrcRegs.empty()) return false; - B.setInstr(Copy); + Register ReadAnyLaneSrc; + if (ReadAnyLaneSrcRegs.size() == 1) { + ReadAnyLaneSrc = ReadAnyLaneSrcRegs[0]; + } else { + // Multiple readanylane sources without a common unmerge, merge them. 
+ auto Merge = B.buildMergeLikeInstr({VgprRB, MRI.getType(RALDst)}, + ReadAnyLaneSrcRegs); + ReadAnyLaneSrc = Merge.getReg(0); + } + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { // Src = READANYLANE RALSrc Src = READANYLANE RALSrc // Dst = Copy Src $Dst = Copy Src // -> -> // Dst = RALSrc $Dst = Copy RALSrc - replaceRegWithOrBuildCopy(Dst, RALSrc); + replaceRegWithOrBuildCopy(Dst, ReadAnyLaneSrc); } else { // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc // Src = G_BITCAST RALDst Src = G_BITCAST RALDst @@ -252,7 +303,7 @@ bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( // -> -> // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst // Dst = NewVgpr $Dst = Copy NewVgpr - auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); + auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, ReadAnyLaneSrc); replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0)); } @@ -410,21 +461,15 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { unsigned Opc = MI->getOpcode(); // Insert point for use operands needs some calculation. if (Opc == AMDGPU::G_PHI) { - RBLHelper.applyMappingPHI(*MI); + if (!RBLHelper.applyMappingPHI(*MI)) + return false; continue; } // Opcodes that support pretty much all combinations of reg banks and LLTs // (except S1). There is no point in writing rules for them. - if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES || - Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) { - RBLHelper.applyMappingTrivial(*MI); - continue; - } - - // Opcodes that also support S1. - if (Opc == G_FREEZE && - MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) { + if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_MERGE_VALUES || + Opc == AMDGPU::G_CONCAT_VECTORS || Opc == AMDGPU::G_BITCAST) { RBLHelper.applyMappingTrivial(*MI); continue; } @@ -441,7 +486,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { // S1 rules are in RegBankLegalizeRules. 
} - RBLHelper.findRuleAndApplyMapping(*MI); + if (!RBLHelper.findRuleAndApplyMapping(*MI)) + return false; } // Sgpr S1 clean up combines: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 5407566..d262f07 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -19,6 +19,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -32,28 +33,48 @@ using namespace AMDGPU; RegBankLegalizeHelper::RegBankLegalizeHelper( MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules) - : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()), - MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()), + : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B), + MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr), + RBLRules(RBLRules), IsWave32(ST.isWave32()), SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} -void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { - const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI); - const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI); +bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { + const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI); + if (!RuleSet) { + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "No AMDGPU RegBankLegalize rules defined for opcode", + MI); + return false; + } + + const RegBankLLTMapping *Mapping = 
RuleSet->findMappingForMI(MI, MRI, MUI); + if (!Mapping) { + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: none of the rules defined with " + "'Any' for MI's opcode matched MI", + MI); + return false; + } SmallSet<Register, 4> WaterfallSgprs; unsigned OpIdx = 0; - if (Mapping.DstOpMapping.size() > 0) { + if (Mapping->DstOpMapping.size() > 0) { B.setInsertPt(*MI.getParent(), std::next(MI.getIterator())); - applyMappingDst(MI, OpIdx, Mapping.DstOpMapping); + if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping)) + return false; } - if (Mapping.SrcOpMapping.size() > 0) { + if (Mapping->SrcOpMapping.size() > 0) { B.setInstr(MI); - applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs); + if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WaterfallSgprs)) + return false; } - lower(MI, Mapping, WaterfallSgprs); + if (!lower(MI, *Mapping, WaterfallSgprs)) + return false; + + return true; } bool RegBankLegalizeHelper::executeInWaterfallLoop( @@ -274,7 +295,7 @@ bool RegBankLegalizeHelper::executeInWaterfallLoop( return true; } -void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, +bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy) { MachineFunction &MF = B.getMF(); assert(MI.getNumMemOperands() == 1); @@ -322,9 +343,10 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, B.buildMergeLikeInstr(Dst, MergeTyParts); } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, +bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy) { MachineFunction &MF = B.getMF(); assert(MI.getNumMemOperands() == 1); @@ -350,9 +372,10 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, B.buildMergeLikeInstr(Dst, MergeTyParts); } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { +bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { 
Register Dst = MI.getDstReg(); Register Ptr = MI.getPointerReg(); MachineMemOperand &MMO = MI.getMMO(); @@ -376,9 +399,10 @@ void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); Register Src = MI.getOperand(1).getReg(); @@ -404,15 +428,22 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { Hi = B.buildUndef({VgprRB_S32}); break; default: - llvm_unreachable("Opcode not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI); + return false; } B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)}); } else { - llvm_unreachable("Type not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI); + return false; } MI.eraseFromParent(); + return true; } std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) { @@ -437,7 +468,14 @@ std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) { return {Lo.getReg(0), Hi.getReg(0)}; } -void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { +std::pair<Register, Register> +RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) { + auto [Lo32, Hi32] = unpackAExt(Reg); + return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0), + B.buildTrunc(SgprRB_S16, Hi32).getReg(0)}; +} + +bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { Register Lo, Hi; switch (MI.getOpcode()) { case AMDGPU::G_SHL: { @@ -462,13 +500,18 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { break; } default: - llvm_unreachable("Unpack lowering not implemented"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU 
RegBankLegalize: lowerUnpackBitShift, case not implemented", + MI); + return false; } B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { Register Lo, Hi; switch (MI.getOpcode()) { case AMDGPU::G_SMIN: @@ -494,10 +537,25 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { break; } default: - llvm_unreachable("Unpack min/max lowering not implemented"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI); + return false; } B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) { + auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg()); + auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo}); + auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi}); + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), + {ResLo.getReg(0), ResHi.getReg(0)}); + MI.eraseFromParent(); + return true; } static bool isSignedBFE(MachineInstr &MI) { @@ -507,7 +565,7 @@ static bool isSignedBFE(MachineInstr &MI) { return MI.getOpcode() == AMDGPU::G_SBFX; } -void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); assert(MRI.getType(Dst) == LLT::scalar(64)); bool Signed = isSignedBFE(MI); @@ -534,7 +592,7 @@ void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt); B.buildInstr(SHROpc, {Dst}, {SignBit, Amt}); MI.eraseFromParent(); - return; + return true; } uint64_t WidthImm = ConstWidth->Value.getZExtValue(); @@ -564,9 +622,10 @@ void 
RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(DstReg); bool Signed = isSignedBFE(MI); @@ -591,15 +650,15 @@ void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { // copies from reg class to reg bank. auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}}, {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)}); - if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(), - *ST.getRegisterInfo(), RBI)) - llvm_unreachable("failed to constrain BFE"); + constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(), + *ST.getRegisterInfo(), RBI); B.buildCopy(DstReg, S_BFE->getOperand(0).getReg()); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64); @@ -614,9 +673,113 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags); B.buildMergeLikeInstr(Dst, {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == S64); + auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg()); + auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg()); + + // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to + // match GlobalISel with old regbankselect. 
+ auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0)); + auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0)); + auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1)); + auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0)); + auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1); + auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry); + + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + unsigned Opc = MI.getOpcode(); + unsigned NumOps = MI.getNumOperands(); + auto Flags = MI.getFlags(); + + auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); + + if (NumOps == 2) { + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; + } + + auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg()); + + if (NumOps == 3) { + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; + } + + assert(NumOps == 4); + auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg()); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) { + Register Dst0 = MI.getOperand(0).getReg(); + Register Dst1 = MI.getOperand(1).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + Register Src1 = MI.getOperand(3).getReg(); + Register Src2 = MI.getOperand(4).getReg(); + + const GCNSubtarget &ST = 
B.getMF().getSubtarget<GCNSubtarget>(); + + // Keep the multiplication on the SALU. + Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0); + Register DstHi = MRI.createVirtualRegister(SgprRB_S32); + if (ST.hasScalarMulHiInsts()) { + B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1}); + } else { + auto VSrc0 = B.buildCopy(VgprRB_S32, Src0); + auto VSrc1 = B.buildCopy(VgprRB_S32, Src1); + auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1}); + buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI); + } + + // Accumulate and produce the "carry-out" bit. + + // The "carry-out" is defined as bit 64 of the result when computed as a + // big integer. For unsigned multiply-add, this matches the usual + // definition of carry-out. + if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) { + // No accumulate: result is just the multiplication, carry is 0. + B.buildMergeLikeInstr(Dst0, {DstLo, DstHi}); + B.buildConstant(Dst1, 0); + } else { + // Accumulate: add Src2 to the multiplication result with carry chain. 
+ Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32); + Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32); + B.buildUnmerge({Src2Lo, Src2Hi}, Src2); + + auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo); + auto AddHi = + B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1)); + B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)}); + B.buildCopy(Dst1, AddHi.getReg(1)); + } + + MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 || @@ -633,9 +796,10 @@ void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { B.buildMergeLikeInstr(Dst, {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg()); int Amt = MI.getOperand(2).getImm(); Register Lo, Hi; @@ -660,9 +824,10 @@ void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lower(MachineInstr &MI, +bool RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4> &WaterfallSgprs) { @@ -682,12 +847,14 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True, False); MI.eraseFromParent(); - return; + return true; } case UnpackBitShift: return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -707,20 +874,23 @@ void 
RegBankLegalizeHelper::lower(MachineInstr &MI, break; } default: - llvm_unreachable("Unsuported Opcode in Ext32To64"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode", + MI); + return false; } B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {MI.getOperand(1).getReg(), Hi}); MI.eraseFromParent(); - return; + return true; } case UniCstExt: { uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); B.buildConstant(MI.getOperand(0).getReg(), ConstVal); MI.eraseFromParent(); - return; + return true; } case VgprToVccCopy: { Register Src = MI.getOperand(1).getReg(); @@ -744,14 +914,37 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, auto Zero = B.buildConstant({VgprRB, Ty}, 0); B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero); MI.eraseFromParent(); - return; + return true; } case V_BFE: return lowerV_BFE(MI); case S_BFE: return lowerS_BFE(MI); + case UniMAD64: + return lowerUniMAD64(MI); + case UniMul64: { + B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2)); + MI.eraseFromParent(); + return true; + } + case DivSMulToMAD: { + auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1)); + auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2)); + auto Zero = B.buildConstant({VgprRB, S64}, 0); + + unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32 + ? 
AMDGPU::G_AMDGPU_MAD_U64_U32 + : AMDGPU::G_AMDGPU_MAD_I64_I32; + + B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}}, + {Op1, Op2, Zero}); + MI.eraseFromParent(); + return true; + } case SplitTo32: return lowerSplitTo32(MI); + case SplitTo32Mul: + return lowerSplitTo32Mul(MI); case SplitTo32Select: return lowerSplitTo32Select(MI); case SplitTo32SExtInReg: @@ -773,8 +966,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (Size / 128 == 4) splitLoad(MI, {B128, B128, B128, B128}); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("SplitLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: SplitLoad, unsuported type", + MI); + return false; } } // 64 and 32 bit load @@ -785,10 +980,12 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (DstTy == V6S16) splitLoad(MI, {V4S16, V2S16}, V2S16); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("SplitLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: SplitLoad, unsuported type", + MI); + return false; } - break; + return true; } case WidenLoad: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); @@ -799,19 +996,74 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (DstTy == V6S16) widenLoad(MI, V8S16, V2S16); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("WidenLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: WidenLoad, unsuported type", + MI); + return false; } - break; + return true; } + case UnpackAExt: + return lowerUnpackAExt(MI); case WidenMMOToS32: return widenMMOToS32(cast<GAnyLoad>(MI)); + case VerifyAllSgpr: { + assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) { + return MRI.getRegBankOrNull(Op.getReg()) == SgprRB; + })); + return true; + } + case ApplyAllVgpr: { + 
assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) { + return MRI.getRegBankOrNull(Op.getReg()) == VgprRB; + })); + B.setInstrAndDebugLoc(MI); + for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) { + Register Reg = MI.getOperand(i).getReg(); + if (MRI.getRegBank(Reg) != VgprRB) { + auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg); + MI.getOperand(i).setReg(Copy.getReg(0)); + } + } + return true; + } + case UnmergeToShiftTrunc: { + GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI); + LLT Ty = MRI.getType(Unmerge->getSourceReg()); + if (Ty.getSizeInBits() % 32 != 0) { + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: unmerge not multiple of 32", + MI); + return false; + } + + B.setInstrAndDebugLoc(MI); + if (Ty.getSizeInBits() > 32) { + auto UnmergeV2S16 = + B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg()); + for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) { + auto [Dst0S32, Dst1S32] = + unpackAExt(UnmergeV2S16->getOperand(i).getReg()); + B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32); + B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32); + } + } else { + auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg()); + B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32); + B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32); + } + + MI.eraseFromParent(); + return true; + } } if (!WaterfallSgprs.empty()) { MachineBasicBlock::iterator I = MI.getIterator(); - executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs); + if (!executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs)) + return false; } + return true; } LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { @@ -832,20 +1084,26 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr32ZExt: case UniInVgprS32: case Vgpr32: + case Vgpr32AExt: case Vgpr32SExt: case Vgpr32ZExt: return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return 
LLT::scalar(64); case Sgpr128: case Vgpr128: return LLT::scalar(128); + case SgprP0: case VgprP0: return LLT::pointer(0, 64); case SgprP1: case VgprP1: return LLT::pointer(1, 64); + case SgprP2: + case VgprP2: + return LLT::pointer(2, 32); case SgprP3: case VgprP3: return LLT::pointer(3, 32); @@ -855,18 +1113,26 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprP8: + return LLT::pointer(8, 128); case SgprV2S16: case VgprV2S16: case UniInVgprV2S16: return LLT::fixed_vector(2, 16); case SgprV2S32: case VgprV2S32: + case UniInVgprV2S32: return LLT::fixed_vector(2, 32); + case VgprV3S32: + return LLT::fixed_vector(3, 32); case SgprV4S32: case SgprV4S32_WF: case VgprV4S32: case UniInVgprV4S32: return LLT::fixed_vector(4, 32); + case VgprV2S64: + case UniInVgprV2S64: + return LLT::fixed_vector(2, 64); default: return LLT(); } @@ -908,7 +1174,13 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) { case VgprB128: case UniInVgprB128: if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) || - Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128)) + Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) || + isAnyPtr(Ty, 128)) + return Ty; + return LLT(); + case VgprB160: + case UniInVgprB160: + if (Ty.getSizeInBits() == 160) return Ty; return LLT(); case SgprB256: @@ -925,6 +1197,21 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) { Ty == LLT::fixed_vector(8, 64)) return Ty; return LLT(); + case SgprBRC: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + unsigned LLTSize = Ty.getSizeInBits(); + if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize)) + return Ty; + return LLT(); + } + case VgprBRC: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits())) + return 
Ty; + return LLT(); + } default: return LLT(); } @@ -940,10 +1227,13 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr32_WF: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: + case SgprP2: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -957,15 +1247,20 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprB128: case SgprB256: case SgprB512: + case SgprBRC: case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: + case UniInVgprV2S32: case UniInVgprV4S32: + case UniInVgprV2S64: case UniInVgprB32: case UniInVgprB64: case UniInVgprB96: case UniInVgprB128: + case UniInVgprB160: case UniInVgprB256: case UniInVgprB512: case Sgpr32Trunc: @@ -980,6 +1275,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Vgpr128: case VgprP0: case VgprP1: + case VgprP2: case VgprP3: case VgprP4: case VgprP5: @@ -988,13 +1284,18 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case VgprPtr128: case VgprV2S16: case VgprV2S32: + case VgprV2S64: + case VgprV3S32: case VgprV4S32: case VgprB32: case VgprB64: case VgprB96: case VgprB128: + case VgprB160: case VgprB256: case VgprB512: + case VgprBRC: + case Vgpr32AExt: case Vgpr32SExt: case Vgpr32ZExt: return VgprRB; @@ -1003,7 +1304,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { } } -void RegBankLegalizeHelper::applyMappingDst( +bool RegBankLegalizeHelper::applyMappingDst( MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) { // Defs start from operand 0 @@ -1022,10 +1323,12 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -1035,11 +1338,14 @@ void RegBankLegalizeHelper::applyMappingDst( 
case Vgpr128: case VgprP0: case VgprP1: + case VgprP2: case VgprP3: case VgprP4: case VgprP5: case VgprV2S16: case VgprV2S32: + case VgprV2S64: + case VgprV3S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == getRegBankFromID(MethodIDs[OpIdx])); @@ -1052,6 +1358,7 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprB128: case SgprB256: case SgprB512: + case SgprBRC: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -1059,8 +1366,10 @@ void RegBankLegalizeHelper::applyMappingDst( case VgprB64: case VgprB96: case VgprB128: + case VgprB160: case VgprB256: case VgprB512: + case VgprBRC: case VgprPtr32: case VgprPtr64: case VgprPtr128: { @@ -1074,9 +1383,11 @@ void RegBankLegalizeHelper::applyMappingDst( assert(RB == SgprRB); Register NewDst = MRI.createVirtualRegister(VccRB_S1); Op.setReg(NewDst); - auto CopyS32_Vcc = - B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst}); - B.buildTrunc(Reg, CopyS32_Vcc); + if (!MRI.use_empty(Reg)) { + auto CopyS32_Vcc = + B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst}); + B.buildTrunc(Reg, CopyS32_Vcc); + } break; } case UniInVgprS16: { @@ -1092,8 +1403,11 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: - case UniInVgprV4S32: { + case UniInVgprV2S32: + case UniInVgprV4S32: + case UniInVgprV2S64: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == SgprRB); Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty}); @@ -1105,6 +1419,7 @@ void RegBankLegalizeHelper::applyMappingDst( case UniInVgprB64: case UniInVgprB96: case UniInVgprB128: + case UniInVgprB160: case UniInVgprB256: case UniInVgprB512: { assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty)); @@ -1120,20 +1435,28 @@ void RegBankLegalizeHelper::applyMappingDst( assert(RB == SgprRB); Register NewDst = MRI.createVirtualRegister(SgprRB_S32); Op.setReg(NewDst); - B.buildTrunc(Reg, NewDst); + if (!MRI.use_empty(Reg)) + 
B.buildTrunc(Reg, NewDst); break; } case InvalidMapping: { - LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump();); - llvm_unreachable("missing fast rule for MI"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI); + return false; } default: - llvm_unreachable("ID not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI); + return false; } } + + return true; } -void RegBankLegalizeHelper::applyMappingSrc( +bool RegBankLegalizeHelper::applyMappingSrc( MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs, SmallSet<Register, 4> &SgprWaterfallOperandRegs) { @@ -1163,10 +1486,12 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: { @@ -1181,6 +1506,7 @@ void RegBankLegalizeHelper::applyMappingSrc( case SgprB128: case SgprB256: case SgprB512: + case SgprBRC: case SgprPtr32: case SgprPtr64: case SgprPtr128: { @@ -1195,11 +1521,14 @@ void RegBankLegalizeHelper::applyMappingSrc( case Vgpr128: case VgprP0: case VgprP1: + case VgprP2: case VgprP3: case VgprP4: case VgprP5: case VgprV2S16: case VgprV2S32: + case VgprV2S64: + case VgprV3S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[i])); if (RB != VgprRB) { @@ -1213,8 +1542,10 @@ void RegBankLegalizeHelper::applyMappingSrc( case VgprB64: case VgprB96: case VgprB128: + case VgprB160: case VgprB256: case VgprB512: + case VgprBRC: case VgprPtr32: case VgprPtr64: case VgprPtr128: { @@ -1268,6 +1599,13 @@ void RegBankLegalizeHelper::applyMappingSrc( Op.setReg(Zext.getReg(0)); break; } + case Vgpr32AExt: { + assert(Ty.getSizeInBits() < 32); + assert(RB == VgprRB); + auto Aext = B.buildAnyExt({VgprRB, S32}, Reg); + 
Op.setReg(Aext.getReg(0)); + break; + } case Vgpr32SExt: { // Note this ext allows S1, and it is meant to be combined away. assert(Ty.getSizeInBits() < 32); @@ -1285,12 +1623,16 @@ void RegBankLegalizeHelper::applyMappingSrc( break; } default: - llvm_unreachable("ID not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI); + return false; } } + return true; } -void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { +bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); @@ -1313,16 +1655,17 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { MI.getOperand(i).setReg(NewUse.getReg(0)); } - return; + return true; } - // ALL divergent i1 phis should be already lowered and inst-selected into PHI - // with sgpr reg class and S1 LLT. + // ALL divergent i1 phis should have been lowered and inst-selected into PHI + // with sgpr reg class and S1 LLT in AMDGPUGlobalISelDivergenceLowering pass. // Note: this includes divergent phis that don't require lowering. if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) { - LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump();); - llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering " - "before RegBankLegalize to lower lane mask(vcc) phis"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI", + MI); + return false; } // We accept all types that can fit in some register class. @@ -1330,11 +1673,13 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr. 
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64)) { - return; + return true; } - LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump();); - llvm_unreachable("type not supported"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: type not supported for G_PHI", + MI); + return false; } [[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index d937815..86669ae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -12,6 +12,7 @@ #include "AMDGPURegBankLegalizeRules.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -27,11 +28,13 @@ namespace AMDGPU { // to replace instruction. In other case InstApplyMethod will create new // instruction(s). class RegBankLegalizeHelper { + MachineFunction &MF; const GCNSubtarget &ST; MachineIRBuilder &B; MachineRegisterInfo &MRI; const MachineUniformityInfo &MUI; const RegisterBankInfo &RBI; + MachineOptimizationRemarkEmitter MORE; const RegBankLegalizeRules &RBLRules; const bool IsWave32; const RegisterBank *SgprRB; @@ -72,6 +75,7 @@ class RegBankLegalizeHelper { static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -80,10 +84,10 @@ public: const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules); - void findRuleAndApplyMapping(MachineInstr &MI); + bool findRuleAndApplyMapping(MachineInstr &MI); // Manual apply helpers. 
- void applyMappingPHI(MachineInstr &MI); + bool applyMappingPHI(MachineInstr &MI); void applyMappingTrivial(MachineInstr &MI); private: @@ -96,34 +100,39 @@ private: const RegisterBank *getRegBankFromID(RegBankLLTMappingApplyID ID); - void + bool applyMappingDst(MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs); - void + bool applyMappingSrc(MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs, SmallSet<Register, 4> &SgprWaterfallOperandRegs); - void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, + bool splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy = LLT()); - void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT()); - void widenMMOToS32(GAnyLoad &MI) const; + bool widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT()); + bool widenMMOToS32(GAnyLoad &MI) const; - void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, + bool lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4> &SgprWaterfallOperandRegs); - void lowerVccExtToSel(MachineInstr &MI); + bool lowerVccExtToSel(MachineInstr &MI); std::pair<Register, Register> unpackZExt(Register Reg); std::pair<Register, Register> unpackSExt(Register Reg); std::pair<Register, Register> unpackAExt(Register Reg); - void lowerUnpackBitShift(MachineInstr &MI); - void lowerV_BFE(MachineInstr &MI); - void lowerS_BFE(MachineInstr &MI); - void lowerSplitTo32(MachineInstr &MI); - void lowerSplitTo32Select(MachineInstr &MI); - void lowerSplitTo32SExtInReg(MachineInstr &MI); - void lowerUnpackMinMax(MachineInstr &MI); + std::pair<Register, Register> unpackAExtTruncS16(Register Reg); + bool lowerUnpackBitShift(MachineInstr &MI); + bool lowerV_BFE(MachineInstr &MI); + bool lowerS_BFE(MachineInstr &MI); + bool lowerUniMAD64(MachineInstr &MI); + bool lowerSplitTo32(MachineInstr &MI); + bool lowerSplitTo32Mul(MachineInstr &MI); + bool lowerSplitTo16(MachineInstr &MI); + bool 
lowerSplitTo32Select(MachineInstr &MI); + bool lowerSplitTo32SExtInReg(MachineInstr &MI); + bool lowerUnpackMinMax(MachineInstr &MI); + bool lowerUnpackAExt(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a67b12a..a0be07d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -60,20 +60,28 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(0, 64); case P1: return MRI.getType(Reg) == LLT::pointer(1, 64); + case P2: + return MRI.getType(Reg) == LLT::pointer(2, 32); case P3: return MRI.getType(Reg) == LLT::pointer(3, 32); case P4: return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case P8: + return MRI.getType(Reg) == LLT::pointer(8, 128); case Ptr32: return isAnyPtr(MRI.getType(Reg), 32); case Ptr64: return isAnyPtr(MRI.getType(Reg), 64); case Ptr128: return isAnyPtr(MRI.getType(Reg), 128); + case V2S16: + return MRI.getType(Reg) == LLT::fixed_vector(2, 16); case V2S32: return MRI.getType(Reg) == LLT::fixed_vector(2, 32); + case V3S32: + return MRI.getType(Reg) == LLT::fixed_vector(3, 32); case V4S32: return MRI.getType(Reg) == LLT::fixed_vector(4, 32); case B32: @@ -84,6 +92,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg).getSizeInBits() == 96; case B128: return MRI.getType(Reg).getSizeInBits() == 128; + case B160: + return MRI.getType(Reg).getSizeInBits() == 160; case B256: return MRI.getType(Reg).getSizeInBits() == 256; case B512: @@ -102,12 +112,16 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg); case UniP1: return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg); + case UniP2: 
+ return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniform(Reg); case UniP3: return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg); case UniP4: return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniP8: + return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg); case UniPtr32: return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg); case UniPtr64: @@ -116,6 +130,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg); case UniV2S16: return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg); + case UniV2S32: + return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg); case UniB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg); case UniB64: @@ -124,10 +140,23 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg); case UniB128: return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg); + case UniB160: + return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniform(Reg); case UniB256: return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg); case UniB512: return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg); + case UniBRC: { + if (!MUI.isUniform(Reg)) + return false; + // Check if there is SGPR register class of same size as the LLT. + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + // There is no 16 bit SGPR register class. Extra size check is required + // since getSGPRClassForBitWidth returns SReg_32RegClass for Size 16. 
+ unsigned LLTSize = MRI.getType(Reg).getSizeInBits(); + return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize); + } case DivS1: return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); case DivS16: @@ -142,6 +171,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg); case DivP1: return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + case DivP2: + return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergent(Reg); case DivP3: return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg); case DivP4: @@ -156,6 +187,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg); case DivV2S16: return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg); + case DivV2S32: + return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg); case DivB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg); case DivB64: @@ -164,10 +197,20 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg); case DivB128: return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg); + case DivB160: + return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergent(Reg); case DivB256: return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg); case DivB512: return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg); + case DivBRC: { + if (!MUI.isDivergent(Reg)) + return false; + // Check if there is VGPR register class of same size as the LLT. 
+ const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + return TRI->getSGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits()); + } case _: return true; default: @@ -202,7 +245,7 @@ bool PredicateMapping::match(const MachineInstr &MI, return true; } -SetOfRulesForOpcode::SetOfRulesForOpcode() {} +SetOfRulesForOpcode::SetOfRulesForOpcode() = default; SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) : FastTypes(FastTypes) {} @@ -234,12 +277,13 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) { return B64; if (Ty == LLT::fixed_vector(3, 32)) return B96; - if (Ty == LLT::fixed_vector(4, 32) || isAnyPtr(Ty, 128)) + if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) || + Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128)) return B128; return _; } -const RegBankLLTMapping & +const RegBankLLTMapping * SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const { @@ -256,17 +300,16 @@ SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg))); if (Slot != -1) - return MUI.isUniform(Reg) ? Uni[Slot] : Div[Slot]; + return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot]; } // Slow search for more complex rules. 
for (const RegBankLegalizeRule &Rule : Rules) { if (Rule.Predicate.match(MI, MUI, MRI)) - return Rule.OperandMapping; + return &Rule.OperandMapping; } - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("None of the rules defined for MI's opcode matched MI"); + return nullptr; } void SetOfRulesForOpcode::addRule(RegBankLegalizeRule Rule) { @@ -277,14 +320,14 @@ void SetOfRulesForOpcode::addFastRuleDivergent(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs) { int Slot = getFastPredicateSlot(Ty); assert(Slot != -1 && "Ty unsupported in this FastRulesTypes"); - Div[Slot] = RuleApplyIDs; + Div[Slot] = std::move(RuleApplyIDs); } void SetOfRulesForOpcode::addFastRuleUniform(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs) { int Slot = getFastPredicateSlot(Ty); assert(Slot != -1 && "Ty unsupported in this FastRulesTypes"); - Uni[Slot] = RuleApplyIDs; + Uni[Slot] = std::move(RuleApplyIDs); } int SetOfRulesForOpcode::getFastPredicateSlot( @@ -349,7 +392,7 @@ RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList, return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes); } -const SetOfRulesForOpcode & +const SetOfRulesForOpcode * RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT || @@ -357,19 +400,15 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) { unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); auto IRAIt = IRulesAlias.find(IntrID); - if (IRAIt == IRulesAlias.end()) { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("No rules defined for intrinsic opcode"); - } - return IRules.at(IRAIt->second); + if (IRAIt == IRulesAlias.end()) + return nullptr; + return &IRules.at(IRAIt->second); } auto GRAIt = GRulesAlias.find(Opc); - if (GRAIt == GRulesAlias.end()) { - LLVM_DEBUG(dbgs() << "MI: "; 
MI.dump();); - llvm_unreachable("No rules defined for generic opcode"); - } - return GRules.at(GRAIt->second); + if (GRAIt == GRulesAlias.end()) + return nullptr; + return &GRules.at(GRAIt->second); } // Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. @@ -470,9 +509,54 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}); + + addRulesForGOpcs({G_UADDO, G_USUBO}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}}); + + addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + + bool HasVecMulU64 = ST->hasVectorMulU64(); + addRulesForGOpcs({G_MUL}, Standard) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}) + .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64) + 
.Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64); + + bool hasMulHi = ST->hasScalarMulHiInsts(); + addRulesForGOpcs({G_UMULH, G_SMULH}, Standard) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi); + + addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard) + .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}}) + .Uni(S64, {{Sgpr64, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr64}, UniMAD64}); - addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + bool HasScalarSMulU64 = ST->hasScalarSMulU64(); + addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD}); addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB) .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}}) @@ -514,6 +598,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + addRulesForGOpcs({G_FSHR}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}}); addRulesForGOpcs({G_UBFX, G_SBFX}, Standard) @@ -538,21 +626,56 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax}) .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); - // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT - // and G_FREEZE here, rest is trivially regbankselected earlier + // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT and G_FCONSTANT + // here, rest is trivially regbankselected earlier addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}}); addRulesForGOpcs({G_CONSTANT}) .Any({{UniS1, _}, 
{{Sgpr32Trunc}, {None}, UniCstExt}}); - addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}}); - addRulesForGOpcs({G_ICMP}) - .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) - .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) - .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}}); + addRulesForGOpcs({G_FREEZE}) + .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}}) + .Any({{DivS1}, {{Vcc}, {Vcc}}}) + .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}}) + .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}}) + .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}}); + + addRulesForGOpcs({G_UNMERGE_VALUES}) + .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}}) + .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}}) + .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}}); + + Predicate isSignedICmp([](const MachineInstr &MI) -> bool { + auto Pred = + static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + return CmpInst::isSigned(Pred); + }); + + Predicate isEqualityICmp([](const MachineInstr &MI) -> bool { + auto Pred = + static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + return ICmpInst::isEquality(Pred); + }); - addRulesForGOpcs({G_FCMP}) - .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}}) - .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}); + bool HasScalarCompareEq64 = ST->hasScalarCompareEq64(); + // clang-format off + addRulesForGOpcs({G_ICMP}) + .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}}) + .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}}) + .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}}) + .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}}) + .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) + .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) + .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64) + 
.Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64) + .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}) + .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}}) + .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}}) + .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}}) + .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64) + .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64) + .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}) + .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}}); + // clang-format on addRulesForGOpcs({G_BRCOND}) .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}}) @@ -580,6 +703,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}) .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}); + bool Has16bitCmp = ST->has16BitInsts(); + // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY. // It is up to user to deal with truncated bits. addRulesForGOpcs({G_TRUNC}) @@ -593,7 +718,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}}) .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}}) // This is non-trivial. VgprToVccCopy is done using compare instruction. 
- .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}) + .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp) + .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr32AExt}, VgprToVccCopy}}, + !Has16bitCmp) .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}}) .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}}); @@ -639,6 +766,64 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}}) .Div(S64, {{Vgpr64}, {Vgpr64, Imm}}); + // Atomic read-modify-write operations: result and value are always VGPR, + // pointer varies by address space. + addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG, + G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, + G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, + G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP, + G_ATOMICRMW_UDEC_WRAP}) + .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}}) + .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}}) + .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}}) + .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}}) + .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}}) + .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}}); + + bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts(); + bool HasAtomicBufferGlobalPkAddF16Insts = + ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() || + ST->hasAtomicBufferGlobalPkAddF16Insts(); + bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts(); + addRulesForGOpcs({G_ATOMICRMW_FADD}) + .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}}) + .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}}) + .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}}) + .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}}) + .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}}) + .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}}) + .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}}, + HasAtomicFlatPkAdd16Insts) + .Any({{DivV2S16, P1, V2S16}, 
{{VgprV2S16}, {VgprP1, VgprV2S16}}}, + HasAtomicBufferGlobalPkAddF16Insts) + .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}}, + HasAtomicDsPkAdd16Insts); + + addRulesForGOpcs({G_ATOMIC_CMPXCHG}) + .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}}) + .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}}) + .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}}) + .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}}); + + addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG}) + .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}}) + .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}}) + .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}}) + .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard) + .Div(S32, {{Vgpr32}, + {Vgpr32, Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(S64, {{Vgpr64}, + {Vgpr64, Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX, + G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_SMAX, + G_AMDGPU_BUFFER_ATOMIC_SMIN}, + Standard) + .Div(S32, {{Vgpr32}, {Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(S64, {{Vgpr64}, {Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + bool hasSMRDx3 = ST->hasScalarDwordx3Loads(); bool hasSMRDSmall = ST->hasScalarSubwordLoads(); bool usesTrue16 = ST->useRealTrue16Insts(); @@ -860,6 +1045,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}}) .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}}) .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}}); + // clang-format on addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT, @@ -874,8 +1060,49 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, 
Sgpr32_WF}}); - addRulesForGOpcs({G_AMDGPU_BUFFER_STORE}) - .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}}); + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE, + G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE}, + StandardB) + .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + + addRulesForGOpcs( + {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE}, + StandardB) + .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE}, + StandardB) + .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Any({{DivB160}, {{VgprB160}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}) + .Any({{UniB160}, + {{UniInVgprB160}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}); + + addRulesForGOpcs( + {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16}, + StandardB) + .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE, + G_AMDGPU_BUFFER_STORE_SHORT, 
G_AMDGPU_BUFFER_STORE_FORMAT, + G_AMDGPU_BUFFER_STORE_FORMAT_D16, + G_AMDGPU_TBUFFER_STORE_FORMAT, + G_AMDGPU_TBUFFER_STORE_FORMAT_D16}) + .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}) + .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}) + .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}) + .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}); addRulesForGOpcs({G_PTR_ADD}) .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}}) @@ -899,34 +1126,237 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}}) .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}}); + // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel. + // Currently crashes on P8 (buffer resource) tests due to legalizer issue. + addRulesForGOpcs({G_PTRMASK}) + .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) + .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) + .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}}) + .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}}); + addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); - addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_BITREVERSE}, Standard) + .Uni(S32, {{Sgpr32}, {Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{Sgpr64}, {Sgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}); + + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) + .Uni(S64, {{Sgpr64}, {}}); + + addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}}); + + addRulesForGOpcs({G_GLOBAL_VALUE}) + .Any({{UniP0}, {{SgprP0}, {}}}) + .Any({{UniP1}, {{SgprP1}, {}}}) + .Any({{UniP3}, {{SgprP3}, {}}}) + .Any({{UniP4}, {{SgprP4}, {}}}) + .Any({{UniP8}, {{SgprP8}, {}}}); + + addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}}); bool hasSALUFloat = ST->hasSALUFloatInsts(); - 
addRulesForGOpcs({G_FADD}, Standard) + addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); - addRulesForGOpcs({G_FPTOUI}) - .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) - .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); + addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat); - addRulesForGOpcs({G_UITOFP}) + addRulesForGOpcs({G_FMAD}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + + addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard) + .Div(S16, 
{{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}}) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat) + .Uni(V2S16, + {{SgprV2S16}, {SgprV2S16, SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}}, + !hasSALUFloat); + + addRulesForGOpcs({G_AMDGPU_FMED3}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + + // TODO: This opcode is generated from the i64->i16 signed clamped pattern in + // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more + // instructions on SALU. + addRulesForGOpcs({G_AMDGPU_SMED3}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + + // FNEG and FABS are either folded as source modifiers or can be selected as + // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for + // targets without SALU float we still select them as VGPR since there would + // be no real sgpr use. 
+ addRulesForGOpcs({G_FNEG, G_FABS}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}}); + + addRulesForGOpcs({G_FCANONICALIZE}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}}); + + bool hasPST = ST->hasPseudoScalarTrans(); + addRulesForGOpcs({G_FSQRT}, Standard) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST); + + addRulesForGOpcs({G_FPTOUI, G_FPTOSI}) + .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}}) + .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}}) + .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat) + .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat) + .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}) + .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) + .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat) .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) + .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}}) + .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}}); + + addRulesForGOpcs({G_UITOFP, G_SITOFP}) + .Any({{UniS16, 
S16}, {{UniInVgprS16}, {Vgpr16}}}) + .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}}) + .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat) + .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat) + .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) - .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); + .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat) + .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) + .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}}) + .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}}); + + addRulesForGOpcs({G_FPEXT}) + .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}) + .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}}) + .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}}) + .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat) + .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat); + + addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard) + .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}}) + .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_FPTRUNC}) + .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}}) + .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}}) + .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}}) + .Any({{UniV2S16, V2S32}, {{UniInVgprV2S16}, {VgprV2S32}}}) + .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}}) + .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat) + .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat); + + addRulesForGOpcs({G_IS_FPCLASS}) + .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}}) + .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}}) + .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}}) + .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}}) + .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}}) + .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}}); + + addRulesForGOpcs({G_FCMP}, Standard) + 
.Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}}, + hasSALUFloat) + .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}}, + !hasSALUFloat) + .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}}) + .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}, + hasSALUFloat) + .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}}, + !hasSALUFloat) + .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) + .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}) + .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}}); + + addRulesForGOpcs({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUNDEVEN, G_FFLOOR, G_FCEIL, + G_FEXP2, G_FLOG2}, + Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}); using namespace Intrinsic; addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}}); + addRulesForIOpcs({amdgcn_groupstaticsize}).Any({{S32}, {{Sgpr32}, {IntrId}}}); + // This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir. - addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}}); + addRulesForIOpcs({amdgcn_end_cf}) + .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}}) + .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}}); addRulesForIOpcs({amdgcn_if_break}, Standard) + .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}}) .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}}); addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard) @@ -938,4 +1368,68 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, // readfirstlaning just in case register is not in sgpr. 
.Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}}); + addRulesForIOpcs({amdgcn_s_sleep}).Any({{_, _}, {{}, {IntrId, Imm}}}); + + addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}}) + .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}}); + + addRulesForIOpcs({amdgcn_mulhi_u24, amdgcn_mulhi_i24, amdgcn_fmul_legacy}, + Standard) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}}); + + addRulesForIOpcs({amdgcn_fma_legacy}, Standard) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}}); + + addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard) + .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}) + .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}}) + .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}}); + + addRulesForIOpcs({amdgcn_prng_b32}) + .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}}) + .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}}); + + addRulesForIOpcs({amdgcn_sffbh}, Standard) + .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}}); + + addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}}) + .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE}) + .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE}) + .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE}); + + addRulesForIOpcs({amdgcn_global_load_tr_b64}) + .Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}}) + .Any({{DivB32}, {{VgprB32}, {IntrId, SgprP1}}}); + + addRulesForIOpcs({amdgcn_global_load_tr_b128}) + .Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}}) + .Any({{DivB128}, {{VgprB128}, 
{IntrId, SgprP1}}}); + + addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64}) + .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}}); + + addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm}, StandardB) + .Div(B32, {{VgprB32}, {IntrId, VgprB32}}) + .Uni(B32, {{SgprB32}, {IntrId, SgprB32}}) + .Div(B64, {{VgprB64}, {IntrId, VgprB64}}) + .Uni(B64, {{SgprB64}, {IntrId, SgprB64}}) + .Div(B96, {{VgprB96}, {IntrId, VgprB96}}) + .Uni(B96, {{SgprB96}, {IntrId, SgprB96}}) + .Div(B128, {{VgprB128}, {IntrId, VgprB128}}) + .Uni(B128, {{SgprB128}, {IntrId, SgprB128}}) + .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}}) + .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}}) + .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}}) + .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}}); + } // end initialize rules diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 93e0efd..eee4f62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -60,24 +60,29 @@ enum UniformityLLTOpPredicateID { // pointers P0, P1, + P2, P3, P4, P5, + P8, Ptr32, Ptr64, Ptr128, UniP0, UniP1, + UniP2, UniP3, UniP4, UniP5, + UniP8, UniPtr32, UniPtr64, UniPtr128, DivP0, DivP1, + DivP2, DivP3, DivP4, DivP5, @@ -88,18 +93,24 @@ enum UniformityLLTOpPredicateID { // vectors V2S16, V2S32, + V2S64, V3S32, V4S32, UniV2S16, + UniV2S32, + UniV2S64, DivV2S16, + DivV2S32, + DivV2S64, // B types B32, B64, B96, B128, + B160, B256, B512, @@ -107,15 +118,19 @@ enum UniformityLLTOpPredicateID { UniB64, UniB96, UniB128, + UniB160, UniB256, UniB512, + UniBRC, DivB32, DivB64, DivB96, DivB128, + DivB160, DivB256, DivB512, + DivBRC }; // How to apply register bank on register operand. 
@@ -134,10 +149,13 @@ enum RegBankLLTMappingApplyID { Sgpr32, Sgpr64, Sgpr128, + SgprP0, SgprP1, + SgprP2, SgprP3, SgprP4, SgprP5, + SgprP8, SgprPtr32, SgprPtr64, SgprPtr128, @@ -150,6 +168,7 @@ enum RegBankLLTMappingApplyID { SgprB128, SgprB256, SgprB512, + SgprBRC, // vgpr scalars, pointers, vectors and B-types Vgpr16, @@ -158,6 +177,7 @@ enum RegBankLLTMappingApplyID { Vgpr128, VgprP0, VgprP1, + VgprP2, VgprP3, VgprP4, VgprP5, @@ -166,24 +186,32 @@ enum RegBankLLTMappingApplyID { VgprPtr128, VgprV2S16, VgprV2S32, + VgprV3S32, VgprB32, VgprB64, VgprB96, VgprB128, + VgprB160, VgprB256, VgprB512, + VgprBRC, VgprV4S32, + VgprV2S64, // Dst only modifiers: read-any-lane and truncs UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, + UniInVgprV2S64, UniInVgprB32, UniInVgprB64, UniInVgprB96, UniInVgprB128, + UniInVgprB160, UniInVgprB256, UniInVgprB512, @@ -198,6 +226,7 @@ enum RegBankLLTMappingApplyID { Sgpr32AExtBoolInReg, Sgpr32SExt, Sgpr32ZExt, + Vgpr32AExt, Vgpr32SExt, Vgpr32ZExt, }; @@ -216,14 +245,23 @@ enum LoweringMethodID { S_BFE, V_BFE, VgprToVccCopy, + UniMAD64, + UniMul64, + DivSMulToMAD, SplitTo32, + SplitTo32Mul, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, UniCstExt, SplitLoad, WidenLoad, - WidenMMOToS32 + WidenMMOToS32, + UnpackAExt, + VerifyAllSgpr, + ApplyAllVgpr, + UnmergeToShiftTrunc }; enum FastRulesTypes { @@ -277,7 +315,7 @@ public: SetOfRulesForOpcode(); SetOfRulesForOpcode(FastRulesTypes FastTypes); - const RegBankLLTMapping & + const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const; @@ -297,7 +335,7 @@ private: class RegBankLegalizeRules { const GCNSubtarget *ST; MachineRegisterInfo *MRI; - // Separate maps for G-opcodes and instrinsics since they are in different + // Separate maps for G-opcodes and intrinsics since they are in different // enums. 
Multiple opcodes can share same set of rules. // RulesAlias = map<Opcode, KeyOpcode> // Rules = map<KeyOpcode, SetOfRulesForOpcode> @@ -375,7 +413,7 @@ public: MRI = &_MRI; }; - const SetOfRulesForOpcode &getRulesForOpc(MachineInstr &MI) const; + const SetOfRulesForOpcode *getRulesForOpc(MachineInstr &MI) const; }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 56807a4..e8f316d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank & AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - if (&RC == &AMDGPU::SReg_1RegClass) - return AMDGPU::VCCRegBank; - // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a // VCC-like use. if (TRI->isSGPRClass(&RC)) { @@ -471,7 +468,7 @@ RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappings( const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1263,11 +1260,14 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets( } } + const bool CheckNUW = Subtarget.hasGFX1250Insts(); Register Base; unsigned Offset; std::tie(Base, Offset) = - AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); + AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset, + /*KnownBits=*/nullptr, + /*CheckNUW=*/CheckNUW); uint32_t SOffset, ImmOffset; if ((int)Offset > 0 && @@ -1292,7 +1292,8 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets( // Handle the variable sgpr + vgpr case. 
MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); - if (Add && (int)Offset >= 0) { + if (Add && (int)Offset >= 0 && + (!CheckNUW || Add->getFlag(MachineInstr::NoUWrap))) { Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI); Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI); @@ -1561,8 +1562,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B, (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); - if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) - llvm_unreachable("failed to constrain BFE"); + constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this); MI.eraseFromParent(); return true; @@ -1873,11 +1873,11 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); B.buildInstr(AMDGPU::V_MOV_B32_e32) - .addDef(TmpReg0) - .addUse(SrcReg, 0, AMDGPU::sub0); + .addDef(TmpReg0) + .addUse(SrcReg, {}, AMDGPU::sub0); B.buildInstr(AMDGPU::V_MOV_B32_e32) - .addDef(TmpReg1) - .addUse(SrcReg, 0, AMDGPU::sub1); + .addDef(TmpReg1) + .addUse(SrcReg, {}, AMDGPU::sub1); B.buildInstr(AMDGPU::REG_SEQUENCE) .addDef(DstReg) .addUse(TmpReg0) @@ -2412,7 +2412,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DstBank == &AMDGPU::VCCRegBank) break; - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); LegalizerHelper Helper(*MF, ApplyBank, B); @@ -2492,7 +2492,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // There is no VALU abs instruction so we need to replace it with a sub and // max combination. 
if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); LegalizerHelper Helper(*MF, Apply, B); @@ -3114,6 +3114,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { @@ -3283,6 +3285,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 2); // M0 return; } + case Intrinsic::amdgcn_s_alloc_vgpr: + constrainOpWithReadfirstlane(B, MI, 2); + return; case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? @@ -3297,7 +3302,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 1); // M0 return; case Intrinsic::amdgcn_raw_buffer_load_lds: - case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_raw_buffer_load_async_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: { applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 1); // rsrc constrainOpWithReadfirstlane(B, MI, 2); // M0 @@ -3305,7 +3312,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_struct_buffer_load_lds: - case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: { applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 1); // rsrc constrainOpWithReadfirstlane(B, MI, 2); // M0 @@ -3321,7 
+3330,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_load_to_lds: - case Intrinsic::amdgcn_global_load_lds: { + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: { applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 2); return; @@ -3348,6 +3359,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 1); return; case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: constrainOpWithReadfirstlane(B, MI, 1); return; case Intrinsic::amdgcn_s_barrier_init: @@ -3496,6 +3508,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs); break; } + case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR: + case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR: case AMDGPU::G_LOAD: case AMDGPU::G_ZEXTLOAD: case AMDGPU::G_SEXTLOAD: { @@ -3607,7 +3621,7 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, } bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) @@ -3623,7 +3637,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3641,7 +3655,7 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingVOP(const 
MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3665,7 +3679,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3744,7 +3758,7 @@ AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 2> OpdsMapping(2); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); @@ -3834,7 +3848,7 @@ AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, // const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { @@ -4084,6 +4098,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_FPTOSI: case AMDGPU::G_FPTOUI: + case AMDGPU::G_FPTOSI_SAT: + case AMDGPU::G_FPTOUI_SAT: case AMDGPU::G_SITOFP: case AMDGPU::G_UITOFP: { unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -4502,6 +4518,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const 
MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { @@ -4577,6 +4595,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; + case AMDGPU::G_AMDGPU_SPONENTRY: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } case AMDGPU::G_INTRINSIC: case AMDGPU::G_INTRINSIC_CONVERGENT: { switch (cast<GIntrinsic>(MI).getIntrinsicID()) { @@ -4835,6 +4858,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_perm_pk16_b4_u4: case Intrinsic::amdgcn_perm_pk16_b6_u4: case Intrinsic::amdgcn_perm_pk16_b8_u4: + case Intrinsic::amdgcn_add_max_i32: + case Intrinsic::amdgcn_add_max_u32: + case Intrinsic::amdgcn_add_min_i32: + case Intrinsic::amdgcn_add_min_u32: + case Intrinsic::amdgcn_pk_add_max_i16: + case Intrinsic::amdgcn_pk_add_max_u16: + case Intrinsic::amdgcn_pk_add_min_i16: + case Intrinsic::amdgcn_pk_add_min_u16: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: @@ -5073,17 +5104,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned MinNumRegsRequired = DstSize / 32; const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? 
getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); @@ -5209,11 +5240,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_fadd: case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_fsub: case Intrinsic::amdgcn_wave_reduce_min: case Intrinsic::amdgcn_wave_reduce_umin: + case Intrinsic::amdgcn_wave_reduce_fmin: case Intrinsic::amdgcn_wave_reduce_max: case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_fmax: case Intrinsic::amdgcn_wave_reduce_and: case Intrinsic::amdgcn_wave_reduce_or: case Intrinsic::amdgcn_wave_reduce_xor: { @@ -5225,11 +5260,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize); break; } - case Intrinsic::amdgcn_s_bitreplicate: + case Intrinsic::amdgcn_s_bitreplicate: { Register MaskReg = MI.getOperand(2).getReg(); unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32); + break; + } 
+ case Intrinsic::amdgcn_wave_shuffle: { + unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); + break; + } } break; } @@ -5296,12 +5340,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: @@ -5311,12 +5353,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_ds_load_tr16_b128: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr6_b96: - case Intrinsic::amdgcn_flat_load_monitor_b32: - case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_flat_load_monitor_b128: - case Intrinsic::amdgcn_global_load_monitor_b32: - case Intrinsic::amdgcn_global_load_monitor_b64: - case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: @@ -5359,6 +5395,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); break; + case Intrinsic::amdgcn_s_alloc_vgpr: + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1); + OpdsMapping[2] = 
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // This must be an SGPR, but accept a VGPR. @@ -5418,7 +5458,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_raw_buffer_load_lds: - case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_raw_buffer_load_async_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); @@ -5451,7 +5493,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_struct_buffer_load_lds: - case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); @@ -5570,6 +5614,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; case Intrinsic::amdgcn_s_barrier_init: @@ -5696,6 +5741,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_FMAX: case AMDGPU::G_ATOMICRMW_UINC_WRAP: case AMDGPU::G_ATOMICRMW_UDEC_WRAP: + case AMDGPU::G_ATOMICRMW_USUB_COND: + case AMDGPU::G_ATOMICRMW_USUB_SAT: 
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); @@ -5728,6 +5775,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); break; + case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR: + case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); + break; + } } return getInstructionMapping(/*ID*/1, /*Cost*/1, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 1c1a6da..c37d309 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR", >; // It is helpful to distinguish conditions from ordinary SGPRs. -def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; +def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>; def AGPRRegBank : RegisterBank <"AGPR", [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0ea9add..4e664e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -256,17 +256,13 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( // Pseudo used just to encode the underlying global. Is there a better // way to track this? 
+ // TODO: Some of the generic call-like pseudos do not encode the callee, + // so we overly conservatively treat this as an indirect call. const MachineOperand *CalleeOp = TII->getNamedOperand(MI, AMDGPU::OpName::callee); - const Function *Callee = getCalleeFunction(*CalleeOp); - - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); + const Function *Callee = + CalleeOp ? getCalleeFunction(*CalleeOp) : nullptr; auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 89c16da..7a5db42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" @@ -96,8 +97,8 @@ public: /// Compute the register class constraints based on the uses of \p Reg, /// excluding MFMA uses from which can be rewritten to change the register - /// class constraint. This should be nearly identical to - /// MachineRegisterInfo::recomputeRegClass. + /// class constraint. MFMA scale operands need to be constraint checked. + /// This should be nearly identical to MachineRegisterInfo::recomputeRegClass. /// \p RewriteCandidates will collect the set of MFMA instructions that need /// to have the opcode mutated to perform the replacement. 
@@ -151,9 +152,16 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the // effects of rewrite candidates. It just so happens that we can use - // either AGPR or VGPR in src0/src1, so don't bother checking the - // constraint effects of the individual operands. + // either AGPR or VGPR in src0/src1. We still need to check constraint + // effects for scale variant, which does not allow AGPR. if (isRewriteCandidate(*MI)) { + int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()); + const MCInstrDesc &AGPRDesc = TII.get(AGPROp); + const TargetRegisterClass *NewRC = + TII.getRegClass(AGPRDesc, MO.getOperandNo()); + if (!TRI.hasAGPRs(NewRC)) + return false; + const MachineOperand *VDst = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); const MachineOperand *Src2 = @@ -587,10 +595,7 @@ public: static char ID; RegisterClassInfo RegClassInfo; - AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) { - initializeAMDGPURewriteAGPRCopyMFMALegacyPass( - *PassRegistry::getPassRegistry()); - } + AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -659,7 +664,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, if (!Impl.run(MF)) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); - PA.preserveSet<CFGAnalyses>(); - PA.preserve<LiveStacksAnalysis>(); + PA.preserveSet<CFGAnalyses>() + .preserve<LiveStacksAnalysis>() + .preserve<VirtRegMapAnalysis>() + .preserve<SlotIndexesAnalysis>() + .preserve<LiveIntervalsAnalysis>() + .preserve<LiveRegMatrixAnalysis>(); return PA; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 4b1f80c..a2e16c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -299,7 +299,7 @@ bool 
AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (Replacements.empty()) return false; - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); StructType *NewRetTy = StructType::create(Ctx, ReturnTypes, F.getName()); FunctionType *NewFuncTy = FunctionType::get(NewRetTy, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 2393346..963bb91 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; def : SourceOfDivergence<int_r600_read_tidig_x>; def : SourceOfDivergence<int_r600_read_tidig_y>; def : SourceOfDivergence<int_r600_read_tidig_z>; -def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>; -def : SourceOfDivergence<int_amdgcn_global_atomic_csub>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>; @@ -409,7 +407,17 @@ def : AlwaysUniform<int_amdgcn_cluster_workgroup_max_flat_id>; def : AlwaysUniform<int_amdgcn_workgroup_id_x>; def : AlwaysUniform<int_amdgcn_workgroup_id_y>; def : AlwaysUniform<int_amdgcn_workgroup_id_z>; +def : AlwaysUniform<int_amdgcn_s_alloc_vgpr>; def : AlwaysUniform<int_amdgcn_s_getpc>; def : AlwaysUniform<int_amdgcn_s_getreg>; def : AlwaysUniform<int_amdgcn_s_memrealtime>; def : AlwaysUniform<int_amdgcn_s_memtime>; + +def AMDGPUImageDMaskIntrinsicTable : GenericTable { + let FilterClass = "AMDGPUImageDMaskIntrinsic"; + let Fields = ["Intr"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic"; + let PrimaryKeyEarlyOut = 1; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp index 2941a48..5b8ee5f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp @@ -7,13 +7,53 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSelectionDAGInfo.h" -#include "AMDGPUISelLowering.h" + +#define GET_SDNODE_DESC +#include "AMDGPUGenSDNodeInfo.inc" using namespace llvm; +AMDGPUSelectionDAGInfo::AMDGPUSelectionDAGInfo() + : SelectionDAGGenTargetInfo(AMDGPUGenSDNodeInfo) {} + AMDGPUSelectionDAGInfo::~AMDGPUSelectionDAGInfo() = default; -bool AMDGPUSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= AMDGPUISD::FIRST_MEMORY_OPCODE && - Opcode <= AMDGPUISD::LAST_MEMORY_OPCODE; +const char *AMDGPUSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { +#define NODE_NAME_CASE(node) \ + case AMDGPUISD::node: \ + return "AMDGPUISD::" #node; + + switch (static_cast<AMDGPUISD::NodeType>(Opcode)) { + // These nodes don't have corresponding entries in *.td files yet. + NODE_NAME_CASE(WAVE_ADDRESS) + NODE_NAME_CASE(MAD_I64_I32) + NODE_NAME_CASE(MAD_U64_U32) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + // These do, but only when compiling R600.td, + // and the enum is generated from AMDGPU.td. 
+ NODE_NAME_CASE(DOT4) + NODE_NAME_CASE(TEXTURE_FETCH) + NODE_NAME_CASE(R600_EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(DUMMY_CHAIN) + } + +#undef NODE_NAME_CASE + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); +} + +void AMDGPUSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const { + switch (N->getOpcode()) { + case AMDGPUISD::IF: + // result #0 must have type i1, but has type i32/i64 + case AMDGPUISD::ELSE: + case AMDGPUISD::LOOP: + // operand #1 must have type i1, but has type i32/i64 + case AMDGPUISD::LDS: + // result #0 must have type i64 (iPTR), but has type i32 + return; + } + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h index 3280be7..bae614a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h @@ -11,13 +11,49 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "AMDGPUGenSDNodeInfo.inc" + namespace llvm { +namespace AMDGPUISD { + +enum NodeType : unsigned { + // Convert a unswizzled wave uniform stack address to an address compatible + // with a vector offset for use in stack access. + WAVE_ADDRESS = GENERATED_OPCODE_END, + + DOT4, + MAD_U64_U32, + MAD_I64_I32, + TEXTURE_FETCH, + R600_EXPORT, + CONST_ADDRESS, -class AMDGPUSelectionDAGInfo : public SelectionDAGTargetInfo { + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. 
+ /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, + + DUMMY_CHAIN, +}; + +} // namespace AMDGPUISD + +class AMDGPUSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + AMDGPUSelectionDAGInfo(); + ~AMDGPUSelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; + + void verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 9af8129..d04dc3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -314,9 +314,7 @@ public: #endif bool empty() const { return Nodes.empty(); } - const iterator_range<nodes_iterator> nodes() const { - return {Nodes.begin(), Nodes.end()}; - } + iterator_range<nodes_iterator> nodes() const { return Nodes; } const Node &getNode(unsigned ID) const { return *Nodes[ID]; } unsigned getNumNodes() const { return Nodes.size(); } @@ -993,7 +991,7 @@ void RecursiveSearchSplitting::run() { { SplitModuleTimer SMT("recursive_search_pick", "partitioning"); SplitProposal SP(SG, NumParts); - pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP); + pickPartition(/*BranchDepth=*/0, /*Idx=*/0, std::move(SP)); } } @@ -1140,7 +1138,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, LLVM_DEBUG(dbgs().indent(Depth) << " [lb] " << Idx << "=P" << CheapestPID << "? 
"); BranchSP.add(CheapestPID, Cluster); - pickPartition(Depth + 1, Idx + 1, BranchSP); + pickPartition(Depth + 1, Idx + 1, std::move(BranchSP)); } // ms = most similar = put in partition with the most in common @@ -1149,7 +1147,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, LLVM_DEBUG(dbgs().indent(Depth) << " [ms] " << Idx << "=P" << MostSimilarPID << "? "); BranchSP.add(MostSimilarPID, Cluster); - pickPartition(Depth + 1, Idx + 1, BranchSP); + pickPartition(Depth + 1, Idx + 1, std::move(BranchSP)); } return; @@ -1163,7 +1161,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" + std::to_string(NumProposalsSubmitted++)); LLVM_DEBUG(dbgs() << '\n'); - SubmitProposal(SP); + SubmitProposal(std::move(SP)); } std::pair<unsigned, CostType> diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 26e0b3df..300aca1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -32,16 +32,6 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-subtarget" -AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} - -bool AMDGPUSubtarget::useRealTrue16Insts() const { - return hasTrue16BitInsts() && EnableRealTrue16Insts; -} - -bool AMDGPUSubtarget::hasD16Writes32BitVgpr() const { - return EnableD16Writes32BitVgpr; -} - // Returns the maximum per-workgroup LDS allocation size (in bytes) that still // allows the given function to achieve an occupancy of NWaves waves per // SIMD / EU, taking into account only the function's *maximum* workgroup size. 
@@ -282,7 +272,7 @@ bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const { } bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { - Function *Kernel = I->getParent()->getParent(); + Function *Kernel = I->getFunction(); unsigned MinSize = 0; unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; bool IdQuery = false; @@ -350,7 +340,7 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { - assert(AMDGPU::isKernel(F.getCallingConv())); + assert(AMDGPU::isKernel(F)); // We don't allocate the segment if we know the implicit arguments weren't // used, even if the ABI implies we need them. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index ed03ef2..302fe7c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -42,40 +42,18 @@ public: GFX10 = 9, GFX11 = 10, GFX12 = 11, + GFX13 = 12, }; private: Triple TargetTriple; protected: - bool GCN3Encoding = false; - bool Has16BitInsts = false; - bool HasTrue16BitInsts = false; - bool HasFP8ConversionScaleInsts = false; - bool HasBF8ConversionScaleInsts = false; - bool HasFP4ConversionScaleInsts = false; - bool HasFP6BF6ConversionScaleInsts = false; - bool HasF16BF16ToFP6BF6ConversionScaleInsts = false; - bool HasCvtPkF16F32Inst = false; - bool HasF32ToF16BF16ConversionSRInsts = false; - bool EnableRealTrue16Insts = false; - bool EnableD16Writes32BitVgpr = false; - bool HasBF16TransInsts = false; - bool HasBF16ConversionInsts = false; - bool HasBF16PackedInsts = false; - bool HasMadMixInsts = false; - bool HasMadMacF32Insts = false; - bool HasDsSrc2Insts = false; - bool HasSDWA = false; - bool HasVOP3PInsts = false; bool HasMulI24 = true; bool HasMulU24 = true; bool HasSMulHi = false; - bool HasInv2PiInlineImm = false; bool HasFminFmaxLegacy = true; - bool EnablePromoteAlloca = false; - bool 
HasTrigReducedRange = false; - bool FastFMAF32 = false; + unsigned EUsPerCU = 4; unsigned MaxWavesPerEU = 10; unsigned LocalMemorySize = 0; @@ -83,7 +61,7 @@ protected: char WavefrontSizeLog2 = 0; public: - AMDGPUSubtarget(Triple TT); + AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} static const AMDGPUSubtarget &get(const MachineFunction &MF); static const AMDGPUSubtarget &get(const TargetMachine &TM, @@ -132,13 +110,6 @@ public: /// size, register usage, and/or lds usage. std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; - /// Overload which uses the specified values for the flat work group sizes, - /// rather than querying the function itself. \p FlatWorkGroupSizes Should - /// correspond to the function's value for getFlatWorkGroupSizes. - std::pair<unsigned, unsigned> - getWavesPerEU(const Function &F, - std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; - /// Overload which uses the specified values for the flat workgroup sizes and /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes /// should correspond to the function's value for getFlatWorkGroupSizes and \p @@ -206,16 +177,13 @@ public: bool isGCN() const { return TargetTriple.isAMDGCN(); } - bool isGCN3Encoding() const { - return GCN3Encoding; - } - - bool has16BitInsts() const { - return Has16BitInsts; - } - - /// Return true if the subtarget supports True16 instructions. - bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } + //==---------------------------------------------------------------------===// + // TableGen-generated feature getters. 
+ //==---------------------------------------------------------------------===// +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + virtual bool GETTER() const { return false; } +#include "AMDGPUGenSubtargetInfo.inc" + //==---------------------------------------------------------------------===// /// Return true if real (non-fake) variants of True16 instructions using /// 16-bit registers should be code-generated. Fake True16 instructions are @@ -223,56 +191,8 @@ public: /// operands and always use their low halves. // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully // supported and the support for fake True16 instructions is removed. - bool useRealTrue16Insts() const; - - bool hasD16Writes32BitVgpr() const; - - bool hasBF16TransInsts() const { return HasBF16TransInsts; } - - bool hasBF16ConversionInsts() const { - return HasBF16ConversionInsts; - } - - bool hasBF16PackedInsts() const { return HasBF16PackedInsts; } - - bool hasMadMixInsts() const { - return HasMadMixInsts; - } - - bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; } - - bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; } - - bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; } - - bool hasFP6BF6ConversionScaleInsts() const { - return HasFP6BF6ConversionScaleInsts; - } - - bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { - return HasF16BF16ToFP6BF6ConversionScaleInsts; - } - - bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; } - - bool hasF32ToF16BF16ConversionSRInsts() const { - return HasF32ToF16BF16ConversionSRInsts; - } - - bool hasMadMacF32Insts() const { - return HasMadMacF32Insts || !isGCN(); - } - - bool hasDsSrc2Insts() const { - return HasDsSrc2Insts; - } - - bool hasSDWA() const { - return HasSDWA; - } - - bool hasVOP3PInsts() const { - return HasVOP3PInsts; + bool useRealTrue16Insts() const { + return hasTrue16BitInsts() && enableRealTrue16Insts(); } bool 
hasMulI24() const { @@ -287,26 +207,10 @@ public: return HasSMulHi; } - bool hasInv2PiInlineImm() const { - return HasInv2PiInlineImm; - } - bool hasFminFmaxLegacy() const { return HasFminFmaxLegacy; } - bool hasTrigReducedRange() const { - return HasTrigReducedRange; - } - - bool hasFastFMAF32() const { - return FastFMAF32; - } - - bool isPromoteAllocaEnabled() const { - return EnablePromoteAlloca; - } - unsigned getWavefrontSize() const { return 1 << WavefrontSizeLog2; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4a9437b..3fd554a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -47,8 +47,8 @@ // corresponds to offset, second member corresponds to size of LDS global // being replaced and third represents the total aligned size. It will // have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have -// an intializer with static LDS related offsets and sizes initialized. -// But for dynamic LDS related entries, offsets will be intialized to +// an initializer with static LDS related offsets and sizes initialized. +// But for dynamic LDS related entries, offsets will be initialized to // previous static LDS allocation end offset. Sizes for them will be zero // initially. 
These dynamic LDS offset and size values will be updated // within the kernel, since kernel can read the dynamic LDS size @@ -271,7 +271,7 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { Function *CalledFunc = CallerCGN->getFunction(); if (!CalledFunc || CalledFunc->isDeclaration()) continue; - if (AMDGPU::isKernelLDS(CalledFunc)) + if (AMDGPU::isKernel(*CalledFunc)) continue; for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end(); AI != E; ++AI) { @@ -297,7 +297,7 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (User *V : GV->users()) { if (auto *I = dyn_cast<Instruction>(V)) { Function *F = I->getFunction(); - if (!isKernelLDS(F) && !F->isDeclaration()) + if (!isKernel(*F) && !F->isDeclaration()) FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV); } } @@ -523,7 +523,7 @@ static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV, auto ReplaceUsesLambda = [Func](const Use &U) -> bool { auto *V = U.getUser(); if (auto *Inst = dyn_cast<Instruction>(V)) { - auto *Func1 = Inst->getParent()->getParent(); + auto *Func1 = Inst->getFunction(); if (Func == Func1) return true; } @@ -1169,7 +1169,7 @@ bool AMDGPUSwLowerLDS::run() { if (!F || K.second.empty()) continue; - assert(isKernelLDS(F)); + assert(isKernel(*F)); // Only inserts if key isn't already in the map. 
FuncLDSAccessInfo.KernelToLDSParametersMap.insert( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4958a20..49c60c2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -17,9 +17,12 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" +#include "AMDGPUArgumentUsageInfo.h" +#include "AMDGPUBarrierLatency.h" #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" #include "AMDGPUExportKernelRuntimeHandles.h" +#include "AMDGPUHazardLatency.h" #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" #include "AMDGPULowerVGPREncoding.h" @@ -72,6 +75,7 @@ #include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/BranchRelaxation.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" +#include "llvm/CodeGen/EarlyIfConversion.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" @@ -140,29 +144,36 @@ public: const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC); - void addIRPasses(AddIRPass &) const; - void addCodeGenPrepare(AddIRPass &) const; - void addPreISel(AddIRPass &addPass) const; - void addILPOpts(AddMachinePass &) const; - void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; - Error addInstSelector(AddMachinePass &) const; - void addPreRewrite(AddMachinePass &) const; - void addMachineSSAOptimization(AddMachinePass &) const; - void addPostRegAlloc(AddMachinePass &) const; - void addPreEmitPass(AddMachinePass &) const; - void addPreEmitRegAlloc(AddMachinePass &) const; - Error addRegAssignmentOptimized(AddMachinePass &) const; - void addPreRegAlloc(AddMachinePass &) const; - void addOptimizedRegAlloc(AddMachinePass &) const; - void addPreSched2(AddMachinePass &) const; + void addIRPasses(PassManagerWrapper &PMW) const; + void 
addCodeGenPrepare(PassManagerWrapper &PMW) const; + void addPreISel(PassManagerWrapper &PMW) const; + void addILPOpts(PassManagerWrapper &PMWM) const; + void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const; + Error addInstSelector(PassManagerWrapper &PMW) const; + void addPreRewrite(PassManagerWrapper &PMW) const; + void addMachineSSAOptimization(PassManagerWrapper &PMW) const; + void addPostRegAlloc(PassManagerWrapper &PMW) const; + void addPreEmitPass(PassManagerWrapper &PMWM) const; + void addPreEmitRegAlloc(PassManagerWrapper &PMW) const; + Error addRegAssignmentFast(PassManagerWrapper &PMW) const; + Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const; + void addPreRegAlloc(PassManagerWrapper &PMW) const; + Error addFastRegAlloc(PassManagerWrapper &PMW) const; + Error addOptimizedRegAlloc(PassManagerWrapper &PMW) const; + void addPreSched2(PassManagerWrapper &PMW) const; + void addPostBBSections(PassManagerWrapper &PMW) const; + +private: + Error validateRegAllocOptions() const; +public: /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. Otherwise its default will be used /// given that a pass shall work at an optimization \p Level minimum. 
bool isPassEnabled(const cl::opt<bool> &Opt, CodeGenOptLevel Level = CodeGenOptLevel::Default) const; - void addEarlyCSEOrGVNPass(AddIRPass &) const; - void addStraightLineScalarOptimizationPasses(AddIRPass &) const; + void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const; + void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const; }; class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { @@ -237,6 +248,63 @@ static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false, cl::init(&useDefaultRegisterAllocator), cl::desc("Register allocator to use for WWM registers")); +// New pass manager register allocator options for AMDGPU +static cl::opt<RegAllocType, false, RegAllocTypeParser> SGPRRegAllocNPM( + "sgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default), + cl::desc("Register allocator for SGPRs (new pass manager)")); + +static cl::opt<RegAllocType, false, RegAllocTypeParser> VGPRRegAllocNPM( + "vgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default), + cl::desc("Register allocator for VGPRs (new pass manager)")); + +static cl::opt<RegAllocType, false, RegAllocTypeParser> WWMRegAllocNPM( + "wwm-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default), + cl::desc("Register allocator for WWM registers (new pass manager)")); + +/// Check if the given RegAllocType is supported for AMDGPU NPM register +/// allocation. Only Fast and Greedy are supported; Basic and PBQP are not. +static Error checkRegAllocSupported(RegAllocType RAType, StringRef RegName) { + if (RAType == RegAllocType::Basic || RAType == RegAllocType::PBQP) { + return make_error<StringError>( + Twine("unsupported register allocator '") + + (RAType == RegAllocType::Basic ? "basic" : "pbqp") + "' for " + + RegName + " registers", + inconvertibleErrorCode()); + } + return Error::success(); +} + +Error AMDGPUCodeGenPassBuilder::validateRegAllocOptions() const { + // 1. Generic --regalloc-npm is not supported for AMDGPU. 
+ if (Opt.RegAlloc != RegAllocType::Unset) { + return make_error<StringError>( + "-regalloc-npm not supported for amdgcn. Use -sgpr-regalloc-npm, " + "-vgpr-regalloc-npm, and -wwm-regalloc-npm", + inconvertibleErrorCode()); + } + + // 2. Legacy PM regalloc options are not compatible with NPM. + if (SGPRRegAlloc.getNumOccurrences() > 0 || + VGPRRegAlloc.getNumOccurrences() > 0 || + WWMRegAlloc.getNumOccurrences() > 0) { + return make_error<StringError>( + "-sgpr-regalloc, -vgpr-regalloc, and -wwm-regalloc are legacy PM " + "options. Use -sgpr-regalloc-npm, -vgpr-regalloc-npm, and " + "-wwm-regalloc-npm with the new pass manager", + inconvertibleErrorCode()); + } + + // 3. Only Fast and Greedy allocators are supported for AMDGPU. + if (auto Err = checkRegAllocSupported(SGPRRegAllocNPM, "SGPR")) + return Err; + if (auto Err = checkRegAllocSupported(WWMRegAllocNPM, "WWM")) + return Err; + if (auto Err = checkRegAllocSupported(VGPRRegAllocNPM, "VGPR")) + return Err; + + return Error::success(); +} + static void initializeDefaultSGPRRegisterAllocatorOnce() { RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); @@ -464,6 +532,11 @@ static cl::opt<bool> EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt<bool> EnableLowerExecSync( + "amdgpu-enable-lower-exec-sync", + cl::desc("Enable lowering of execution synchronization."), cl::init(true), + cl::Hidden); + static cl::opt<bool> EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", cl::desc("Enable lowering of lds to global memory pass " @@ -566,9 +639,10 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); + initializeAMDGPULowerExecSyncLegacyPass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); - initializeAMDGPUArgumentUsageInfoPass(*PR); + 
initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(*PR); initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); @@ -618,6 +692,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -639,6 +714,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); + DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF)); return DAG; } @@ -659,6 +736,8 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); + DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF)); return DAG; } @@ -737,7 +816,7 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { return "r600"; } -static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { +static Reloc::Model getEffectiveRelocModel() { // The AMDGPU toolchain only supports generating shared objects, so we // must always use PIC. 
return Reloc::PIC_; @@ -751,8 +830,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOptLevel OptLevel) : CodeGenTargetMachineImpl( T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options, - getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), + getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small), + OptLevel), TLOF(createTLOF(getTargetTriple())) { initAsmInfo(); if (TT.isAMDGCN()) { @@ -802,7 +881,8 @@ static bool mustPreserveGV(const GlobalValue &GV) { } void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { - AAM.registerFunctionAnalysis<AMDGPUAA>(); + if (EnableAMDGPUAliasAnalysis) + AAM.registerFunctionAnalysis<AMDGPUAA>(); } static Expected<ScanOptions> @@ -812,7 +892,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) { Params.consume_front("strategy="); auto Result = StringSwitch<std::optional<ScanOptions>>(Params) .Case("dpp", ScanOptions::DPP) - .Cases("iterative", "", ScanOptions::Iterative) + .Cases({"iterative", ""}, ScanOptions::Iterative) .Case("none", ScanOptions::None) .Default(std::nullopt); if (Result) @@ -884,9 +964,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); - - if (EnableUniformIntrinsicCombine) - PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -897,6 +974,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify) FPM.addPass(AMDGPUSimplifyLibCallsPass()); + + if (EnableUniformIntrinsicCombine) + FPM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerCGSCCOptimizerLateEPCallback( @@ -958,6 +1038,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N option as "best effort". 
// For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. + if (EnableLowerExecSync) + PM.addPass(AMDGPULowerExecSyncPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1197,6 +1279,8 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); + DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF)); return DAG; } //===----------------------------------------------------------------------===// @@ -1213,10 +1297,6 @@ class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(TargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) { - // It is necessary to know the register usage of the entire call graph. We - // allow calls without EnableAMDGPUFunctionCalls if they are marked - // noinline, so this is always required. - setRequiresCodeGenSCCOrder(true); substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } @@ -1310,6 +1390,9 @@ void AMDGPUPassConfig::addIRPasses() { isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + if (EnableUniformIntrinsicCombine) + addPass(createAMDGPUUniformIntrinsicCombineLegacyPass()); + // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); @@ -1325,6 +1408,10 @@ void AMDGPUPassConfig::addIRPasses() { // Make enqueued block runtime handles externally visible. addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); + // Lower special LDS accesses. + if (EnableLowerExecSync) + addPass(createAMDGPULowerExecSyncLegacyPass()); + // Lower LDS accesses to global memory pass if address sanitizer is enabled. 
if (EnableSwLowerLDS) addPass(createAMDGPUSwLowerLDSLegacyPass(&TM)); @@ -1410,9 +1497,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // many cases. addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); - // In accordance with the above FIXME, manually force all the - // function-level passes into a CGSCCPassManager. - addPass(new DummyCGSCCPass()); } // LowerSwitch pass may introduce unreachable blocks that can @@ -2007,6 +2091,42 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->ArgInfo.WorkItemIDZ, 0, 0))) return true; + // Parse FirstKernArgPreloadReg separately, since it's a Register, + // not ArgDescriptor. + if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) { + const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg; + + if (!A.IsRegister) { + // For stack arguments, we don't have RegisterName.SourceRange, + // but we should have some location info from the YAML parser + const MemoryBuffer &Buffer = + *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); + // Create a minimal valid source range + SMLoc Loc = SMLoc::getFromPointer(Buffer.getBufferStart()); + SMRange Range(Loc, Loc); + + Error = SMDiagnostic( + *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error, + "firstKernArgPreloadReg must be a register, not a stack location", "", + {}, {}); + + SourceRange = Range; + return true; + } + + Register Reg; + if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) { + SourceRange = A.RegisterName.SourceRange; + return true; + } + + if (!AMDGPU::SGPR_32RegClass.contains(Reg)) + return diagnoseRegisterClass(A.RegisterName); + + MFI->ArgInfo.FirstKernArgPreloadReg = Reg; + MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs; + } + if (ST.hasIEEEMode()) MFI->Mode.IEEE = YamlMFI.Mode.IEEE; if (ST.hasDX10ClampMode()) @@ -2046,63 +2166,74 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder( // Exceptions and StackMaps are not supported, so these passes 
will never do // anything. // Garbage collection is not supported. - disablePass<StackMapLivenessPass, FuncletLayoutPass, - ShadowStackGCLoweringPass>(); + disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass, + ShadowStackGCLoweringPass, GCLoweringPass>(); } -void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { - if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) - addPass(AMDGPURemoveIncompatibleFunctionsPass(TM)); +void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const { + if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) { + flushFPMsToMPM(PMW); + addModulePass(AMDGPURemoveIncompatibleFunctionsPass(TM), PMW); + } - addPass(AMDGPUPrintfRuntimeBindingPass()); + flushFPMsToMPM(PMW); + addModulePass(AMDGPUPrintfRuntimeBindingPass(), PMW); if (LowerCtorDtor) - addPass(AMDGPUCtorDtorLoweringPass()); + addModulePass(AMDGPUCtorDtorLoweringPass(), PMW); if (isPassEnabled(EnableImageIntrinsicOptimizer)) - addPass(AMDGPUImageIntrinsicOptimizerPass(TM)); + addFunctionPass(AMDGPUImageIntrinsicOptimizerPass(TM), PMW); + if (EnableUniformIntrinsicCombine) + addFunctionPass(AMDGPUUniformIntrinsicCombinePass(), PMW); // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. 
- addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering)); + flushFPMsToMPM(PMW); + addModulePass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering), PMW); - addPass(AMDGPUAlwaysInlinePass()); - addPass(AlwaysInlinerPass()); + addModulePass(AMDGPUAlwaysInlinePass(), PMW); + addModulePass(AlwaysInlinerPass(), PMW); - addPass(AMDGPUExportKernelRuntimeHandlesPass()); + addModulePass(AMDGPUExportKernelRuntimeHandlesPass(), PMW); + + if (EnableLowerExecSync) + addModulePass(AMDGPULowerExecSyncPass(), PMW); if (EnableSwLowerLDS) - addPass(AMDGPUSwLowerLDSPass(TM)); + addModulePass(AMDGPUSwLowerLDSPass(TM), PMW); // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) - addPass(AMDGPULowerModuleLDSPass(TM)); + addModulePass(AMDGPULowerModuleLDSPass(TM), PMW); // Run atomic optimizer before Atomic Expand if (TM.getOptLevel() >= CodeGenOptLevel::Less && (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) - addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy)); + addFunctionPass( + AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy), PMW); - addPass(AtomicExpandPass(&TM)); + addFunctionPass(AtomicExpandPass(TM), PMW); if (TM.getOptLevel() > CodeGenOptLevel::None) { - addPass(AMDGPUPromoteAllocaPass(TM)); + addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW); if (isPassEnabled(EnableScalarIRPasses)) - addStraightLineScalarOptimizationPasses(addPass); + addStraightLineScalarOptimizationPasses(PMW); // TODO: Handle EnableAMDGPUAliasAnalysis // TODO: May want to move later or split into an early and late one. - addPass(AMDGPUCodeGenPreparePass(TM)); + addFunctionPass(AMDGPUCodeGenPreparePass(TM), PMW); // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may // have expanded. 
if (TM.getOptLevel() > CodeGenOptLevel::Less) { - addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), - /*UseMemorySSA=*/true)); + addFunctionPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), + /*UseMemorySSA=*/true), + PMW); } } - Base::addIRPasses(addPass); + Base::addIRPasses(PMW); // EarlyCSE is not always strong enough to clean up what LSR produces. For // example, GVN can combine @@ -2117,20 +2248,23 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { // // but EarlyCSE can do neither of them. if (isPassEnabled(EnableScalarIRPasses)) - addEarlyCSEOrGVNPass(addPass); + addEarlyCSEOrGVNPass(PMW); } -void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { - if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(AMDGPUPreloadKernelArgumentsPass(TM)); +void AMDGPUCodeGenPassBuilder::addCodeGenPrepare( + PassManagerWrapper &PMW) const { + if (TM.getOptLevel() > CodeGenOptLevel::None) { + flushFPMsToMPM(PMW); + addModulePass(AMDGPUPreloadKernelArgumentsPass(TM), PMW); + } if (EnableLowerKernelArguments) - addPass(AMDGPULowerKernelArgumentsPass(TM)); + addFunctionPass(AMDGPULowerKernelArgumentsPass(TM), PMW); - Base::addCodeGenPrepare(addPass); + Base::addCodeGenPrepare(PMW); if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); + addFunctionPass(LoadStoreVectorizerPass(), PMW); // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -2139,102 +2273,160 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. 
- addPass(AMDGPULowerBufferFatPointersPass(TM)); - addPass.requireCGSCCOrder(); + flushFPMsToMPM(PMW); + addModulePass(AMDGPULowerBufferFatPointersPass(TM), PMW); + flushFPMsToMPM(PMW); + requireCGSCCOrder(PMW); - addPass(AMDGPULowerIntrinsicsPass(TM)); + addModulePass(AMDGPULowerIntrinsicsPass(TM), PMW); // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the // pass flow. - addPass(LowerSwitchPass()); + addFunctionPass(LowerSwitchPass(), PMW); } -void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const { + + // Require AMDGPUArgumentUsageAnalysis so that it's available during ISel. + flushFPMsToMPM(PMW); + addModulePass(RequireAnalysisPass<AMDGPUArgumentUsageAnalysis, Module>(), + PMW); if (TM.getOptLevel() > CodeGenOptLevel::None) { - addPass(FlattenCFGPass()); - addPass(SinkingPass()); - addPass(AMDGPULateCodeGenPreparePass(TM)); + addFunctionPass(FlattenCFGPass(), PMW); + addFunctionPass(SinkingPass(), PMW); + addFunctionPass(AMDGPULateCodeGenPreparePass(TM), PMW); } // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. 
- addPass(AMDGPUUnifyDivergentExitNodesPass()); - addPass(FixIrreduciblePass()); - addPass(UnifyLoopExitsPass()); - addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false)); + addFunctionPass(AMDGPUUnifyDivergentExitNodesPass(), PMW); + addFunctionPass(FixIrreduciblePass(), PMW); + addFunctionPass(UnifyLoopExitsPass(), PMW); + addFunctionPass(StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW); - addPass(AMDGPUAnnotateUniformValuesPass()); + addFunctionPass(AMDGPUAnnotateUniformValuesPass(), PMW); - addPass(SIAnnotateControlFlowPass(TM)); + addFunctionPass(SIAnnotateControlFlowPass(TM), PMW); // TODO: Move this right after structurizeCFG to avoid extra divergence // analysis. This depends on stopping SIAnnotateControlFlow from making // control flow modifications. - addPass(AMDGPURewriteUndefForPHIPass()); + addFunctionPass(AMDGPURewriteUndefForPHIPass(), PMW); if (!getCGPassBuilderOption().EnableGlobalISelOption || !isGlobalISelAbortEnabled() || !NewRegBankSelect) - addPass(LCSSAPass()); + addFunctionPass(LCSSAPass(), PMW); - if (TM.getOptLevel() > CodeGenOptLevel::Less) - addPass(AMDGPUPerfHintAnalysisPass(TM)); + if (TM.getOptLevel() > CodeGenOptLevel::Less) { + flushFPMsToMPM(PMW); + addModulePass(AMDGPUPerfHintAnalysisPass(TM), PMW); + } // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why // isn't this in addInstSelector? 
- addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(), - /*Force=*/true); + addFunctionPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(), PMW, + /*Force=*/true); } -void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const { if (EnableEarlyIfConversion) - addPass(EarlyIfConverterPass()); + addMachineFunctionPass(EarlyIfConverterPass(), PMW); - Base::addILPOpts(addPass); + Base::addILPOpts(PMW); } -void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass, +void AMDGPUCodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const { // TODO: Add AsmPrinter. } -Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const { - addPass(AMDGPUISelDAGToDAGPass(TM)); - addPass(SIFixSGPRCopiesPass()); - addPass(SILowerI1CopiesPass()); +Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const { + addMachineFunctionPass(AMDGPUISelDAGToDAGPass(TM), PMW); + addMachineFunctionPass(SIFixSGPRCopiesPass(), PMW); + addMachineFunctionPass(SILowerI1CopiesPass(), PMW); return Error::success(); } -void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const { if (EnableRegReassign) { - addPass(GCNNSAReassignPass()); + addMachineFunctionPass(GCNNSAReassignPass(), PMW); } + + addMachineFunctionPass(AMDGPURewriteAGPRCopyMFMAPass(), PMW); } void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization( - AddMachinePass &addPass) const { - Base::addMachineSSAOptimization(addPass); + PassManagerWrapper &PMW) const { + Base::addMachineSSAOptimization(PMW); - addPass(SIFoldOperandsPass()); + addMachineFunctionPass(SIFoldOperandsPass(), PMW); if (EnableDPPCombine) { - addPass(GCNDPPCombinePass()); + addMachineFunctionPass(GCNDPPCombinePass(), PMW); } - addPass(SILoadStoreOptimizerPass()); + 
addMachineFunctionPass(SILoadStoreOptimizerPass(), PMW); if (isPassEnabled(EnableSDWAPeephole)) { - addPass(SIPeepholeSDWAPass()); - addPass(EarlyMachineLICMPass()); - addPass(MachineCSEPass()); - addPass(SIFoldOperandsPass()); + addMachineFunctionPass(SIPeepholeSDWAPass(), PMW); + addMachineFunctionPass(EarlyMachineLICMPass(), PMW); + addMachineFunctionPass(MachineCSEPass(), PMW); + addMachineFunctionPass(SIFoldOperandsPass(), PMW); } - addPass(DeadMachineInstructionElimPass()); - addPass(SIShrinkInstructionsPass()); + addMachineFunctionPass(DeadMachineInstructionElimPass(), PMW); + addMachineFunctionPass(SIShrinkInstructionsPass(), PMW); +} + +Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const { + insertPass<PHIEliminationPass>(SILowerControlFlowPass()); + + insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass()); + + return Base::addFastRegAlloc(PMW); } -void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( - AddMachinePass &addPass) const { +Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast( + PassManagerWrapper &PMW) const { + if (auto Err = validateRegAllocOptions()) + return Err; + + addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW); + + // SGPR allocation - default to fast at -O0. + if (SGPRRegAllocNPM == RegAllocType::Greedy) + addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW); + else + addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}), + PMW); + + // Equivalent of PEI for SGPRs. + addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW); + + // To Allocate wwm registers used in whole quad mode operations (for shaders). + addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW); + + // WWM allocation - default to fast at -O0. 
+ if (WWMRegAllocNPM == RegAllocType::Greedy) + addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW); + else + addMachineFunctionPass( + RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW); + + addMachineFunctionPass(SILowerWWMCopiesPass(), PMW); + addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW); + + // VGPR allocation - default to fast at -O0. + if (VGPRRegAllocNPM == RegAllocType::Greedy) + addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW); + else + addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW); + + return Error::success(); +} + +Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( + PassManagerWrapper &PMW) const { if (EnableDCEInRA) insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass()); @@ -2269,90 +2461,108 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( if (TM.getOptLevel() > CodeGenOptLevel::Less) insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass()); - Base::addOptimizedRegAlloc(addPass); + return Base::addOptimizedRegAlloc(PMW); } -void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const { if (getOptLevel() != CodeGenOptLevel::None) - addPass(AMDGPUPrepareAGPRAllocPass()); + addMachineFunctionPass(AMDGPUPrepareAGPRAllocPass(), PMW); } Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( - AddMachinePass &addPass) const { - // TODO: Check --regalloc-npm option + PassManagerWrapper &PMW) const { + if (auto Err = validateRegAllocOptions()) + return Err; - addPass(GCNPreRALongBranchRegPass()); + addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW); - addPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"})); + // SGPR allocation - default to greedy at -O1 and above. 
+ if (SGPRRegAllocNPM == RegAllocType::Fast) + addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}), + PMW); + else + addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW); // Commit allocated register changes. This is mostly necessary because too // many things rely on the use lists of the physical registers, such as the // verifier. This is only necessary with allocators which use LiveIntervals, // since FastRegAlloc does the replacements itself. - addPass(VirtRegRewriterPass(false)); + addMachineFunctionPass(VirtRegRewriterPass(false), PMW); // At this point, the sgpr-regalloc has been done and it is good to have the // stack slot coloring to try to optimize the SGPR spill stack indices before // attempting the custom SGPR spill lowering. - addPass(StackSlotColoringPass()); + addMachineFunctionPass(StackSlotColoringPass(), PMW); // Equivalent of PEI for SGPRs. - addPass(SILowerSGPRSpillsPass()); + addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW); // To Allocate wwm registers used in whole quad mode operations (for shaders). - addPass(SIPreAllocateWWMRegsPass()); - - // For allocating other wwm register operands. - addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"})); - addPass(SILowerWWMCopiesPass()); - addPass(VirtRegRewriterPass(false)); - addPass(AMDGPUReserveWWMRegsPass()); - - // For allocating per-thread VGPRs. - addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"})); + addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW); + // WWM allocation - default to greedy at -O1 and above. 
+ if (WWMRegAllocNPM == RegAllocType::Fast) + addMachineFunctionPass( + RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW); + else + addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW); + addMachineFunctionPass(SILowerWWMCopiesPass(), PMW); + addMachineFunctionPass(VirtRegRewriterPass(false), PMW); + addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW); + + // VGPR allocation - default to greedy at -O1 and above. + if (VGPRRegAllocNPM == RegAllocType::Fast) + addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW); + else + addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW); - addPreRewrite(addPass); - addPass(VirtRegRewriterPass(true)); + addPreRewrite(PMW); + addMachineFunctionPass(VirtRegRewriterPass(true), PMW); - addPass(AMDGPUMarkLastScratchLoadPass()); + addMachineFunctionPass(AMDGPUMarkLastScratchLoadPass(), PMW); return Error::success(); } -void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { - addPass(SIFixVGPRCopiesPass()); +void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const { + addMachineFunctionPass(SIFixVGPRCopiesPass(), PMW); if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(SIOptimizeExecMaskingPass()); - Base::addPostRegAlloc(addPass); + addMachineFunctionPass(SIOptimizeExecMaskingPass(), PMW); + Base::addPostRegAlloc(PMW); } -void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const { if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(SIShrinkInstructionsPass()); - addPass(SIPostRABundlerPass()); + addMachineFunctionPass(SIShrinkInstructionsPass(), PMW); + addMachineFunctionPass(SIPostRABundlerPass(), PMW); } -void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPostBBSections( + PassManagerWrapper &PMW) const { + // We run this later to avoid passes like 
livedebugvalues and BBSections + // having to deal with the apparent multi-entry functions we may generate. + addMachineFunctionPass(AMDGPUPreloadKernArgPrologPass(), PMW); +} + +void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { - addPass(GCNCreateVOPDPass()); + addMachineFunctionPass(GCNCreateVOPDPass(), PMW); } - addPass(SIMemoryLegalizerPass()); - addPass(SIInsertWaitcntsPass()); + addMachineFunctionPass(SIMemoryLegalizerPass(), PMW); + addMachineFunctionPass(SIInsertWaitcntsPass(), PMW); - // TODO: addPass(SIModeRegisterPass()); + addMachineFunctionPass(SIModeRegisterPass(), PMW); - if (TM.getOptLevel() > CodeGenOptLevel::None) { - // TODO: addPass(SIInsertHardClausesPass()); - } + if (TM.getOptLevel() > CodeGenOptLevel::None) + addMachineFunctionPass(SIInsertHardClausesPass(), PMW); - addPass(SILateBranchLoweringPass()); + addMachineFunctionPass(SILateBranchLoweringPass(), PMW); if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) - addPass(AMDGPUSetWavePriorityPass()); + addMachineFunctionPass(AMDGPUSetWavePriorityPass(), PMW); if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(SIPreEmitPeepholePass()); + addMachineFunctionPass(SIPreEmitPeepholePass(), PMW); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there @@ -2362,15 +2572,15 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { // // Here we add a stand-alone hazard recognizer pass which can handle all // cases. 
- addPass(PostRAHazardRecognizerPass()); - addPass(AMDGPUWaitSGPRHazardsPass()); - addPass(AMDGPULowerVGPREncodingPass()); + addMachineFunctionPass(PostRAHazardRecognizerPass(), PMW); + addMachineFunctionPass(AMDGPUWaitSGPRHazardsPass(), PMW); + addMachineFunctionPass(AMDGPULowerVGPREncodingPass(), PMW); if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) { - addPass(AMDGPUInsertDelayAluPass()); + addMachineFunctionPass(AMDGPUInsertDelayAluPass(), PMW); } - addPass(BranchRelaxationPass()); + addMachineFunctionPass(BranchRelaxationPass(), PMW); } bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt, @@ -2382,32 +2592,33 @@ bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt, return Opt; } -void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const { +void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass( + PassManagerWrapper &PMW) const { if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) - addPass(GVNPass()); + addFunctionPass(GVNPass(), PMW); else - addPass(EarlyCSEPass()); + addFunctionPass(EarlyCSEPass(), PMW); } void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses( - AddIRPass &addPass) const { + PassManagerWrapper &PMW) const { if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive)) - addPass(LoopDataPrefetchPass()); + addFunctionPass(LoopDataPrefetchPass(), PMW); - addPass(SeparateConstOffsetFromGEPPass()); + addFunctionPass(SeparateConstOffsetFromGEPPass(), PMW); // ReassociateGEPs exposes more opportunities for SLSR. See // the example in reassociate-geps-and-slsr.ll. - addPass(StraightLineStrengthReducePass()); + addFunctionPass(StraightLineStrengthReducePass(), PMW); // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or // EarlyCSE can reuse. - addEarlyCSEOrGVNPass(addPass); + addEarlyCSEOrGVNPass(PMW); // Run NaryReassociate after EarlyCSE/GVN to be more effective. 
- addPass(NaryReassociatePass()); + addFunctionPass(NaryReassociatePass(), PMW); // NaryReassociate on GEPs creates redundant common expressions, so run // EarlyCSE after it. - addPass(EarlyCSEPass()); + addFunctionPass(EarlyCSEPass(), PMW); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 03d16fd..d4a6838 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB( static cl::opt<unsigned> MemcpyLoopUnroll( "amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " - "operations when lowering memcpy as a loop"), + "operations when lowering statically-sized memcpy, memmove, or" + "memset as a loop"), cl::init(16), cl::Hidden); static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, @@ -206,9 +207,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences( dyn_cast<AllocaInst>(getUnderlyingObject(Ptr)); if (!Alloca || !Alloca->isStaticAlloca()) continue; - Type *Ty = Alloca->getAllocatedType(); - unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; - if (AllocaSize > MaxAlloca) + auto AllocaSize = Alloca->getAllocationSize(DL); + if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca) continue; } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { @@ -285,7 +285,7 @@ uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { // Codegen control options which don't matter. 
AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler, - AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal, + AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal, AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode, @@ -300,7 +300,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { AMDGPU::FeatureSRAMECC, // Perf-tuning features - AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops}; + AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops}; GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getDataLayout()), @@ -804,7 +804,7 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, InstRate = getFullRateInstrCost(); static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16}; - if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; })) + if (any_of(ValidSatTys, equal_to(LT.second))) NElts = 1; break; } @@ -883,10 +883,9 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, return LT.first * getHalfRateInstrCost(CostKind); } -InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - TTI::TargetCostKind CostKind, - unsigned Index, const Value *Op0, - const Value *Op1) const { +InstructionCost GCNTTIImpl::getVectorInstrCost( + unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, + const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { @@ -895,8 +894,8 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, if (EltSize < 32) { if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) return 0; - return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, - Op1); + return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1, + VIC); } // Extracts are just reads of a subregister, so are 
free. Inserts are @@ -907,7 +906,8 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, return Index == ~0u ? 2 : 0; } default: - return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1); + return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1, + VIC); } } @@ -1150,41 +1150,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx); return NewVal; } - case Intrinsic::ptrmask: { - unsigned OldAS = OldV->getType()->getPointerAddressSpace(); - unsigned NewAS = NewV->getType()->getPointerAddressSpace(); - Value *MaskOp = II->getArgOperand(1); - Type *MaskTy = MaskOp->getType(); - - bool DoTruncate = false; - - const GCNTargetMachine &TM = - static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine()); - if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) { - // All valid 64-bit to 32-bit casts work by chopping off the high - // bits. Any masking only clearing the low bits will also apply in the new - // address space. - if (DL.getPointerSizeInBits(OldAS) != 64 || - DL.getPointerSizeInBits(NewAS) != 32) - return nullptr; - - // TODO: Do we need to thread more context in here? - KnownBits Known = computeKnownBits(MaskOp, DL, nullptr, II); - if (Known.countMinLeadingOnes() < 32) - return nullptr; - - DoTruncate = true; - } - - IRBuilder<> B(II); - if (DoTruncate) { - MaskTy = B.getInt32Ty(); - MaskOp = B.CreateTrunc(MaskOp, MaskTy); - } - - return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy}, - {NewV, MaskOp}); - } case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: { Type *DestTy = II->getType(); @@ -1241,46 +1206,123 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, (ScalarSize == 16 || ScalarSize == 8)) { // Larger vector widths may require additional instructions, but are // typically cheaper than scalarized versions. 
- unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements(); - unsigned RequestedElts = - count_if(Mask, [](int MaskElt) { return MaskElt != -1; }); + // + // We assume that shuffling at a register granularity can be done for free. + // This is not true for vectors fed into memory instructions, but it is + // effectively true for all other shuffling. The emphasis of the logic here + // is to assist generic transform in cleaning up / canonicalizing those + // shuffles. + + // With op_sel VOP3P instructions freely can access the low half or high + // half of a register, so any swizzle of two elements is free. + if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) { + unsigned NumSrcElts = SrcVecTy->getNumElements(); + if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 && + (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse || + Kind == TTI::SK_PermuteSingleSrc)) + return 0; + } + unsigned EltsPerReg = 32 / ScalarSize; - if (RequestedElts == 0) - return 0; switch (Kind) { case TTI::SK_Broadcast: + // A single v_perm_b32 can be re-used for all destination registers. + return 1; case TTI::SK_Reverse: - case TTI::SK_PermuteSingleSrc: { - // With op_sel VOP3P instructions freely can access the low half or high - // half of a register, so any swizzle of two elements is free. - if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2) - return 0; - unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; - // SK_Broadcast just reuses the same mask - unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; - return NumPerms + NumPermMasks; - } + // One instruction per register. 
+ if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy)) + return divideCeil(DstVecTy->getNumElements(), EltsPerReg); + return InstructionCost::getInvalid(); case TTI::SK_ExtractSubvector: + if (Index % EltsPerReg == 0) + return 0; // Shuffling at register granularity + if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy)) + return divideCeil(DstVecTy->getNumElements(), EltsPerReg); + return InstructionCost::getInvalid(); case TTI::SK_InsertSubvector: { - // Even aligned accesses are free - if (!(Index % 2)) - return 0; - // Insert/extract subvectors only require shifts / extract code to get the - // relevant bits - return alignTo(RequestedElts, EltsPerReg) / EltsPerReg; + auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy); + if (!DstVecTy) + return InstructionCost::getInvalid(); + unsigned NumDstElts = DstVecTy->getNumElements(); + unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements(); + unsigned EndIndex = Index + NumInsertElts; + unsigned BeginSubIdx = Index % EltsPerReg; + unsigned EndSubIdx = EndIndex % EltsPerReg; + unsigned Cost = 0; + + if (BeginSubIdx != 0) { + // Need to shift the inserted vector into place. The cost is the number + // of destination registers overlapped by the inserted vector. + Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg); + } + + // If the last register overlap is partial, there may be three source + // registers feeding into it; that takes an extra instruction. + if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx) + Cost += 1; + + return Cost; } - case TTI::SK_PermuteTwoSrc: - case TTI::SK_Splice: - case TTI::SK_Select: { - unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; - // SK_Select just reuses the same mask - unsigned NumPermMasks = Kind == TTI::SK_Select ? 
1 : NumPerms; - return NumPerms + NumPermMasks; + case TTI::SK_Splice: { + auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy); + if (!DstVecTy) + return InstructionCost::getInvalid(); + unsigned NumElts = DstVecTy->getNumElements(); + assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements()); + // Determine the sub-region of the result vector that requires + // sub-register shuffles / mixing. + unsigned EltsFromLHS = NumElts - Index; + bool LHSIsAligned = (Index % EltsPerReg) == 0; + bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0; + if (LHSIsAligned && RHSIsAligned) + return 0; + if (LHSIsAligned && !RHSIsAligned) + return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg); + if (!LHSIsAligned && RHSIsAligned) + return divideCeil(EltsFromLHS, EltsPerReg); + return divideCeil(NumElts, EltsPerReg); } - default: break; } + + if (!Mask.empty()) { + unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements(); + + // Generically estimate the cost by assuming that each destination + // register is derived from sources via v_perm_b32 instructions if it + // can't be copied as-is. + // + // For each destination register, derive the cost of obtaining it based + // on the number of source registers that feed into it. 
+ unsigned Cost = 0; + for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) { + SmallVector<int, 4> Regs; + bool Aligned = true; + for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) { + int SrcIdx = Mask[DstIdx + I]; + if (SrcIdx == -1) + continue; + int Reg; + if (SrcIdx < (int)NumSrcElts) { + Reg = SrcIdx / EltsPerReg; + if (SrcIdx % EltsPerReg != I) + Aligned = false; + } else { + Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg; + if ((SrcIdx - NumSrcElts) % EltsPerReg != I) + Aligned = false; + } + if (!llvm::is_contained(Regs, Reg)) + Regs.push_back(Reg); + } + if (Regs.size() >= 2) + Cost += Regs.size() - 1; + else if (!Aligned) + Cost += 1; + } + return Cost; + } } return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, @@ -1299,8 +1341,60 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I, if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); })) continue; - if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) + if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) { Ops.push_back(&Op); + continue; + } + + // Check for zero-cost multiple use InsertElement/ExtractElement + // instructions + if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) { + if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) { + Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0)); + if (VecOpInst && VecOpInst->hasOneUse()) + continue; + + if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(), + TTI::TCK_RecipThroughput, 0, + OpInst->getOperand(0), + OpInst->getOperand(1)) == 0) { + Ops.push_back(&Op); + continue; + } + } + } + + if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) { + + unsigned EltSize = DL.getTypeSizeInBits( + cast<VectorType>(Shuffle->getType())->getElementType()); + + // For i32 (or greater) shufflevectors, these will be lowered into a + // series of insert / extract elements, which will be coalesced away. 
+ if (EltSize < 16 || !ST->has16BitInsts()) + continue; + + int NumSubElts, SubIndex; + if (Shuffle->changesLength()) { + if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) { + Ops.push_back(&Op); + continue; + } + + if ((Shuffle->isExtractSubvectorMask(SubIndex) || + Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) && + !(SubIndex & 0x1)) { + Ops.push_back(&Op); + continue; + } + } + + if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() || + Shuffle->isSingleSource()) { + Ops.push_back(&Op); + continue; + } + } } return !Ops.empty(); @@ -1413,7 +1507,8 @@ static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second) continue; - AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); + if (auto Size = AI->getAllocationSize(DL)) + AllocaSize += Size->getFixedValue(); } return AllocaSize; } @@ -1467,10 +1562,13 @@ unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB, Threshold += Threshold / 2; } - auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType()); + auto ArgAllocaSize = AI->getAllocationSize(DL); + if (!ArgAllocaSize) + return 0; // Attribute the bonus proportionally to the alloca size - unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize; + unsigned AllocaThresholdBonus = + (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize; return AllocaThresholdBonus; } @@ -1574,3 +1672,14 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const { } return BaseT::getNumberOfParts(Tp); } + +InstructionUniformity +GCNTTIImpl::getInstructionUniformity(const Value *V) const { + if (isAlwaysUniform(V)) + return InstructionUniformity::AlwaysUniform; + + if (isSourceOfDivergence(V)) + return InstructionUniformity::NeverUniform; + + return InstructionUniformity::Default; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 20da834..3ec157aa 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -101,6 +101,14 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const; + /// \returns true if V might be divergent even when all of its operands + /// are uniform. + bool isSourceOfDivergence(const Value *V) const; + + /// Returns true for the target specific set of operations which produce + /// uniform result even taking non-uniform arguments. + bool isAlwaysUniform(const Value *V) const; + public: explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); @@ -168,14 +176,13 @@ public: ArrayRef<unsigned> Indices = {}) const; using BaseT::getVectorInstrCost; - InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, - TTI::TargetCostKind CostKind, - unsigned Index, const Value *Op0, - const Value *Op1) const override; + InstructionCost + getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, + unsigned Index, const Value *Op0, const Value *Op1, + TTI::VectorInstrContext VIC = + TTI::VectorInstrContext::None) const override; bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const; - bool isSourceOfDivergence(const Value *V) const override; - bool isAlwaysUniform(const Value *V) const override; bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override { // Address space casts must cast between different address spaces. @@ -302,6 +309,8 @@ public: /// together under a single i32 value. Otherwise fall back to base /// implementation. 
unsigned getNumberOfParts(Type *Tp) const override; + + InstructionUniformity getInstructionUniformity(const Value *V) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8..864d877 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -16,12 +16,6 @@ /// uniformity. And every instruction that's downstream and cares about dynamic /// uniformity must be convergent (and isel will introduce v_readfirstlane for /// them if their operands can't be proven statically uniform). -/// -/// This pass is implemented as a ModulePass because intrinsic declarations -/// exist at the module scope, allowing us to skip processing entirely if no -/// declarations are present and to traverse their user lists directly when -/// they are. A FunctionPass would instead require scanning every instruction -/// in every function to find relevant intrinsics, which is far less efficient. //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -63,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, const UniformityInfo &UI, ValueMap<const Value *, bool> &Tracker) { llvm::Intrinsic::ID IID = II.getIntrinsicID(); - + /// We deliberately do not simplify readfirstlane with a uniform argument, so + /// that frontends can use it to force a copy to SGPR and thereby prevent the + /// backend from generating unwanted waterfall loops. 
switch (IID) { case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { Value *Src = II.getArgOperand(0); if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) @@ -97,14 +92,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, Tracker[NotOp] = true; // NOT preserves uniformity LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); ICmp->replaceAllUsesWith(NotOp); - ICmp->eraseFromParent(); Changed = true; } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { // Case: (icmp ne %ballot, 0) -> %ballot_arg LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " << *Src << '\n'); ICmp->replaceAllUsesWith(Src); - ICmp->eraseFromParent(); Changed = true; } } @@ -114,46 +107,95 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, II.eraseFromParent(); return Changed; } + case Intrinsic::amdgcn_wave_shuffle: { + Use &Val = II.getOperandUse(0); + Use &Idx = II.getOperandUse(1); + + // Like with readlane, if Value is uniform then just propagate it + if (!isDivergentUseWithNew(Val, UI, Tracker)) { + II.replaceAllUsesWith(Val); + II.eraseFromParent(); + return true; + } + + // Otherwise, when Index is uniform, this is just a readlane operation + if (isDivergentUseWithNew(Idx, UI, Tracker)) + return false; + + // The readlane intrinsic we want to call has the exact same function + // signature, so we can quickly modify the instruction in-place + Module *Mod = II.getModule(); + II.setCalledFunction(Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::amdgcn_readlane, II.getType())); + return true; + } default: - llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + return false; } return false; } -/// Iterates over intrinsic declarations in the module to optimize their uses. -static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { +/// Iterates over intrinsic calls in the Function to optimize. 
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { bool IsChanged = false; ValueMap<const Value *, bool> Tracker; - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - for (Function &F : M) { - switch (F.getIntrinsicID()) { - case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_readlane: - case Intrinsic::amdgcn_ballot: - break; - default: + for (Instruction &I : make_early_inc_range(instructions(F))) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II) continue; - } - - for (User *U : make_early_inc_range(F.users())) { - auto *II = cast<IntrinsicInst>(U); - Function *ParentF = II->getFunction(); - const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); - IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); - } + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; } PreservedAnalyses -AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runUniformIntrinsicCombine(M, AM)) +AMDGPUUniformIntrinsicCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &UI = AM.getResult<UniformityInfoAnalysis>(F); + if (!runUniformIntrinsicCombine(F, UI)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<UniformityInfoAnalysis>(); return PA; } + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {} + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool 
AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + const UniformityInfo &UI = + getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + return runUniformIntrinsicCombine(F, UI); +} + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d5..fe81a5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. 
+ BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. + BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? 
nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. 
- BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index 61c5dcd..faef408 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "llvm/ADT/SetVector.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; @@ -44,6 +45,7 @@ namespace { class AMDGPUWaitSGPRHazards { public: + const GCNSubtarget *ST; const SIInstrInfo *TII; const SIRegisterInfo *TRI; const MachineRegisterInfo *MRI; @@ -54,7 +56,7 @@ public: bool CullSGPRHazardsAtMemWait; unsigned CullSGPRHazardsMemWaitThreshold; - AMDGPUWaitSGPRHazards() {} + AMDGPUWaitSGPRHazards() = default; // Return the numeric ID 0-127 for a given SGPR. 
static std::optional<unsigned> sgprNumber(Register Reg, @@ -165,7 +167,7 @@ public: } unsigned mergeMasks(unsigned Mask1, unsigned Mask2) { - unsigned Mask = 0xffff; + unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST); Mask = AMDGPU::DepCtr::encodeFieldSaSdst( Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1), AMDGPU::DepCtr::decodeFieldSaSdst(Mask2))); @@ -181,9 +183,12 @@ public: Mask = AMDGPU::DepCtr::encodeFieldVaVdst( Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1), AMDGPU::DepCtr::decodeFieldVaVdst(Mask2))); + const AMDGPU::IsaVersion &Version = AMDGPU::getIsaVersion(ST->getCPU()); Mask = AMDGPU::DepCtr::encodeFieldHoldCnt( - Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1), - AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2))); + Mask, + std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1, Version), + AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2, Version)), + Version); Mask = AMDGPU::DepCtr::encodeFieldVaSsrc( Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1), AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2))); @@ -387,7 +392,7 @@ public: // Apply wait if (Wait) { - unsigned Mask = 0xffff; + unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST); if (Wait & WA_VCC) { State.VCCHazard &= ~HazardState::VALU; Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0); @@ -438,8 +443,8 @@ public: } bool run(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasVALUReadSGPRHazard()) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasVALUReadSGPRHazard()) return false; // Parse settings @@ -467,10 +472,10 @@ public: if (!EnableSGPRHazardWaits) return false; - TII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); MRI = &MF.getRegInfo(); - DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS; + DsNopCount = ST->isWave64() ? 
WAVE64_NOPS : WAVE32_NOPS; auto CallingConv = MF.getFunction().getCallingConv(); if (!AMDGPU::isEntryFunctionCC(CallingConv) && @@ -555,6 +560,6 @@ PreservedAnalyses AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { if (AMDGPUWaitSGPRHazards().run(MF)) - return PreservedAnalyses::none(); + return getMachineFunctionPassPreservedAnalyses(); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 99ba043..998a9d0 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -126,6 +126,7 @@ public: ImmTySMEMOffsetMod, ImmTyCPol, ImmTyTFE, + ImmTyIsAsync, ImmTyD16, ImmTyClamp, ImmTyOModSI, @@ -143,10 +144,13 @@ public: ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDone, + ImmTyRowEn, ImmTyFORMAT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, + ImmTyWaitEvent, ImmTyInterpSlot, ImmTyInterpAttr, ImmTyInterpAttrChan, @@ -347,6 +351,11 @@ public: return isRegKind() && getReg() == AMDGPU::SGPR_NULL; } + bool isAV_LdSt_32_Align2_RegOp() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + isRegClass(AMDGPU::AGPR_32RegClassID); + } + bool isVRegWithInputMods() const; template <bool IsFake16> bool isT16_Lo128VRegWithInputMods() const; template <bool IsFake16> bool isT16VRegWithInputMods() const; @@ -408,6 +417,8 @@ public: bool isNegLo() const { return isImmTy(ImmTyNegLo); } bool isNegHi() const { return isImmTy(ImmTyNegHi); } bool isBitOp3() const { return isImmTy(ImmTyBitOp3) && isUInt<8>(getImm()); } + bool isDone() const { return isImmTy(ImmTyDone); } + bool isRowEn() const { return isImmTy(ImmTyRowEn); } bool isRegOrImm() const { return isReg() || isImm(); @@ -661,6 +672,8 @@ public: bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); } + bool isVSrc_v2f16_splat() const { return isVSrc_v2f16(); } + bool isVSrc_NoInline_v2f16() const { 
return isVSrc_v2f16(); } bool isVISrcB32() const { @@ -956,6 +969,7 @@ public: bool isSDelayALU() const; bool isHwreg() const; bool isSendMsg() const; + bool isWaitEvent() const; bool isSplitBarrier() const; bool isSwizzle() const; bool isSMRDOffset8() const; @@ -1108,6 +1122,7 @@ public: case ImmTyIndexKey16bit: OS << "index_key"; break; case ImmTyIndexKey32bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyIsAsync: OS << "IsAsync"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; case ImmTyClamp: OS << "Clamp"; break; @@ -1133,8 +1148,11 @@ public: case ImmTyExpTgt: OS << "ExpTgt"; break; case ImmTyExpCompr: OS << "ExpCompr"; break; case ImmTyExpVM: OS << "ExpVM"; break; + case ImmTyDone: OS << "Done"; break; + case ImmTyRowEn: OS << "RowEn"; break; case ImmTyHwreg: OS << "Hwreg"; break; case ImmTySendMsg: OS << "SendMsg"; break; + case ImmTyWaitEvent: OS << "WaitEvent"; break; case ImmTyInterpSlot: OS << "InterpSlot"; break; case ImmTyInterpAttr: OS << "InterpAttr"; break; case ImmTyInterpAttrChan: OS << "InterpAttrChan"; break; @@ -1544,6 +1562,12 @@ public: bool isGFX1250() const { return AMDGPU::isGFX1250(getSTI()); } + bool isGFX1250Plus() const { return AMDGPU::isGFX1250Plus(getSTI()); } + + bool isGFX13() const { return AMDGPU::isGFX13(getSTI()); } + + bool isGFX13Plus() const { return AMDGPU::isGFX13Plus(getSTI()); } + bool isGFX10_AEncoding() const { return AMDGPU::isGFX10_AEncoding(getSTI()); } bool isGFX10_BEncoding() const { @@ -1675,7 +1699,8 @@ public: ParseStatus parseNamedBit(StringRef Name, OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool IgnoreNegative = false); unsigned getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const; ParseStatus parseCPol(OperandVector &Operands); ParseStatus parseScope(OperandVector &Operands, int64_t &Scope); @@ -1763,7 +1788,7 @@ private: bool 
IsSymbolic = false; bool IsDefined = false; - OperandInfoTy(int64_t Val) : Val(Val) {} + constexpr OperandInfoTy(int64_t Val) : Val(Val) {} }; struct StructuredOpField : OperandInfoTy { @@ -1772,8 +1797,8 @@ private: unsigned Width; bool IsDefined = false; - StructuredOpField(StringLiteral Id, StringLiteral Desc, unsigned Width, - int64_t Default) + constexpr StructuredOpField(StringLiteral Id, StringLiteral Desc, + unsigned Width, int64_t Default) : OperandInfoTy(Default), Id(Id), Desc(Desc), Width(Width) {} virtual ~StructuredOpField() = default; @@ -1860,13 +1885,12 @@ private: bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands, const unsigned CPol); bool validateTFE(const MCInst &Inst, const OperandVector &Operands); - bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands); bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands); bool validateWMMA(const MCInst &Inst, const OperandVector &Operands); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; - unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const; bool isSupportedMnemo(StringRef Mnemo, const FeatureBitset &FBS); @@ -1905,6 +1929,7 @@ public: ParseStatus parseExpTgt(OperandVector &Operands); ParseStatus parseSendMsg(OperandVector &Operands); + ParseStatus parseWaitEvent(OperandVector &Operands); ParseStatus parseInterpSlot(OperandVector &Operands); ParseStatus parseInterpAttr(OperandVector &Operands); ParseStatus parseSOPPBrTarget(OperandVector &Operands); @@ -2040,6 +2065,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case 
AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); @@ -2434,6 +2460,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: @@ -2476,6 +2503,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: @@ -2922,7 +2950,7 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum, return AMDGPU::NoRegister; } - if (RegKind == IS_VGPR && !isGFX1250() && RegIdx + RegWidth / 32 > 256) { + if (RegKind == IS_VGPR && !isGFX1250Plus() && RegIdx + RegWidth / 32 > 256) { Error(Loc, "register index is out of range"); return MCRegister(); } @@ -3666,7 +3694,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const { return ""; } -unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { +MCRegister +AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (MCPhysReg Reg : Desc.implicit_uses()) { switch (Reg) { @@ -3680,7 +3709,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { break; } } - return AMDGPU::NoRegister; + return MCRegister(); } // NB: This code is correct only when used to check constant @@ -3720,6 +3749,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) return AMDGPU::isInlinableLiteralV2F16(Val); 
+ if (OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT) + return AMDGPU::isPKFMACF16InlineConstant(Val, isGFX11Plus()); + if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2BF16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2BF16) return AMDGPU::isInlinableLiteralV2BF16(Val); @@ -3855,9 +3887,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( LiteralSize = 4; } - SmallDenseSet<unsigned> SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); - if (SGPRUsed != AMDGPU::NoRegister) { + SmallDenseSet<MCRegister> SGPRsUsed; + MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed) { SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } @@ -3940,7 +3972,7 @@ AMDGPUAsmParser::checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3) { bool SkipSrc = Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 || Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 || Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250; - bool AllowSameVGPR = isGFX1250(); + bool AllowSameVGPR = isGFX1250Plus(); if (AsVOPD3) { // Literal constants are not allowed with VOPD3. for (auto OpName : {OpName::src0X, OpName::src0Y}) { @@ -4074,7 +4106,7 @@ bool AMDGPUAsmParser::tryVOPD(const MCInst &Inst) { // form but switch to VOPD3 otherwise. 
bool AMDGPUAsmParser::tryAnotherVOPDEncoding(const MCInst &Inst) { const unsigned Opcode = Inst.getOpcode(); - if (!isGFX1250() || !isVOPD(Opcode)) + if (!isGFX1250Plus() || !isVOPD(Opcode)) return false; if (MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3) @@ -5364,7 +5396,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, unsigned CPol = Inst.getOperand(CPolPos).getImm(); - if (!isGFX1250()) { + if (!isGFX1250Plus()) { if (CPol & CPol::SCAL) { SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); StringRef CStr(S.getPointer()); @@ -5506,22 +5538,6 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst, return true; } -bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst, - const OperandVector &Operands) { - if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12) - return true; - - int Simm16Pos = - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16); - if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) { - SMLoc Loc = Operands[1]->getStartLoc(); - Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]"); - return false; - } - - return true; -} - bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, const OperandVector &Operands) { unsigned Opc = Inst.getOpcode(); @@ -5541,12 +5557,9 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32) return true; - static const char *FmtNames[] = {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8", - "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", - "MATRIX_FMT_FP4"}; - Error(getOperandLoc(Operands, SrcIdx), - "wrong register tuple size for " + Twine(FmtNames[Fmt])); + "wrong register tuple size for " + + Twine(WMMAMods::ModMatrixFmt[Fmt])); return false; }; @@ -5681,9 +5694,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc, if (!validateTFE(Inst, Operands)) { return false; } - if (!validateSetVgprMSB(Inst, Operands)) { - return false; - } if (!validateWMMA(Inst, Operands)) { return false; } @@ -6182,7 
+6192,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return Error(IDRange.Start, "directive requires gfx90a+", IDRange); AccumOffset = ExprVal; } else if (ID == ".amdhsa_named_barrier_count") { - if (!isGFX1250()) + if (!isGFX1250Plus()) return Error(IDRange.Start, "directive requires gfx1250+", IDRange); NamedBarCnt = ExprVal; } else if (ID == ".amdhsa_reserve_vcc") { @@ -6382,7 +6392,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return TokError("amdgpu_user_sgpr_count smaller than than implied by " "enabled user SGPRs"); - if (isGFX1250()) { + if (isGFX1250Plus()) { if (!isUInt<COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_WIDTH>(UserSGPRCount)) return TokError("too many user SGPRs enabled"); AMDGPU::MCKernelDescriptor::bits_set( @@ -6437,7 +6447,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { getContext()); } - if (isGFX1250()) + if (isGFX1250Plus()) MCKernelDescriptor::bits_set(KD.compute_pgm_rsrc3, NamedBarCnt, COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, @@ -7046,13 +7056,16 @@ ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy) { + AMDGPUOperand::ImmTy ImmTy, + bool IgnoreNegative) { int64_t Bit; SMLoc S = getLoc(); if (trySkipId(Name)) { Bit = 1; } else if (trySkipId("no", Name)) { + if (IgnoreNegative) + return ParseStatus::Success; Bit = 0; } else { return ParseStatus::NoMatch; @@ -7063,6 +7076,12 @@ ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name, if (Name == "a16" && !hasA16()) return Error(S, "a16 modifier is not supported on this GPU"); + if (Bit == 0 && Name == "gds") { + StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken(); + if (Mnemo.starts_with("ds_gws")) + return Error(S, "nogds is not allowed"); + } + if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16) ImmTy = AMDGPUOperand::ImmTyR128A16; @@ -7403,10 +7422,7 @@ ParseStatus 
AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) { ParseStatus AMDGPUAsmParser::tryParseMatrixFMT(OperandVector &Operands, StringRef Name, AMDGPUOperand::ImmTy Type) { - return parseStringOrIntWithPrefix(Operands, Name, - {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8", - "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", - "MATRIX_FMT_FP4"}, + return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixFmt, Type); } @@ -7423,8 +7439,8 @@ ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) { ParseStatus AMDGPUAsmParser::tryParseMatrixScale(OperandVector &Operands, StringRef Name, AMDGPUOperand::ImmTy Type) { - return parseStringOrIntWithPrefix( - Operands, Name, {"MATRIX_SCALE_ROW0", "MATRIX_SCALE_ROW1"}, Type); + return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixScale, + Type); } ParseStatus AMDGPUAsmParser::parseMatrixAScale(OperandVector &Operands) { @@ -7440,10 +7456,8 @@ ParseStatus AMDGPUAsmParser::parseMatrixBScale(OperandVector &Operands) { ParseStatus AMDGPUAsmParser::tryParseMatrixScaleFmt(OperandVector &Operands, StringRef Name, AMDGPUOperand::ImmTy Type) { - return parseStringOrIntWithPrefix( - Operands, Name, - {"MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"}, - Type); + return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixScaleFmt, + Type); } ParseStatus AMDGPUAsmParser::parseMatrixAScaleFmt(OperandVector &Operands) { @@ -8241,6 +8255,41 @@ bool AMDGPUOperand::isSendMsg() const { return isImmTy(ImmTySendMsg); } +ParseStatus AMDGPUAsmParser::parseWaitEvent(OperandVector &Operands) { + using namespace llvm::AMDGPU::WaitEvent; + + SMLoc Loc = getLoc(); + int64_t ImmVal = 0; + + StructuredOpField DontWaitExportReady("dont_wait_export_ready", "bit value", + 1, 0); + StructuredOpField ExportReady("export_ready", "bit value", 1, 0); + + StructuredOpField *TargetBitfield = + isGFX11() ? 
&DontWaitExportReady : &ExportReady; + + ParseStatus Res = parseStructuredOpFields({TargetBitfield}); + if (Res.isNoMatch() && parseExpr(ImmVal, "structured immediate")) + Res = ParseStatus::Success; + else if (Res.isSuccess()) { + if (!validateStructuredOpFields({TargetBitfield})) + return ParseStatus::Failure; + ImmVal = TargetBitfield->Val; + } + + if (!Res.isSuccess()) + return ParseStatus::Failure; + + if (!isUInt<16>(ImmVal)) + return Error(Loc, "invalid immediate: only 16-bit values are legal"); + + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, + AMDGPUOperand::ImmTyWaitEvent)); + return ParseStatus::Success; +} + +bool AMDGPUOperand::isWaitEvent() const { return isImmTy(ImmTyWaitEvent); } + //===----------------------------------------------------------------------===// // v_interp //===----------------------------------------------------------------------===// @@ -9048,6 +9097,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); + // Parse a dummy operand as a placeholder for the SWZ operand. This enforces + // agreement between MCInstrDesc.getNumOperands and MCInst.getNumOperands. 
+ Inst.addOperand(MCOperand::createImm(0)); } //===----------------------------------------------------------------------===// @@ -9514,6 +9566,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_CVT_SCALEF32_PK_FP4_BF16_vi || Opc == AMDGPU::V_CVT_SR_BF8_F32_vi || Opc == AMDGPU::V_CVT_SR_FP8_F32_vi || + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx11 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx11 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) { Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods @@ -9523,7 +9577,19 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, // Adding vdst_in operand is already covered for these DPP instructions in // cvtVOP3DPP. if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) && - !(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 || + !(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx12 || Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx12 || @@ -10439,7 +10505,7 @@ ParseStatus AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, case MCK_addr64: return parseTokenOp("addr64", Operands); case 
MCK_done: - return parseTokenOp("done", Operands); + return parseNamedBit("done", Operands, AMDGPUOperand::ImmTyDone, true); case MCK_idxen: return parseTokenOp("idxen", Operands); case MCK_lds: @@ -10449,7 +10515,7 @@ ParseStatus AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, case MCK_off: return parseTokenOp("off", Operands); case MCK_row_95_en: - return parseTokenOp("row_en", Operands); + return parseNamedBit("row_en", Operands, AMDGPUOperand::ImmTyRowEn, true); case MCK_gds: return parseNamedBit("gds", Operands, AMDGPUOperand::ImmTyGDS); case MCK_tfe: @@ -10480,6 +10546,10 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isOffen() ? Match_Success : Match_InvalidOperand; case MCK_tfe: return Operand.isTFE() ? Match_Success : Match_InvalidOperand; + case MCK_done: + return Operand.isDone() ? Match_Success : Match_InvalidOperand; + case MCK_row_95_en: + return Operand.isRowEn() ? Match_Success : Match_InvalidOperand; case MCK_SSrc_b32: // When operands have expression values, they will return true for isToken, // because it is not possible to distinguish between a token and an diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index b97b738..568fff2 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -159,9 +159,9 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> : } class getMTBUFInsDA<list<RegisterOperand> vdataList, - list<RegisterClass> vaddrList=[], bit hasRestrictedSOffset> { + list<RegisterOperand> vaddrList=[], bit hasRestrictedSOffset> { RegisterOperand vdata_op = !if(!empty(vdataList), ?, !head(vdataList)); - RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList)); dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); @@ -171,7 +171,7 @@ class 
getMTBUFInsDA<list<RegisterOperand> vdataList, dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, - !con((ins vaddrClass:$vaddr), NonVaddrInputs)); + !con((ins vaddr_op:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); @@ -180,10 +180,10 @@ class getMTBUFInsDA<list<RegisterOperand> vdataList, class getMTBUFIns<int addrKind, list<RegisterOperand> vdataList=[], bit hasRestrictedSOffset> { dag ret = !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList, [], hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64], hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64], hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPROp_32], hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPROp_32], hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VGPROp_64], hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VGPROp_64], hasRestrictedSOffset>.ret, (ins)))))); } @@ -393,7 +393,7 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : let sccb_value = 0; } -class getBUFVDataRegisterOperand<int Size, bit isTFE> { +class getBUFVDataRegisterOperand<int Size, bit isTFE, bit isTrue16 = false> { defvar tfeVDataOp = !cond(!eq(Size, 16) : AVLdSt_64, !eq(Size, 32) : AVLdSt_64, @@ -402,7 +402,7 @@ class getBUFVDataRegisterOperand<int Size, bit isTFE> { !eq(Size, 128) : AVLdSt_160); defvar VDataOp = - !cond(!eq(Size, 16) : AVLdSt_32, + !cond(!eq(Size, 16) : !if(isTrue16, VGPROp_16, AVLdSt_32), !eq(Size, 32) : AVLdSt_32, !eq(Size, 64) : 
AVLdSt_64, !eq(Size, 96) : AVLdSt_96, @@ -417,15 +417,17 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> { } class getMUBUFInsDA<list<RegisterOperand> vdataList, - list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> { + list<RegisterOperand> vaddrList, bit isTFE, bit hasRestrictedSOffset, + bit isTrue16, bit isLds> { RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); - RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); - RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret; + RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE, isTrue16>.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); + dag IsAsyncOpnd = !if(isLds, (ins i1imm_0:$IsAsync), (ins)); + dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz), IsAsyncOpnd); - dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); + dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddr_op:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); } @@ -448,13 +450,14 @@ class getMUBUFElements<ValueType vt> { ); } -class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset> { +class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, + bit hasRestrictedSOffset, bit isTrue16, bit isLds> { dag ret = - !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.OffEn), 
getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPROp_32], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPROp_32], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VGPROp_64], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VGPROp_64], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, (ins)))))); } @@ -499,7 +502,7 @@ class MUBUF_Load_Pseudo <string opName, RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdata_vt.Size, isTFE>.ret> : MUBUF_Pseudo<opName, !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)), - !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset>.ret, + !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset, 0, isLds>.ret, !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), getMUBUFAsmOps<addrKind, !or(isLds, isLdsOpc), isLds, isTFE>.ret, pattern>, @@ -509,7 +512,7 @@ class MUBUF_Load_Pseudo <string opName, let AsmMatchConverter = "cvtMubuf"; let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); - let LGKM_CNT = isLds; + let LGKM_CNT = 0; let has_vdata = !not(!or(isLds, isLdsOpc)); let mayLoad = 1; let mayStore = isLds; @@ -566,6 +569,33 @@ multiclass MUBUF_Pseudo_Loads_Helper<string opName, ValueType load_vt, } } +multiclass 
MUBUF_Pseudo_Loads_Helper_t16<string opName, string Hi16Name, string Lo16Name, ValueType load_vt, + bit TiedDest, bit isLds, bit isTFE, bit hasRestrictedSOffset> { + def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>, True16D16Table<Hi16Name#"_OFFSET", Lo16Name#"_OFFSET">; + + def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>, True16D16Table<Hi16Name#"_ADDR64", Lo16Name#"_ADDR64">; + + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_OFFEN", Lo16Name#"_OFFEN">; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_IDXEN", Lo16Name#"_IDXEN">; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_BOTHEN", Lo16Name#"_BOTHEN">; + + let DisableWQM = 1 in { + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_OFFSET_exact", Lo16Name#"_OFFSET_exact">; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_OFFEN_exact", Lo16Name#"_OFFEN_exact">; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_IDXEN_exact", Lo16Name#"_IDXEN_exact">; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds, 0, 
isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_BOTHEN_exact", Lo16Name#"_BOTHEN_exact">; + } +} + multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32, bit TiedDest = 0, bit isLds = 0> { defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>; @@ -577,6 +607,23 @@ multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32, } } +multiclass MUBUF_Pseudo_Loads_t16<string opName, ValueType load_vt = i32, + bit TiedDest = 0, bit isLds = 0, string hiOpName = NAME#"_HI"> { + let True16Predicate = NotUseRealTrue16Insts in { + defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>; + defm _VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 1>; + } + let True16Predicate = UseRealTrue16Insts in { + defvar NAME16 = opName#"_t16"; + defm _t16 : MUBUF_Pseudo_Loads_Helper_t16<NAME16, hiOpName, NAME, i16, 0, isLds, 0, 0>; + defm _t16_VBUFFER : MUBUF_Pseudo_Loads_Helper_t16<NAME16, hiOpName#"_VBUFFER", NAME#"_VBUFFER", i16, 0, isLds, 0, 1>; + } + if !not(isLds) then { + defm _TFE : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 0>; + defm _TFE_VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 1>; + } +} + multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, Predicate LDSPred = TruePredicate> { defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>; @@ -595,10 +642,11 @@ class MUBUF_Store_Pseudo <string opName, ValueType store_vt, bit isTFE = 0, bit hasRestrictedSOffset = 0, - list<dag> pattern=[]> + list<dag> pattern=[], + bit isTrue16 = false> : MUBUF_Pseudo<opName, (outs), - getMUBUFIns<addrKind, [getVregSrcForVT<store_vt>.ret], isTFE, hasRestrictedSOffset>.ret, + getMUBUFIns<addrKind, [getVregSrcForVT<store_vt, isTrue16, 0>.ret], isTFE, hasRestrictedSOffset, isTrue16, 0>.ret, getMUBUFAsmOps<addrKind, 0, 0, isTFE>.ret, pattern>, MUBUF_SetupAddr<addrKind> { @@ -650,6 +698,33 @@ multiclass 
MUBUF_Pseudo_Stores_Helper<string opName, ValueType store_vt, } } +multiclass MUBUF_Pseudo_Stores_Helper_t16<string opName, string Hi16Name, string Lo16Name, ValueType store_vt, + bit isTFE, bit hasRestrictedSOffset> { + def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + MUBUFAddr64Table<0, NAME>, True16D16Table<Hi16Name#"_OFFSET", Lo16Name#"_OFFSET">; + + def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + MUBUFAddr64Table<1, NAME>, True16D16Table<Hi16Name#"_ADDR64", Lo16Name#"_ADDR64">; + + def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_OFFEN", Lo16Name#"_OFFEN">; + def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_IDXEN", Lo16Name#"_IDXEN">; + def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_BOTHEN", Lo16Name#"_BOTHEN">; + + let DisableWQM = 1 in { + def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_OFFSET_exact", Lo16Name#"_OFFSET_exact">; + def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_OFFEN_exact", Lo16Name#"_OFFEN_exact">; + def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_IDXEN_exact", Lo16Name#"_IDXEN_exact">; + def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_BOTHEN_exact", Lo16Name#"_BOTHEN_exact">; + } +} + 
multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> { defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>; defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>; @@ -658,6 +733,22 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> { defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>; } +multiclass MUBUF_Pseudo_Stores_t16<string opName, ValueType store_vt = i32> { + defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>; + defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>; + + let True16Predicate = NotUseRealTrue16Insts in { + defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>; + + defm _VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 1>; + } + let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = HasD16LoadStore in { + defvar NAME16 = opName#"_t16"; + defm _t16 : MUBUF_Pseudo_Stores_Helper_t16<NAME16, NAME#"_D16_HI", NAME, i16, 0, 0>; + defm _t16_VBUFFER : MUBUF_Pseudo_Stores_Helper_t16<NAME16, NAME#"_D16_HI_VBUFFER", NAME#"_VBUFFER", i16, 0, 1>; + } +} + class MUBUF_Pseudo_Store_Lds<string opName> : MUBUF_Pseudo<opName, (outs), @@ -677,11 +768,11 @@ class MUBUF_Pseudo_Store_Lds<string opName> } class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset, - list<RegisterClassLike> vaddrList=[]> { - RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + list<RegisterOperand> vaddrList=[]> { + RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList)); dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata)); - dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr))); + dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddr_op:$vaddr))); dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); dag MainInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset)); dag 
CPol = !if(vdata_in, (ins CPol_GLC_WithDefault:$cpol), @@ -698,13 +789,13 @@ class getMUBUFAtomicIns<int addrKind, !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_32]>.ret, !if(!eq(addrKind, BUFAddrKind.IdxEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_32]>.ret, !if(!eq(addrKind, BUFAddrKind.BothEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_64]>.ret, !if(!eq(addrKind, BUFAddrKind.Addr64), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_64]>.ret, (ins)))))); } @@ -783,37 +874,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, multiclass MUBUF_Pseudo_Atomics_RTN <string opName, RegisterOperand vdataClass, - ValueType vdataType, - SDPatternOperator atomic> { + ValueType vdataType> { let FPAtomic = vdataType.isFP in { - def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_RTN">; - - def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_RTN">; - + def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>, + MUBUFAddr64Table <0, NAME # "_RTN">; + def _ADDR64_RTN : 
MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>, + MUBUFAddr64Table <1, NAME # "_RTN">; def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>; def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>; def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>; - def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; - - def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; - + def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>, + MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; + def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>, + MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>; def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>; def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>; @@ -822,10 +896,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, multiclass MUBUF_Pseudo_Atomics <string opName, RegisterOperand vdataClass, - ValueType vdataType, - SDPatternOperator atomic = null_frag> : + ValueType vdataType> : MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>, - MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>; + MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType>; 
//===----------------------------------------------------------------------===// @@ -889,10 +962,16 @@ let TiedSourceNotRead = 1 in { >; } // End OtherPredicates = [HasUnpackedD16VMem], D16Buf = 1. + +let TiedSourceNotRead = 1, SubtargetPredicate = HasD16LoadStore, OtherPredicates = [HasFormattedMUBUFInsts] in +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_hi_x", i32 +>; + let OtherPredicates = [HasPackedD16VMem], D16Buf = 1 in { let TiedSourceNotRead = 1 in { - defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_x", f16 + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads_t16 < + "buffer_load_format_d16_x", f16, 0, 0, "BUFFER_LOAD_FORMAT_D16_HI_X" >; defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads < "buffer_load_format_d16_xy", v2f16 @@ -948,9 +1027,6 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_aext_8_globa defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_zext_8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_aext_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_zext_16_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_aext_8_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_zext_8_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_nonext_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; @@ -977,12 +1053,23 @@ foreach vt = VReg_128.RegTypes in { defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>; } -defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < +let SubtargetPredicate = HasD16LoadStore in { +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < + "buffer_store_byte_d16_hi", i32 +>; + +defm 
BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < + "buffer_store_short_d16_hi", i32 +>; +} + +defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores_t16 < "buffer_store_byte", i32 >; -defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores < +defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores_t16 < "buffer_store_short", i32 >; + defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores < "buffer_store_dword", i32 >; @@ -1096,7 +1183,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < let OtherPredicates = [HasGFX10_BEncoding] in { defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub + "buffer_atomic_csub", VGPROp_32, i32 >; } @@ -1117,65 +1204,52 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag + "buffer_atomic_fcmpswap", AVLdSt_64, v2f32 >; } let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmin", AVLdSt_32, f32, null_frag + "buffer_atomic_fmin", AVLdSt_32, f32 >; defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmax", AVLdSt_32, f32, null_frag + "buffer_atomic_fmax", AVLdSt_32, f32 >; } let SubtargetPredicate = isGFX6GFX7GFX10 in { defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag + "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64 >; } let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_ubyte_d16", i32, 1 ->; - defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads < "buffer_load_ubyte_d16_hi", i32, 1 >; -defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_sbyte_d16", i32, 1 ->; - defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads < "buffer_load_sbyte_d16_hi", i32, 1 >; -defm BUFFER_LOAD_SHORT_D16 : 
MUBUF_Pseudo_Loads < - "buffer_load_short_d16", i32, 1 ->; - defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads < "buffer_load_short_d16_hi", i32, 1 >; -let OtherPredicates = [HasFormattedMUBUFInsts] in -defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_hi_x", i32 +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads_t16 < + "buffer_load_ubyte_d16", i32, 1 >; -} // End TiedSourceNotRead -defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < - "buffer_store_byte_d16_hi", i32 +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads_t16 < + "buffer_load_sbyte_d16", i32, 1 >; -defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < - "buffer_store_short_d16_hi", i32 +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads_t16 < + "buffer_load_short_d16", i32, 1 >; +} // End TiedSourceNotRead let OtherPredicates = [HasFormattedMUBUFInsts] in defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < @@ -1184,6 +1258,18 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < } // End HasD16LoadStore +let True16Predicate = NotUseRealTrue16Insts in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_aext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_zext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_nonext_16_global>; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE_D16_t16", i16, atomic_load_aext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE_D16_t16", i16, atomic_load_zext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SHORT_D16_t16", i16, atomic_load_nonext_16_global>; +} + let SubtargetPredicate = isNotGFX940Plus in def BUFFER_WBINVL1 : MUBUF_Invalidate < "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1 @@ -1201,12 +1287,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < let SubtargetPredicate = HasAtomicFaddRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< - 
"buffer_atomic_add_f32", AVLdSt_32, f32, null_frag + "buffer_atomic_add_f32", AVLdSt_32, f32 >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag + "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16 >; let SubtargetPredicate = isGFX12Plus in { @@ -1385,8 +1471,14 @@ let OtherPredicates = [HasUnpackedD16VMem, HasFormattedMUBUFInsts] in { } // End OtherPredicates = [HasUnpackedD16VMem, HasFormattedMUBUFInsts]. let OtherPredicates = [HasPackedD16VMem, HasFormattedMUBUFInsts] in { +let True16Predicate = NotUseRealTrue16Insts in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">; +} +let True16Predicate = UseRealTrue16Insts in { + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_t16">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_t16">; +} defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">; @@ -1952,15 +2044,26 @@ multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt, } let OtherPredicates = [Has16BitInsts] in { - +let True16Predicate = NotUseRealTrue16Insts in { defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_constant>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_constant>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_constant>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_global>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_global>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, 
zextloadi8_global>; - defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_USHORT", i16, load_global>; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SHORT_D16_t16", i16, load_global>; +} + } // End OtherPredicates = [Has16BitInsts] @@ -2000,6 +2103,19 @@ multiclass MUBUFScratchLoadPat_D16_Common <string Instr, >; } +multiclass MUBUFScratchLoadPat_D16_Common_t16 <string Instr, ValueType vt, PatFrag ld_frag> { + def : GCNPat < + (vt (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset))), + (!cast<MUBUF_Pseudo>(Instr # _OFFEN) $vaddr, $srsrc, $soffset, $offset) + >; + + def : GCNPat < + (vt (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset))), + (!cast<MUBUF_Pseudo>(Instr # _OFFSET) $srsrc, $soffset, $offset) + >; +} + + multiclass MUBUFScratchLoadPat_D16 <string Instr, ValueType vt, PatFrag ld_frag> { let SubtargetPredicate = HasUnrestrictedSOffset in { @@ -2008,17 +2124,35 @@ multiclass MUBUFScratchLoadPat_D16 <string Instr, defm : MUBUFScratchLoadPat_D16_Common<Instr # "_VBUFFER", vt, ld_frag>; } -let OtherPredicates = [DisableFlatScratch] in { +multiclass MUBUFScratchLoadPat_D16_t16 <string Instr, + ValueType vt, PatFrag ld_frag> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUFScratchLoadPat_D16_Common_t16<Instr, vt, ld_frag>; + } + defm : MUBUFScratchLoadPat_D16_Common_t16<Instr # "_VBUFFER", vt, ld_frag>; +} + +let OtherPredicates = 
[NotHasFlatScratchEnabled] in { defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i32, sextloadi8_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, extloadi8_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, zextloadi8_private>; -defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>; -defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>; -defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SSHORT", i32, sextloadi16_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, extloadi16_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, zextloadi16_private>; + +let True16Predicate = NotUseRealTrue16Insts in { +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i16, load_private>; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_private>; +defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_private>; +defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_private>; +defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_SHORT_D16_t16", i16, load_private>; +} foreach vt = Reg32Types.types in { defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORD", vt, load_private>; @@ -2027,7 +2161,7 @@ defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX2", v2i32, load_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX3", v3i32, load_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX4", v4i32, load_private>; -let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in { +let OtherPredicates = [D16PreservesUnusedBits, NotHasFlatScratchEnabled] in { defm : 
MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16_HI", v2i16, load_d16_hi_private>; defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16_HI", v2i16, az_extloadi8_d16_hi_private>; defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16_HI", v2i16, sextloadi8_d16_hi_private>; @@ -2043,7 +2177,7 @@ defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16", v2f16, az_extloadi8_d16_ defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16", v2f16, sextloadi8_d16_lo_private>; } -} // End OtherPredicates = [DisableFlatScratch] +} // End OtherPredicates = [NotHasFlatScratchEnabled] multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_st> { @@ -2084,8 +2218,15 @@ multiclass MUBUFStore_PatternOffset <string Instr, ValueType vt, defm : MUBUFStore_PatternOffset_Common<Instr # "_VBUFFER", vt, st>; } +let True16Predicate = NotUseRealTrue16Insts in { defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE", i16, truncstorei8_global>; defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE_t16", i16, truncstorei8_global>; +defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT_t16", i16, store_global>; +} multiclass MUBUFScratchStorePat_Common <string Instr, ValueType vt, PatFrag st, @@ -2112,11 +2253,19 @@ multiclass MUBUFScratchStorePat <string Instr, defm : MUBUFScratchStorePat_Common<Instr # "_VBUFFER", vt, st, rc>; } -let OtherPredicates = [DisableFlatScratch] in { +let OtherPredicates = [NotHasFlatScratchEnabled] in { defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i32, truncstorei8_private>; defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i32, truncstorei16_private>; + +let True16Predicate = NotUseRealTrue16Insts in { defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i16, truncstorei8_private>; defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i16, store_private>; +} + +let True16Predicate = 
UseRealTrue16Insts in { +defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_t16", i16, truncstorei8_private, VGPR_16>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_t16", i16, store_private, VGPR_16>; +} foreach vt = Reg32Types.types in { defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORD", vt, store_private>; @@ -2127,7 +2276,7 @@ defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX3", v3i32, store_private, VReg_ defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX4", v4i32, store_private, VReg_128>; -let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, NotHasFlatScratchEnabled] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. let AddedComplexity = 1 in { @@ -2135,7 +2284,7 @@ defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_D16_HI", i32, store_hi16_privat defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_D16_HI", i32, truncstorei8_hi16_private>; } } -} // End OtherPredicates = [DisableFlatScratch] +} // End OtherPredicates = [NotHasFlatScratchEnabled] //===----------------------------------------------------------------------===// // MTBUF Patterns diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 13f727b68..ae684a5 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables) tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) @@ -39,10 +40,6 @@ tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(AMDGPUCommonTableGen) -set(LLVM_TARGET_DEFINITIONS InstCombineTables.td) 
-tablegen(LLVM InstCombineTables.inc -gen-searchable-tables) -add_public_tablegen_target(InstCombineTableGen) - add_llvm_target(AMDGPUCodeGen AMDGPUAliasAnalysis.cpp AMDGPUAlwaysInlinePass.cpp @@ -52,6 +49,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAsmPrinter.cpp AMDGPUAtomicOptimizer.cpp AMDGPUAttributor.cpp + AMDGPUBarrierLatency.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp @@ -61,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp AMDGPUGlobalISelUtils.cpp + AMDGPUHazardLatency.cpp AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp @@ -80,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp AMDGPUPrepareAGPRAlloc.cpp + AMDGPULowerExecSync.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index d0ad120..d8a8450 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -128,7 +128,7 @@ class DS_0A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32> class DS_1A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, rc:$data0, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, rc:$data0, Offset:$offset, gds:$gds), " $addr, $data0$offset$gds"> { let has_data1 = 0; @@ -163,7 +163,7 @@ multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterOperand rc = AVLdSt_32> class DS_1A2D_NORET<string opName, RegisterOperand data_op = VGPROp_32> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds), " $addr, $data0, $data1$offset$gds"> { let has_vdst = 0; @@ -190,7 +190,7 @@ multiclass DS_1A2D_NORET_mc<string opName, RegisterOperand rc = VGPROp_32> { class 
DS_1A2D_Off8_NORET <string opName, RegisterOperand data_op = VGPROp_32> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, + (ins VGPROp_32:$addr, data_op:$data0, data_op:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds), " $addr, $data0, $data1$offset0$offset1$gds"> { @@ -230,7 +230,7 @@ class DS_0A1D_RET_GDS<string opName, RegisterOperand dst_op = AVLdSt_32, class DS_1A1D_RET <string opName, RegisterOperand data_op = AVLdSt_32> : DS_Pseudo<opName, (outs data_op:$vdst), - (ins VGPR_32:$addr, data_op:$data0, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, data_op:$data0, Offset:$offset, gds:$gds), " $vdst, $addr, $data0$offset$gds"> { let has_data1 = 0; @@ -260,7 +260,7 @@ class DS_1A2D_RET<string opName, RegisterOperand dst_rc = VGPROp_32, RegisterOperand src_rc = dst_rc>: DS_Pseudo<opName, (outs dst_rc:$vdst), - (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds), " $vdst, $addr, $data0, $data1$offset$gds"> { let IsAtomicRet = 1; @@ -286,7 +286,7 @@ class DS_1A2D_Off8_RET<string opName, RegisterOperand src_rc = dst_rc> : DS_Pseudo<opName, (outs dst_rc:$vdst), - (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds), + (ins VGPROp_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds), " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> { let has_offset = 0; @@ -311,8 +311,8 @@ class DS_BVH_STACK<string opName, RegisterOperand vdst_rc, RegisterOperand data1_rc> : DS_Pseudo<opName, - (outs vdst_rc:$vdst, VGPR_32:$addr), - (ins VGPR_32:$addr_in, VGPR_32:$data0, data1_rc:$data1, Offset:$offset), + (outs vdst_rc:$vdst, VGPROp_32:$addr), + (ins VGPROp_32:$addr_in, VGPROp_32:$data0, data1_rc:$data1, Offset:$offset), " $vdst, $addr, $data0, $data1$offset"> { let Constraints = "$addr = $addr_in"; let has_gds = 0; @@ -327,8 +327,8 @@ class DS_1A_RET<string 
opName, RegisterOperand data_op = AVLdSt_32, : DS_Pseudo<opName, (outs data_op:$vdst), !if(HasTiedOutput, - (ins VGPR_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in), - (ins VGPR_32:$addr, ofs:$offset, gds:$gds)), + (ins VGPROp_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in), + (ins VGPROp_32:$addr, ofs:$offset, gds:$gds)), " $vdst, $addr$offset$gds"> { let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); let has_data0 = 0; @@ -366,7 +366,7 @@ class DS_1A_RET_Tied<string opName, RegisterOperand rc = AVLdSt_32> : class DS_1A_Off8_RET <string opName, RegisterOperand rc = AVLdSt_32> : DS_Pseudo<opName, (outs rc:$vdst), - (ins VGPR_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds), + (ins VGPROp_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds), " $vdst, $addr$offset0$offset1$gds"> { let has_offset = 0; @@ -384,7 +384,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterOperand rc = VGPROp_32> { class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, (outs AVLdSt_32:$vdst), - (ins VGPR_32:$addr, Offset:$offset), + (ins VGPROp_32:$addr, Offset:$offset), " $vdst, $addr$offset gds"> { let has_data0 = 0; @@ -396,7 +396,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, class DS_1A_Off16_NORET <string opName> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, Offset:$offset, gds:$gds), " $addr$offset$gds"> { let has_vdst = 0; @@ -422,7 +422,7 @@ class DS_0A_RET <string opName> : DS_Pseudo<opName, class DS_1A <string opName> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, Offset:$offset, gds:$gds), " $addr$offset$gds"> { let mayLoad = 1; @@ -463,7 +463,7 @@ class DS_GWS_0D <string opName> class DS_GWS_1D <string opName> : DS_GWS<opName, - (ins AVLdSt_32:$data0, Offset:$offset), + (ins AV_LdSt_32_Align2_RegOp:$data0, Offset:$offset), " $data0$offset gds"> { let has_gws_data0 = 1; @@ -491,7 +491,7 @@ class DS_1A1D_PERMUTE <string 
opName, SDPatternOperator node = null_frag, RegisterOperand data_op = AVLdSt_32> : DS_Pseudo<opName, (outs data_op:$vdst), - (ins VGPR_32:$addr, data_op:$data0, Offset:$offset), + (ins VGPROp_32:$addr, data_op:$data0, Offset:$offset), " $vdst, $addr, $data0$offset", [(set i32:$vdst, (node (DS1Addr1Offset i32:$addr, i32:$offset), i32:$data0))] > { @@ -886,17 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3 def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32", int_amdgcn_ds_bpermute_fi_b32>; -multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst, - ValueType vt, string frag> { - def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_addrspace")>; - - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in - def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>; -} - -defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">; } // let SubtargetPredicate = isGFX12Plus let SubtargetPredicate = isGFX1250Plus in { @@ -917,7 +906,7 @@ def DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_1A_Off16_NORET<"ds_atomic_async_barr def : GCNPat < (int_amdgcn_ds_atomic_async_barrier_arrive_b64 (DS1Addr1Offset i32:$ptr, i32:$offset)), - (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPR_32:$ptr, Offset:$offset, (i1 0)) + (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPROp_32:$ptr, Offset:$offset, (i1 0)) >; defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VGPROp_64>; @@ -943,7 +932,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore def : GCNPat < (int_amdgcn_ds_swizzle i32:$src, timm:$offset16), - (DS_SWIZZLE_B32 VGPR_32:$src, (as_i16timm $offset16), (i1 0)) + (DS_SWIZZLE_B32 VGPROp_32:$src, (as_i16timm $offset16), (i1 0)) >; class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < @@ -1279,6 +1268,14 @@ defm : 
DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "ato defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">; } +let SubtargetPredicate = HasAtomicDsCondSubClampInsts in { + +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">; + +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">; + +} // let SubtargetPredicate = HasAtomicDsCondSubClampInsts + let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">; } @@ -1339,28 +1336,28 @@ def : GCNPat < def : GCNPat < (i64 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)), - (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)) + (DS_ADD_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)) >; def : GCNPat < (i32 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)), (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)), + (DS_ADD_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)), VReg_64)), sub0) >; def : GCNPat < (i64 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)), - (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)) + (DS_SUB_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)) >; def : GCNPat < (i32 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)), (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)), + (DS_SUB_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)), VReg_64)), sub0) >; @@ -1488,6 +1485,12 @@ let AssemblerPredicate = isGFX12Plus in { def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>; def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>; +// Additional aliases for ds load transpose instructions. 
+def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>; + //===----------------------------------------------------------------------===// // GFX11. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e0375ea..b2dfd09 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -528,12 +528,26 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI, break; case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: + Imm = getInlineImmValF16(Imm); + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: Imm = getInlineImmValF16(Imm); break; + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: { + // V_PK_FMAC_F16 on GFX11+ duplicates the f16 inline constant to both + // halves, so we need to produce the duplicated value for correct + // round-trip. 
+ if (isGFX11Plus()) { + int64_t F16Val = getInlineImmValF16(Imm); + Imm = (F16Val << 16) | (F16Val & 0xFFFF); + } else { + Imm = getInlineImmValF16(Imm); + } + break; + } case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: @@ -566,7 +580,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 // encodings - if (isGFX1250() && Bytes.size() >= 16) { + if (isGFX1250Plus() && Bytes.size() >= 16) { std::bitset<128> DecW = eat16Bytes(Bytes); if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS)) break; @@ -595,6 +609,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS)) break; + if (isGFX13() && + tryDecodeInst(DecoderTableGFX1396, DecoderTableGFX13_FAKE1696, MI, + DecW, Address, CS)) + break; + if (STI.hasFeature(AMDGPU::Feature64BitLiterals)) { // Return 8 bytes for a potential literal. Bytes = Bytes_.slice(4, MaxInstBytesNum - 4); @@ -680,6 +699,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS)) break; + if (isGFX13() && + tryDecodeInst(DecoderTableGFX1364, DecoderTableGFX13_FAKE1664, MI, QW, + Address, CS)) + break; + // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); } @@ -727,6 +751,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, Address, CS)) break; + + if (isGFX13() && + tryDecodeInst(DecoderTableGFX1332, DecoderTableGFX13_FAKE1632, MI, DW, + Address, CS)) + break; } return MCDisassembler::Fail; @@ -892,6 +921,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // have EXEC as implicit destination. Issue a warning if encoding for // vdst is not EXEC. 
if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) && + MCII->get(MI.getOpcode()).getNumDefs() == 0 && MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) { auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO); if (Bytes_[0] != ExecEncoding) @@ -1198,8 +1228,8 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { // Given a wide tuple \p Reg check if it will overflow 256 registers. // \returns \p Reg on success or NoRegister otherwise. -static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, - const MCRegisterInfo &MRI) { +static MCRegister CheckVGPROverflow(MCRegister Reg, const MCRegisterClass &RC, + const MCRegisterInfo &MRI) { unsigned NumRegs = RC.getSizeInBits() / 32; MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0); if (!Sub0) @@ -1213,7 +1243,7 @@ static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, assert(BaseReg && "Only vector registers expected"); - return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister; + return (Sub0 - BaseReg + NumRegs <= 256) ? 
Reg : MCRegister(); } // Note that before gfx10, the MIMG encoding provided no information about @@ -1455,9 +1485,8 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V, return MCOperand(); } -inline -MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const { - return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI)); +inline MCOperand AMDGPUDisassembler::createRegOperand(MCRegister Reg) const { + return MCOperand::createReg(AMDGPU::getMCReg(Reg, STI)); } inline @@ -1597,6 +1626,9 @@ AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, case AMDGPU::OPERAND_REG_IMM_V2FP16: UseLit = AMDGPU::isInlinableLiteralV2F16(Val); break; + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: + UseLit = AMDGPU::isPKFMACF16InlineConstant(Val, isGFX11Plus()); + break; case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: break; case AMDGPU::OPERAND_REG_IMM_INT16: @@ -2225,6 +2257,16 @@ bool AMDGPUDisassembler::isGFX12Plus() const { bool AMDGPUDisassembler::isGFX1250() const { return AMDGPU::isGFX1250(STI); } +bool AMDGPUDisassembler::isGFX1250Plus() const { + return AMDGPU::isGFX1250Plus(STI); +} + +bool AMDGPUDisassembler::isGFX13() const { return AMDGPU::isGFX13(STI); } + +bool AMDGPUDisassembler::isGFX13Plus() const { + return AMDGPU::isGFX13Plus(STI); +} + bool AMDGPUDisassembler::hasArchitectedFlatScratch() const { return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch); } @@ -2381,7 +2423,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( } // Bits [27]. - if (isGFX1250()) { + if (isGFX1250Plus()) { PRINT_PSEUDO_DIRECTIVE_COMMENT("FLAT_SCRATCH_IS_NV", COMPUTE_PGM_RSRC1_GFX125_FLAT_SCRATCH_IS_NV); } else { @@ -2395,7 +2437,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( // Bits [29-31]. if (isGFX10Plus()) { // WGP_MODE is not available on GFX1250. 
- if (!isGFX1250()) { + if (!isGFX1250Plus()) { PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode", COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE); } @@ -2526,7 +2568,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( } // Bits [14-21]. - if (isGFX1250()) { + if (isGFX1250Plus()) { PRINT_DIRECTIVE(".amdhsa_named_barrier_count", COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT); PRINT_PSEUDO_DIRECTIVE_COMMENT( diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index d103d79..28f71d8 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -69,7 +69,7 @@ public: const char* getRegClassName(unsigned RegClassID) const; - MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(MCRegister Reg) const; MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const; @@ -182,6 +182,9 @@ public: bool isGFX12() const; bool isGFX12Plus() const; bool isGFX1250() const; + bool isGFX1250Plus() const; + bool isGFX13() const; + bool isGFX13Plus() const; bool hasArchitectedFlatScratch() const; bool hasKernargPreload() const; diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index dadc7dc..a2e3ece 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -505,7 +505,6 @@ def : AMDGPUPat < (fshr i32:$src0, i32:$src1, i32:$src2), (BIT_ALIGN_INT_eg $src0, $src1, $src2) >; -def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; def FMA_eg : FMA_Common<0x7>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 6de59be..63460b5 100644 --- 
a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -229,13 +229,13 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> { class FLAT_Load_Pseudo< string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> + RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let OutOperandList = (outs vdata_op:$vdst); let InOperandList = !con( !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), - (ins VaddrRC:$vaddr, flat_offset:$offset), + (ins VaddrOp:$vaddr, flat_offset:$offset), // FIXME: Operands with default values do not work with following // non-optional operands. !if(HasTiedOutput, (ins CPol:$cpol, vdata_op:$vdst_in), @@ -262,15 +262,25 @@ multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterOperand regClass = AVLdS multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { defm "" : FLAT_Flat_Load_Pseudo<opName, AVLdSt_32, 1>; - let True16Predicate = UseRealTrue16Insts in - defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPROp_16>, True16D16Table<NAME#"_HI", NAME>; + + defvar Name16 = opName#"_t16"; + let True16Predicate = UseRealTrue16Insts in { + def _t16 : FLAT_Load_Pseudo<Name16, VGPROp_16>, + GlobalSaddrTable<0, Name16>, + True16D16Table<NAME#"_HI", NAME>; + + let OtherPredicates = [HasFlatGVSMode] in + def _t16_SADDR : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>, + GlobalSaddrTable<1, Name16>, + True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">; + } } class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> { + RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let InOperandList = !con( - (ins VaddrRC:$vaddr, 
vdataClass:$vdata), + (ins VaddrOp:$vaddr, vdataClass:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), (ins flat_offset:$offset, CPol_0:$cpol)); let AsmOperands = " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"; @@ -380,15 +390,16 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> { // Async loads, introduced in gfx1250, will store directly // to a DS address in vdst (they will not use M0 for DS addess). -class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< +class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0, bit IsLegacyLDSDMA = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(IsAsync, (ins VGPR_32:$vdst), (ins)), - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, (ins VGPROp_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPROp_32:$vaddr), (ins VGPROp_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol), + !if(IsLegacyLDSDMA, (ins i1imm_0:$IsAsync), (ins))), !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = !not(IsAsync); + let LGKM_CNT = 0; let VM_CNT = !not(IsAsync); let ASYNC_CNT = IsAsync; let is_flat_global = 1; @@ -406,10 +417,10 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0, bit IsLegacyLDSDMA = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync, IsLegacyLDSDMA>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync, IsLegacyLDSDMA>, 
GlobalSaddrTable<1, opName>; } @@ -417,7 +428,7 @@ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_P opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins VGPR_32:$vdata), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPROp_32:$vaddr), (ins VGPROp_64:$vaddr)), (ins VGPROp_32:$vdata), (ins flat_offset:$offset, CPol_0:$cpol)), " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let VM_CNT = 0; @@ -511,7 +522,7 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } -class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$vaddr), string asm = " $vaddr"> : +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VGPROp_64:$vaddr), string asm = " $vaddr"> : FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { let has_vdst = 0; let has_data = 0; @@ -524,7 +535,7 @@ class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$v multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { def "" : FLAT_Prefetch_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPROp_32:$vaddr), " $vaddr, $saddr">, GlobalSaddrTable<1, opName> { let OtherPredicates = [HasFlatGVSMode]; let enabled_saddr = 1; @@ -533,9 +544,9 @@ multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { multiclass FLAT_Global_Prefetch_Pseudo<string opName> { let is_flat_global = 1, has_saddr = 1 in { - def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64_AlignTarget:$vaddr), " $vaddr, off">, + def "" : FLAT_Prefetch_Pseudo<opName, (ins VGPROp_64:$vaddr), " $vaddr, off">, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + def _SADDR : 
FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPROp_32:$vaddr), " $vaddr, $saddr">, GlobalSaddrTable<1, opName> { let enabled_saddr = 1; } @@ -557,11 +568,11 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterOperand regClass = AVLdSt (outs regClass:$vdst), !con( !if(EnableSVE, - (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), + (ins VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), !if(EnableSaddr, (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), !if(EnableVaddr, - (ins VGPR_32:$vaddr, flat_offset:$offset), + (ins VGPROp_32:$vaddr, flat_offset:$offset), (ins flat_offset:$offset)))), !if(HasTiedOutput, (ins CPol:$cpol, regClass:$vdst_in), (ins CPol_0:$cpol))), @@ -584,11 +595,11 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterOperand vdata_op, bit En opName, (outs), !if(EnableSVE, - (ins vdata_op:$vdata, VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), + (ins vdata_op:$vdata, VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), !if(EnableSaddr, (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), !if(EnableVaddr, - (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol), + (ins vdata_op:$vdata, VGPROp_32:$vaddr, flat_offset:$offset, CPol_0:$cpol), (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol)))), " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { let is_flat_scratch = 1; @@ -687,11 +698,11 @@ class FLAT_Scratch_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, opName, (outs ), !if(EnableSVE, - (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol), + (ins VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol), !if(EnableSaddr, (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol), !if(EnableVaddr, - (ins VGPR_32:$vaddr, flat_offset:$offset, CPol:$cpol), + (ins VGPROp_32:$vaddr, 
flat_offset:$offset, CPol:$cpol), (ins flat_offset:$offset, CPol:$cpol)))), " "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { @@ -754,7 +765,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< RegisterOperand data_op = vdst_op> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64_AlignTarget:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), + (ins VGPROp_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), " $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName> { let FPAtomic = data_vt.isFP; @@ -763,7 +774,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), + (ins VGPROp_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), " $vaddr, $vdata, $saddr$offset$cpol">, GlobalSaddrTable<1, opName> { let OtherPredicates = [HasFlatGVSMode]; @@ -786,7 +797,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_vgpr:$vdst), - (ins VReg_64_AlignTarget:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VGPROp_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; @@ -795,7 +806,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_vgpr:$vdst), - (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VGPROp_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, GlobalSaddrTable<1, opName#"_rtn"> { let OtherPredicates = [HasFlatGVSMode]; @@ -811,7 +822,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_agpr:$vdst), - (ins 
VReg_64_AlignTarget:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VGPROp_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn_agpr"> { let FPAtomic = data_vt.isFP; @@ -837,10 +848,10 @@ class FLAT_Global_Atomic_Pseudo_NO_RTN< ValueType data_vt = vt, RegisterOperand data_op = vdst_op, bit EnableSaddr = false, - RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> + RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_AtomicNoRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName> { let InOperandList = !con( - (ins VaddrRC:$vaddr, data_op:$vdata), + (ins VaddrOp:$vaddr, data_op:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), (ins flat_offset:$offset, CPol_0:$cpol)); let AsmOperands = " $vaddr, $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"; @@ -867,7 +878,7 @@ class FLAT_Global_Atomic_Pseudo_RTN< RegisterOperand data_op = vdst_op, bit EnableSaddr = false, bit IsVGPR = false, - RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> + RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_AtomicRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName#"_rtn"#!if(IsVGPR, "", "_agpr")> { defvar vdst_rc= !if(IsVGPR, getEquivalentVGPROperand<vdst_op>.ret, getEquivalentAGPROperand<vdst_op>.ret); @@ -875,7 +886,7 @@ class FLAT_Global_Atomic_Pseudo_RTN< let OutOperandList = (outs vdst_rc:$vdst); let InOperandList = !con( - (ins VaddrRC:$vaddr, data_rc:$vdata), + (ins VaddrOp:$vaddr, data_rc:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), (ins flat_offset:$offset, CPol_GLC1:$cpol)); let AsmOperands = " $vdst, $vaddr, $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"; @@ -1202,15 +1213,15 @@ let SubtargetPredicate = HasGFX10_BEncoding in { VGPROp_32, i32>; } -defm 
GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">; -defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">; -defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort">; -defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; -defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte", 0, 1>; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte", 0, 1>; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort", 0, 1>; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort", 0, 1>; +defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword", 0, 1>; let SubtargetPredicate = HasGFX950Insts in { -defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">; -defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">; +defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3", 0, 1>; +defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4", 0, 1>; } let SubtargetPredicate = isGFX12PlusNot12_50 in @@ -1224,7 +1235,7 @@ let SubtargetPredicate = isGFX12Plus in { def GLOBAL_WBINV : FLAT_Global_Invalidate_Writeback<"global_wbinv">; } // End SubtargetPredicate = isGFX12Plus -let SubtargetPredicate = isGFX1250Plus in { +let SubtargetPredicate = HasMcastLoadInsts in { let Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 in { defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b8", 1>; @@ -1243,7 +1254,7 @@ defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_s def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : 
FLAT_Global_Tensor_Pseudo<"tensor_stop">; -} // End SubtargetPredicate = isGFX1250Plus +} // End SubtargetPredicate = HasMcastLoadInsts defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte">; defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte">; @@ -1404,62 +1415,62 @@ class FlatSignedLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType >; class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) >; class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16) >; class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0), + 
(node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0), (inst $dsaddr, $vaddr, $offset, $cpol) >; class GlobalLoadLDSSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0), + (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0), (inst $dsaddr, $saddr, $voffset, $offset, $cpol) >; class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), (inst $dsaddr, $vaddr, $offset, $cpol) >; class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm)), (inst $dsaddr, $saddr, $voffset, $offset, $cpol) >; class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), (inst $vaddr, $dsaddr, $offset, $cpol) >; class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 
timm)), (inst $saddr, $voffset, $dsaddr, $offset, $cpol) >; class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) >; class GlobalLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16) >; @@ -1469,7 +1480,7 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> >; class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) >; @@ -1479,7 +1490,7 @@ class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType >; class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)), + (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)), (inst $saddr, $voffset, $offset, $cpol) >; @@ -1489,19 +1500,19 @@ class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp >; class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, 
CPol:$cpol), (i32 timm))), + (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))), (inst $saddr, $voffset, $offset, $cpol) >; class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), + (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol)), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol) >; class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat, ValueType vt, ValueType data_vt = vt> : GCNPat < - (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), + (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; @@ -1509,7 +1520,7 @@ class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPatte class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$data), + (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$data), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol) >; @@ -1539,7 +1550,7 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let 
OtherPredicates = inst.OtherPredicates; } @@ -1552,10 +1563,6 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType } } -multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix, - ValueType vt> : - FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>; - multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>; @@ -1568,7 +1575,7 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, defvar rtnNode = !cast<SDPatternOperator>(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1580,10 +1587,6 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, } } -multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix, - ValueType vt> : - FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>; - multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>; @@ -1612,7 +1615,7 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt, class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let 
OtherPredicates = inst.OtherPredicates; } @@ -1631,27 +1634,27 @@ multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt, } class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))), (inst $vaddr, $offset) >; class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset), vt:$in), + (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) >; class ScratchLoadSignedPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))), (inst $vaddr, $offset, 0) >; class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))), (EXTRACT_SUBREG (inst $vaddr, $offset), lo16) >; class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)), + (node vt:$data, (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset)), (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset) >; @@ -1682,28 +1685,28 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, >; class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), (inst $vaddr, $saddr, $offset, $cpol) >; class ScratchStoreSVaddrPat <FLAT_Pseudo inst, 
SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)), + (node vt:$data, (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)), (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset, $cpol) >; class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)), + (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)), (inst $vaddr, $saddr, $offset, $cpol, $in) >; class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), (inst $vaddr, $saddr, $offset, $cpol) >; class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16) >; @@ -2169,14 +2172,16 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>; defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } +let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in { + defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >; + + defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>; +} } // end foreach as defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; -defm : 
FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; -defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; - let OtherPredicates = [HasD16LoadStore] in { defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; @@ -2340,10 +2345,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; -defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>; let SubtargetPredicate = HasAtomicCSubNoRtnInsts in -defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; +defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; @@ -2360,10 +2365,8 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; let SubtargetPredicate = isGFX12Plus in { - defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; - - let SubtargetPredicate = HasAtomicCSubNoRtnInsts in - defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", 
"global_addrspace", i32>; + defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>; + defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>; } let OtherPredicates = [isGFX12PlusNot12_50] in @@ -2387,13 +2390,13 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] } let OtherPredicates = [isGFX125xOnly] in { - def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>; - def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>; - def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>; + def : FlatLoadPat <FLAT_LOAD_MONITOR_B32, AMDGPUflat_load_monitor, i32>; + def : FlatLoadPat <FLAT_LOAD_MONITOR_B64, AMDGPUflat_load_monitor, v2i32>; + def : FlatLoadPat <FLAT_LOAD_MONITOR_B128, AMDGPUflat_load_monitor, v4i32>; - defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>; - defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>; - defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B32, AMDGPUglobal_load_monitor, i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B64, AMDGPUglobal_load_monitor, v2i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B128, AMDGPUglobal_load_monitor, v4i32>; } // End SubtargetPredicate = isGFX125xOnly let OtherPredicates = [isGFX1250Plus] in { @@ -2450,7 +2453,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; -let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [HasFlatScratchInsts, HasFlatScratchEnabled] in { defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, 
extloadi8_private, i32>; defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i32>; @@ -2508,12 +2511,12 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i32>; defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>; defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>; -let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, HasFlatScratchEnabled] in { defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>; defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>; } -let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, HasFlatScratchEnabled] in { defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>; defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>; defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>; @@ -2529,7 +2532,7 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2i defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f16>; } -} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch] +} // End OtherPredicates = [HasFlatScratchInsts,HasFlatScratchEnabled] def PrefetchLoc: SDNodeXForm<timm, [{ uint32_t V = N->getZExtValue(); @@ -2568,7 +2571,7 @@ multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatt } def : GCNPat < - (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), (!cast<FLAT_Pseudo>(inst#"_SADDR") 
$saddr, $voffset, $offset, (i32 (PrefetchLoc $loc))) > { let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30); @@ -2582,7 +2585,7 @@ multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> { >; def : GCNPat < - (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol), + (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset), timm:$cpol), (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> { let AddedComplexity = 11; } @@ -3642,17 +3645,6 @@ multiclass VFLAT_Real_gfx1250<bits<8> op, } } -multiclass VFLAT_Aliases_gfx1250<string name> { - defvar ps = get_FLAT_ps<NAME>; - if !ne(ps.Mnemonic, name) then - def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX125xOnly]>; -} - -multiclass VFLAT_Real_Base_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : - VFLAT_Aliases_gfx1250<name> { - defm "" : VFLAT_Real_gfx1250<op, name>; -} - multiclass VFLAT_Real_RTN_gfx1250<bits<8> op, string name> { defm _RTN : VFLAT_Real_gfx1250<op, name>; } @@ -3665,9 +3657,14 @@ multiclass VFLAT_Real_SADDR_RTN_gfx1250<bits<8> op, string name> { defm _SADDR_RTN : VFLAT_Real_gfx1250<op, name>; } -multiclass VFLAT_Real_AllAddr_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : - VFLAT_Real_Base_gfx1250<op, name>, - VFLAT_Real_SADDR_gfx1250<op, name>; +multiclass VFLAT_Real_AllAddr_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, + list<Predicate> aliasPreds = [isGFX125xOnly]> : + VFLAT_Real_gfx1250<op, name>, + VFLAT_Real_SADDR_gfx1250<op, name> { + defvar ps = get_FLAT_ps<NAME>; + if !ne(ps.Mnemonic, name) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<aliasPreds>; +} multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : VFLAT_Real_AllAddr_gfx1250<op, name>, @@ -3711,6 +3708,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm 
GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +// Additional aliases for global load transpose instructions. +def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>; + defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index 27f40f1..72805aa 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -134,6 +134,7 @@ public: LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n"); const SIInstrInfo *SII = ST->getInstrInfo(); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); bool Changed = false; unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(*ST); bool HasVOPD3 = ST->hasVOPD3(); @@ -160,16 +161,25 @@ public: llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3); - if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) + if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y && + llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI, VOPD3)) { CI = VOPDCombineInfo(FirstMI, SecondMI, VOPD3); - else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) + return true; + } + // We can try swapping the order of the instructions, but in that case + // neither instruction can write to a register the other reads from. + // OpX cannot write something OpY reads because that is the hardware + // rule, and OpY cannot write what OpX reads because that would + // violate the data dependency in the original order. 
+ for (const auto &Use : SecondMI->uses()) + if (Use.isReg() && FirstMI->modifiesRegister(Use.getReg(), TRI)) + return false; + if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X && + llvm::checkVOPDRegConstraints(*SII, *SecondMI, *FirstMI, VOPD3)) { CI = VOPDCombineInfo(SecondMI, FirstMI, VOPD3); - else - return false; - // checkVOPDRegConstraints cares about program order, but doReplace - // cares about X-Y order in the constituted VOPD - return llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI, - VOPD3); + return true; + } + return false; }; if (checkVOPD(false) || (HasVOPD3 && checkVOPD(true))) { diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 464cbec..6ba669f 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -43,6 +43,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" using namespace llvm; @@ -256,7 +257,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()), *MRI)); auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI); - DPPInst.addReg(CombOldVGPR.Reg, Def ? 
0 : RegState::Undef, + DPPInst.addReg(CombOldVGPR.Reg, getUndefRegState(!Def), CombOldVGPR.SubReg); ++NumOperands; } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 && diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a911e7e..d504d86 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -16,6 +16,7 @@ #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/TargetParser/TargetParser.h" @@ -190,6 +191,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (checkFPAtomicToDenormModeHazard(MI) > 0) return HazardType; + // Hazards which cannot be mitigated with S_NOPs. + if (!IsHazardRecognizerMode) { + if (checkWMMACoexecutionHazards(MI) > 0) + return Hazard; + } + if (ST.hasNoDataDepHazard()) return NoHazard; @@ -435,10 +442,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; - -using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; -using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; // Search for a hazard in a block and its predecessors. template <typename StateT> @@ -546,11 +550,14 @@ hasHazard(StateT InitialState, // Returns a minimum wait states since \p I walking all predecessors. // Only scans until \p IsExpired does not return true. // Can only be run in a hazard recognizer mode. 
-static int getWaitStatesSince( - GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, - MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, - IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, - GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { +static int +getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + const MachineBasicBlock *MBB, + MachineBasicBlock::const_reverse_instr_iterator I, + int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, + DenseSet<const MachineBasicBlock *> &Visited, + GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates = + SIInstrInfo::getNumWaitStates) { for (auto E = MBB->instr_rend(); I != E; ++I) { // Don't add WaitStates for parent BUNDLE instructions. if (I->isBundle()) @@ -582,20 +589,26 @@ static int getWaitStatesSince( return MinWaitStates; } -static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, - const MachineInstr *MI, IsExpiredFn IsExpired) { +static int +getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + const MachineInstr *MI, + GCNHazardRecognizer::IsExpiredFn IsExpired, + GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates = + SIInstrInfo::getNumWaitStates) { DenseSet<const MachineBasicBlock *> Visited; return getWaitStatesSince(IsHazard, MI->getParent(), std::next(MI->getReverseIterator()), 0, IsExpired, - Visited, SIInstrInfo::getNumWaitStates); + Visited, GetNumWaitStates); } -int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { +int GCNHazardRecognizer::getWaitStatesSince( + IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) { if (IsHazardRecognizerMode) { auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { return WaitStates >= Limit; }; - return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); + return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn, + GetNumWaitStates); } int WaitStates = 0; @@ -607,7 +620,7 
@@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { if (MI->isInlineAsm()) continue; } - ++WaitStates; + WaitStates += MI ? GetNumWaitStates(*MI) : 1; if (WaitStates >= Limit) break; @@ -615,6 +628,10 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { return std::numeric_limits<int>::max(); } +int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { + return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates); +} + int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit) { @@ -643,7 +660,7 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg) { for (MCRegUnit Unit : TRI.regunits(Reg)) - BV.set(Unit); + BV.set(static_cast<unsigned>(Unit)); } static void addRegsToSet(const SIRegisterInfo &TRI, @@ -1243,6 +1260,20 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); } +// emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need +// to insert, negative means not needed. +bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) { + if (WaitStatesNeeded <= 0) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + for (int I = 0; I < WaitStatesNeeded; ++I) + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_NOP_e32)); + + return true; +} + void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVMEMtoScalarWriteHazards(MI); fixVcmpxPermlaneHazards(MI); @@ -1257,7 +1288,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVALUTransUseHazard(MI); fixVALUTransCoexecutionHazards(MI); fixWMMAHazards(MI); // fall-through if co-execution is enabled. 
- fixWMMACoexecutionHazards(MI); + emitVNops(MI, checkWMMACoexecutionHazards(MI)); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); fixRequiredExportPriority(MI); @@ -1306,8 +1337,8 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { bool IsUndef = Src0->isUndef(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32)) - .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) - .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill); + .addReg(Reg, RegState::Define | getDeadRegState(IsUndef)) + .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill); return true; } @@ -1354,7 +1385,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); return true; } @@ -1487,7 +1518,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST)); return true; } @@ -1502,9 +1533,8 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, bool HasVmem = false; for (auto &MBB : MF) { for (auto &MI : MBB) { - HasLds |= SIInstrInfo::isDS(MI); - HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || - SIInstrInfo::isSegmentSpecificFLAT(MI); + HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI); + HasVmem |= SIInstrInfo::isVMEM(MI); if (HasLds && HasVmem) return true; } @@ -1526,10 +1556,9 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { assert(!ST.hasExtendedWaitCounts()); auto IsHazardInst = [](const MachineInstr &MI) { - if (SIInstrInfo::isDS(MI)) + if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI)) return 1; 
- if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || - SIInstrInfo::isSegmentSpecificFLAT(MI)) + if (SIInstrInfo::isVMEM(MI)) return 2; return 0; }; @@ -1653,7 +1682,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { } else { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); } return true; @@ -1811,7 +1840,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST)); return true; } @@ -1897,13 +1926,13 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { // avoided. BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST)); return true; } bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) { - if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled. + if (!ST.hasGFX1250Insts() || // Coexecution disabled. 
!SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI)) return false; @@ -2047,13 +2076,13 @@ static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, return false; } -bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { - if (!AMDGPU::isGFX1250(ST)) - return false; +int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) { + if (!ST.hasGFX1250Insts()) + return 0; const SIInstrInfo *TII = ST.getInstrInfo(); if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI)) - return false; + return 0; const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -2131,9 +2160,6 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { }; int Limit = 0; - auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) { - return WaitStates >= Limit; - }; auto GetWaitStatesFn = [](const MachineInstr &I) { return SIInstrInfo::isVALU(I) ? 1 : 0; @@ -2143,38 +2169,26 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { if (TII->isXDLWMMA(*MI)) { for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { Limit = WMMAWaitStates[Category]; // for IsExpiredFn. - DenseSet<const MachineBasicBlock *> Visited; - // '::getWaitStatesSince' returns the number of VALUs in between if hazard + // 'getWaitStatesSince' returns the number of VALUs in between if hazard // exists, and INT_MAX if there is no hazard. As a result, a negative // WaitStatesNeeded here means no hazard, and we will continue to search // for other categories. WaitStatesNeeded = - Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(), - std::next(MI->getReverseIterator()), 0, - IsExpiredFn, Visited, GetWaitStatesFn); + Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn); } } else { // Must be a co-executable VALU. for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { Limit = VALUWaitStates[Category]; // for IsExpiredFn. 
- DenseSet<const MachineBasicBlock *> Visited; - // '::getWaitStatesSince' returns the number of VALUs in between if hazard + // 'getWaitStatesSince' returns the number of VALUs in between if hazard // exists, and INT_MAX if there is no hazard. As a result, a negative // WaitStatesNeeded here means no hazard, and we will continue to search // for other categories. WaitStatesNeeded = - Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(), - std::next(MI->getReverseIterator()), 0, - IsExpiredFn, Visited, GetWaitStatesFn); + Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn); } } - // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative - // means not needed. - for (int i = 0; i < WaitStatesNeeded; i++) - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_NOP_e32)); - - return true; + return WaitStatesNeeded; } bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { @@ -2204,16 +2218,33 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) return false; - MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); - bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); - bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); - bool Overlapped = OverlappedSrc || OverlappedDst; - - assert(!OverlappedDst || !OverlappedSrc || - Src1->getReg() == MI->getOperand(0).getReg()); assert(ST.needsAlignedVGPRs()); static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); + const DebugLoc &DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); + + // In: + // + // Dst = shiftrev64 Amt, Src1 + // + // if Dst!=Src1 then avoid the bug with: + // + // Dst.sub0 = Amt + // Dst = shift64 Dst.sub0, Src1 + + Register DstReg = MI->getOperand(0).getReg(); + if (!Src1->isReg() || Src1->getReg() != DstReg) { + Register DstLo = 
TRI.getSubReg(DstReg, AMDGPU::sub0); + runOnInstruction( + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt)); + Amt->setReg(DstLo); + Amt->setIsKill(true); + return true; + } + + bool Overlapped = MI->modifiesRegister(AmtReg, &TRI); Register NewReg; for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass : AMDGPU::VGPR_32RegClass) { @@ -2230,8 +2261,6 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (Overlapped) NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0); - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); // Insert a full wait count because found register might be pending a wait. BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT)) .addImm(0); @@ -2269,9 +2298,8 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { Amt->setIsKill(false); // We do not update liveness, so verifier may see it as undef. Amt->setIsUndef(); - if (OverlappedDst) + if (Overlapped) { MI->getOperand(0).setReg(NewReg); - if (OverlappedSrc) { Src1->setReg(NewReg); Src1->setIsKill(false); Src1->setIsUndef(); @@ -3267,29 +3295,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return false; assert(!ST.hasExtendedWaitCounts()); - if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) + if (!ST.isWave64()) + return false; + + const bool IsSALU = SIInstrInfo::isSALU(*MI); + const bool IsVALU = SIInstrInfo::isVALU(*MI); + if (!IsSALU && !IsVALU) return false; // The hazard sequence is three instructions: // 1. VALU reads SGPR as mask - // 2. SALU writes SGPR - // 3. SALU reads SGPR - // The hazard can expire if the distance between 2 and 3 is sufficient. - // In practice this happens <10% of the time, hence this always assumes - // the hazard exists if 1 and 2 are present to avoid searching. + // 2. VALU/SALU writes SGPR + // 3. VALU/SALU reads SGPR + // The hazard can expire if the distance between 2 and 3 is sufficient, + // or (2) is VALU and (3) is SALU. 
+ // In practice this happens <10% of the time, hence always assume the hazard + // exists if (1) and (2) are present to avoid searching all SGPR reads. - const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); - if (!SDSTOp || !SDSTOp->isReg()) - return false; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IgnoreableSGPR = [](const Register Reg) { + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::M0: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + case AMDGPU::SCC: + return true; + default: + return false; + } + }; + auto IsVCC = [](const Register Reg) { + return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI; + }; - const Register HazardReg = SDSTOp->getReg(); - if (HazardReg == AMDGPU::EXEC || - HazardReg == AMDGPU::EXEC_LO || - HazardReg == AMDGPU::EXEC_HI || - HazardReg == AMDGPU::M0) + struct StateType { + SmallSet<Register, 2> HazardSGPRs; + + static unsigned getHashValue(const StateType &State) { + return hash_combine_range(State.HazardSGPRs); + } + static bool isEqual(const StateType &LHS, const StateType &RHS) { + return LHS.HazardSGPRs == RHS.HazardSGPRs; + } + }; + + SmallVector<const MachineInstr *> WaitInstrs; + bool HasSGPRRead = false; + StateType InitialState; + + // Look for SGPR write. + MachineOperand *HazardDef = nullptr; + for (MachineOperand &Op : MI->operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef() && HazardDef) + continue; + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) + continue; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + // Also check for SGPR reads. 
+ if (Op.isUse()) { + HasSGPRRead = true; + continue; + } + + assert(!HazardDef); + HazardDef = &Op; + } + + if (!HazardDef) return false; - auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { + // Setup to track writes to individual SGPRs + const Register HazardReg = HazardDef->getReg(); + if (AMDGPU::SReg_32RegClass.contains(HazardReg)) { + InitialState.HazardSGPRs.insert(HazardReg); + } else { + assert(AMDGPU::SReg_64RegClass.contains(HazardReg)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1)); + } + + auto IsHazardFn = [&](StateType &State, const MachineInstr &I) { + if (State.HazardSGPRs.empty()) + return HazardExpired; + switch (I.getOpcode()) { case AMDGPU::V_ADDC_U32_e32: case AMDGPU::V_ADDC_U32_dpp: @@ -3304,11 +3406,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { case AMDGPU::V_SUBB_U32_e32: case AMDGPU::V_SUBB_U32_dpp: case AMDGPU::V_SUBBREV_U32_e32: - case AMDGPU::V_SUBBREV_U32_dpp: + case AMDGPU::V_SUBBREV_U32_dpp: { // These implicitly read VCC as mask source. - return HazardReg == AMDGPU::VCC || - HazardReg == AMDGPU::VCC_LO || - HazardReg == AMDGPU::VCC_HI; + return IsVCC(HazardReg) ? HazardFound : NoHazardFound; + } case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_ADDC_U32_e64_dpp: case AMDGPU::V_CNDMASK_B16_t16_e64: @@ -3324,68 +3425,110 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { // Only check mask register overlaps. const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); assert(SSRCOp); - return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); + bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg); + return Result ? 
HazardFound : NoHazardFound; } default: - return false; + return NoHazardFound; } }; - const MachineRegisterInfo &MRI = MF.getRegInfo(); - auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { - // s_waitcnt_depctr sa_sdst(0) mitigates hazard. - if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) - return true; - - // VALU access to any SGPR or literal constant other than HazardReg - // mitigates hazard. No need to check HazardReg here as this will - // only be called when !IsHazardFn. - if (!SIInstrInfo::isVALU(I)) - return false; - for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { - const MachineOperand &Op = I.getOperand(OpNo); - if (Op.isReg()) { - Register OpReg = Op.getReg(); - // Only consider uses - if (!Op.isUse()) + const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst( + AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0, ST), + 0), + 0); + auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) { + switch (I.getOpcode()) { + case AMDGPU::S_WAITCNT_DEPCTR: + // Record mergable waits within region of instructions free of SGPR reads. + if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() && + (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits) + WaitInstrs.push_back(&I); + break; + default: + // Update tracking of SGPR reads and writes. 
+ for (auto &Op : I.operands()) { + if (!Op.isReg()) continue; - // Ignore EXEC - if (OpReg == AMDGPU::EXEC || - OpReg == AMDGPU::EXEC_LO || - OpReg == AMDGPU::EXEC_HI) + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) continue; - // Ignore all implicit uses except VCC - if (Op.isImplicit()) { - if (OpReg == AMDGPU::VCC || - OpReg == AMDGPU::VCC_LO || - OpReg == AMDGPU::VCC_HI) - return true; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + if (Op.isUse()) { + HasSGPRRead = true; continue; } - if (TRI.isSGPRReg(MRI, OpReg)) - return true; - } else { - const MCInstrDesc &InstDesc = I.getDesc(); - const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; - if (!TII.isInlineConstant(Op, OpInfo)) - return true; + + // Stop tracking any SGPRs with writes on the basis that they will + // already have an appropriate wait inserted afterwards. + SmallVector<Register, 2> Found; + for (Register SGPR : State.HazardSGPRs) { + if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR)) + Found.push_back(SGPR); + } + for (Register SGPR : Found) + State.HazardSGPRs.erase(SGPR); } + break; } - return false; }; // Check for hazard - if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == - std::numeric_limits<int>::max()) + if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn, + MI->getParent(), + std::next(MI->getReverseIterator()))) return false; - auto NextMI = std::next(MI->getIterator()); + // Compute counter mask + unsigned DepCtr = + IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST) + : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST)) + : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST); + + // Try to merge previous waits into this one for regions with no SGPR reads. + if (!WaitInstrs.empty()) { + // Note: WaitInstrs contains const pointers, so walk backward from MI to + // obtain a mutable pointer to each instruction to be merged. + // This is expected to be a very short walk within the same block. 
+ SmallVector<MachineInstr *> ToErase; + unsigned Found = 0; + for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(), + End = MI->getParent()->rend(); + Found < WaitInstrs.size() && It != End; ++It) { + MachineInstr *WaitMI = &*It; + // Find next wait instruction. + if (std::as_const(WaitMI) != WaitInstrs[Found]) + continue; + Found++; + unsigned WaitMask = WaitMI->getOperand(0).getImm(); + assert((WaitMask & ConstantMaskBits) == ConstantMaskBits); + DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask), + AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr))); + ToErase.push_back(WaitMI); + } + assert(Found == WaitInstrs.size()); + for (MachineInstr *WaitMI : ToErase) + WaitMI->eraseFromParent(); + } - // Add s_waitcnt_depctr sa_sdst(0) after SALU write. + // Add s_waitcnt_depctr after SGPR write. + auto NextMI = std::next(MI->getIterator()); auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(DepCtr); // SALU write may be s_getpc in a bundle. 
updateGetPCBundle(NewMI); @@ -3531,10 +3674,10 @@ bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0xFFE3); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0xFFE3); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); return true; } @@ -3611,7 +3754,7 @@ bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) .addImm(AMDGPU::DepCtr::encodeFieldVaSdst( - AMDGPU::DepCtr::encodeFieldSaSdst(0), 0)); + AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0)); return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 67beffa..d725134 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -32,6 +32,8 @@ class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { public: typedef function_ref<bool(const MachineInstr &)> IsHazardFn; + typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn; + typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn; private: // Distinguish if we are called from scheduler or hazard recognizer @@ -74,6 +76,8 @@ private: // used on a newly inserted instruction before returning from PreEmitNoops. 
void runOnInstruction(MachineInstr *MI); + int getWaitStatesSince(IsHazardFn IsHazard, int Limit, + GetNumWaitStatesFn GetNumWaitStates); int getWaitStatesSince(IsHazardFn IsHazard, int Limit); int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit); int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit); @@ -94,6 +98,9 @@ private: int checkReadM0Hazards(MachineInstr *SMovRel); int checkNSAtoVMEMHazard(MachineInstr *MI); int checkFPAtomicToDenormModeHazard(MachineInstr *MI); + // Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we + // need to insert, negative means not needed. + bool emitVNops(MachineInstr *MI, int WaitStatesNeeded); void fixHazards(MachineInstr *MI); bool fixVcmpxPermlaneHazards(MachineInstr *MI); bool fixVMEMtoScalarWriteHazards(MachineInstr *MI); @@ -106,7 +113,7 @@ private: bool fixVALUTransUseHazard(MachineInstr *MI); bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); - bool fixWMMACoexecutionHazards(MachineInstr *MI); + int checkWMMACoexecutionHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index f253a84..dff153c 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -381,10 +381,14 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, auto Top = R.Begin; for (const auto &I : Schedule) { auto MI = getMachineInstr(I); - if (MI != &*Top) { + + MachineBasicBlock::iterator MII = MI->getIterator(); + if (MII != Top) { + bool NonDebugReordered = + !MI->isDebugInstr() && skipDebugInstructionsForward(Top, MII) != MII; BB->remove(MI); BB->insert(Top, MI); - if (!MI->isDebugInstr()) + if (NonDebugReordered) LIS->handleMove(*MI, true); } if 
(!MI->isDebugInstr()) { diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce69..5529808 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ public: bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try @@ -81,9 +81,7 @@ class GCNNSAReassignLegacy : public MachineFunctionPass { public: static char ID; - GCNNSAReassignLegacy() : MachineFunctionPass(ID) { - initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry()); - } + GCNNSAReassignLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp index 355bbeb..5e9ac56 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp @@ -57,9 +57,7 @@ public: class GCNPreRALongBranchRegLegacy : public MachineFunctionPass { public: static char ID; - GCNPreRALongBranchRegLegacy() : MachineFunctionPass(ID) { - initializeGCNPreRALongBranchRegLegacyPass(*PassRegistry::getPassRegistry()); - } + GCNPreRALongBranchRegLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { return GCNPreRALongBranchReg().run(MF); diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9..cd56887 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -63,9 +63,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass { public: static char ID; - GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) { - 
initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry()); - } + GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -136,7 +134,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); // Immediates are not an issue and can be propagated in // postrapseudos pass. Only handle cases where defining @@ -270,15 +268,14 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { continue; Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); - if (Dst.isVirtual() && - MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && - Src.isPhysical() && + const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, Dst); + bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(DstRC); + if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() && TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); if (Src.isVirtual() && MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && - Dst.isPhysical() && - TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass) + Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass) MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); if (!Dst.isVirtual() || !Src.isVirtual()) continue; @@ -287,8 +284,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); } - if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && - MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) + if (IsDst16Bit && MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src); } } diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td 
b/llvm/lib/Target/AMDGPU/GCNProcessors.td index c6fb31f..9949208 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -304,6 +304,10 @@ def : ProcessorModel<"gfx1153", GFX11SpeedModel, FeatureISAVersion11_5_3.Features >; +def : ProcessorModel<"gfx1170", GFX11SpeedModel, + FeatureISAVersion11_7_0.Features +>; + // [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153] def : ProcessorModel<"gfx11-generic", GFX11SpeedModel, FeatureISAVersion11_Generic.Features @@ -333,3 +337,11 @@ def : ProcessorModel<"gfx1250", GFX1250SpeedModel, def : ProcessorModel<"gfx1251", GFX1250SpeedModel, FeatureISAVersion12_51.Features >; + +//===----------------------------------------------------------------------===// +// GCN GFX13. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1310", GFX12SpeedModel, + FeatureISAVersion13.Features +>; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 4e11c4f..89307ef 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -97,6 +97,51 @@ void GCNRegPressure::inc(unsigned Reg, Value[RegKind] += Sign; } +namespace { +struct RegExcess { + unsigned SGPR = 0; + unsigned VGPR = 0; + unsigned ArchVGPR = 0; + unsigned AGPR = 0; + + bool anyExcess() const { return SGPR || VGPR || ArchVGPR || AGPR; } + bool hasVectorRegisterExcess() const { return VGPR || ArchVGPR || AGPR; } + + RegExcess(const MachineFunction &MF, const GCNRegPressure &RP) + : RegExcess(MF, RP, GCNRPTarget(MF, RP)) {} + RegExcess(const MachineFunction &MF, const GCNRegPressure &RP, + const GCNRPTarget &Target) { + unsigned MaxSGPRs = Target.getMaxSGPRs(); + unsigned MaxVGPRs = Target.getMaxVGPRs(); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + SGPR = std::max(static_cast<int>(RP.getSGPRNum() - MaxSGPRs), 0); + + // The number of virtual VGPRs 
required to handle excess SGPR + unsigned WaveSize = ST.getWavefrontSize(); + unsigned VGPRForSGPRSpills = divideCeil(SGPR, WaveSize); + + unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); + + // Unified excess pressure conditions, accounting for VGPRs used for SGPR + // spills + VGPR = std::max(static_cast<int>(RP.getVGPRNum(ST.hasGFX90AInsts()) + + VGPRForSGPRSpills - MaxVGPRs), + 0); + + unsigned ArchVGPRLimit = ST.hasGFX90AInsts() ? MaxArchVGPRs : MaxVGPRs; + // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR + // spills + ArchVGPR = std::max(static_cast<int>(RP.getArchVGPRNum() + + VGPRForSGPRSpills - ArchVGPRLimit), + 0); + + // AGPR excess pressure conditions + AGPR = std::max(static_cast<int>(RP.getAGPRNum() - ArchVGPRLimit), 0); + } +}; +} // namespace + bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -123,63 +168,25 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, return Occ > OtherOcc; unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); - unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - // SGPR excess pressure conditions - unsigned ExcessSGPR = std::max(static_cast<int>(getSGPRNum() - MaxSGPRs), 0); - unsigned OtherExcessSGPR = - std::max(static_cast<int>(O.getSGPRNum() - MaxSGPRs), 0); - - auto WaveSize = ST.getWavefrontSize(); - // The number of virtual VGPRs required to handle excess SGPR - unsigned VGPRForSGPRSpills = (ExcessSGPR + (WaveSize - 1)) / WaveSize; - unsigned OtherVGPRForSGPRSpills = - (OtherExcessSGPR + (WaveSize - 1)) / WaveSize; + RegExcess Excess(MF, *this); + RegExcess OtherExcess(MF, O); unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); - // Unified excess pressure conditions, accounting for VGPRs used for SGPR - // spills - unsigned ExcessVGPR = - std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) + - VGPRForSGPRSpills - MaxVGPRs), - 0); - unsigned 
OtherExcessVGPR = - std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) + - OtherVGPRForSGPRSpills - MaxVGPRs), - 0); - // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR - // spills - unsigned ExcessArchVGPR = std::max( - static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs), - 0); - unsigned OtherExcessArchVGPR = - std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills - - MaxArchVGPRs), - 0); - // AGPR excess pressure conditions - unsigned ExcessAGPR = std::max( - static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs) - : (getAGPRNum() - MaxVGPRs)), - 0); - unsigned OtherExcessAGPR = std::max( - static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs) - : (O.getAGPRNum() - MaxVGPRs)), - 0); - - bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR; - bool OtherExcessRP = OtherExcessSGPR || OtherExcessVGPR || - OtherExcessArchVGPR || OtherExcessAGPR; + bool ExcessRP = Excess.anyExcess(); + bool OtherExcessRP = OtherExcess.anyExcess(); // Give second precedence to the reduced number of spills to hold the register // pressure. if (ExcessRP || OtherExcessRP) { // The difference in excess VGPR pressure, after including VGPRs used for // SGPR spills - int VGPRDiff = ((OtherExcessVGPR + OtherExcessArchVGPR + OtherExcessAGPR) - - (ExcessVGPR + ExcessArchVGPR + ExcessAGPR)); + int VGPRDiff = + ((OtherExcess.VGPR + OtherExcess.ArchVGPR + OtherExcess.AGPR) - + (Excess.VGPR + Excess.ArchVGPR + Excess.AGPR)); - int SGPRDiff = OtherExcessSGPR - ExcessSGPR; + int SGPRDiff = OtherExcess.SGPR - Excess.SGPR; if (VGPRDiff != 0) return VGPRDiff > 0; @@ -282,11 +289,12 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, Register Reg = MO.getReg(); auto I = llvm::find_if(VRegMaskOrUnits, [Reg](const VRegMaskOrUnit &RM) { - return RM.RegUnit == Reg; + return RM.VRegOrUnit.asVirtualReg() == Reg; }); auto &P = I == VRegMaskOrUnits.end() - ? 
VRegMaskOrUnits.emplace_back(Reg, LaneBitmask::getNone()) + ? VRegMaskOrUnits.emplace_back(VirtRegOrUnit(Reg), + LaneBitmask::getNone()) : *I; P.LaneMask |= MO.getSubReg() ? TRI.getSubRegIndexLaneMask(MO.getSubReg()) @@ -295,7 +303,7 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, SlotIndex InstrSI; for (auto &P : VRegMaskOrUnits) { - auto &LI = LIS.getInterval(P.RegUnit); + auto &LI = LIS.getInterval(P.VRegOrUnit.asVirtualReg()); if (!LI.hasSubRanges()) continue; @@ -312,29 +320,22 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, /// Mostly copy/paste from CodeGen/RegisterPressure.cpp static LaneBitmask getLanesWithProperty( const LiveIntervals &LIS, const MachineRegisterInfo &MRI, - bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, - LaneBitmask SafeDefault, + bool TrackLaneMasks, Register Reg, SlotIndex Pos, function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) { - if (RegUnit.isVirtual()) { - const LiveInterval &LI = LIS.getInterval(RegUnit); - LaneBitmask Result; - if (TrackLaneMasks && LI.hasSubRanges()) { - for (const LiveInterval::SubRange &SR : LI.subranges()) { - if (Property(SR, Pos)) - Result |= SR.LaneMask; - } - } else if (Property(LI, Pos)) { - Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) - : LaneBitmask::getAll(); + assert(Reg.isVirtual()); + const LiveInterval &LI = LIS.getInterval(Reg); + LaneBitmask Result; + if (TrackLaneMasks && LI.hasSubRanges()) { + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (Property(SR, Pos)) + Result |= SR.LaneMask; } - - return Result; + } else if (Property(LI, Pos)) { + Result = + TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(Reg) : LaneBitmask::getAll(); } - const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); - if (LR == nullptr) - return SafeDefault; - return Property(*LR, Pos) ? 
LaneBitmask::getAll() : LaneBitmask::getNone(); + return Result; } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp @@ -412,15 +413,15 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg) const { const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); + RegExcess Excess(MF, RP, *this); + if (SRI->isSGPRClass(RC)) - return RP.getSGPRNum() > MaxSGPRs; - unsigned NumVGPRs = - SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - // The addressable limit must always be respected. - if (NumVGPRs > MaxVGPRs) - return true; - // For unified RFs, combined VGPR usage limit must be respected as well. - return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; + return Excess.SGPR; + + if (SRI->isAGPRClass(RC)) + return (UnifiedRF && Excess.VGPR) || Excess.AGPR; + + return (UnifiedRF && Excess.VGPR) || Excess.ArchVGPR; } bool GCNRPTarget::satisfied() const { @@ -431,6 +432,11 @@ bool GCNRPTarget::satisfied() const { return true; } +bool GCNRPTarget::hasVectorRegisterExcess() const { + RegExcess Excess(MF, RP, *this); + return Excess.hasVectorRegisterExcess(); +} + /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker @@ -502,10 +508,9 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp -LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, - SlotIndex Pos) const { +LaneBitmask GCNRPTracker::getLastUsedLanes(Register Reg, SlotIndex Pos) const { return getLanesWithProperty( - LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(), + LIS, *MRI, true, Reg, Pos.getBaseIndex(), [](const LiveRange &LR, SlotIndex Pos) { const LiveRange::Segment *S = LR.getSegmentContaining(Pos); return S != nullptr && S->end == Pos.getRegSlot(); @@ -562,10 +567,10 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { SmallVector<VRegMaskOrUnit, 8> RegUses; 
collectVirtualRegUses(RegUses, MI, LIS, *MRI); for (const VRegMaskOrUnit &U : RegUses) { - LaneBitmask &LiveMask = LiveRegs[U.RegUnit]; + LaneBitmask &LiveMask = LiveRegs[U.VRegOrUnit.asVirtualReg()]; LaneBitmask PrevMask = LiveMask; LiveMask |= U.LaneMask; - CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI); + CurPressure.inc(U.VRegOrUnit.asVirtualReg(), PrevMask, LiveMask, *MRI); } // Update MaxPressure with uses plus early-clobber defs pressure. @@ -580,7 +585,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { bool GCNDownwardRPTracker::reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy) { - MRI = &MI.getParent()->getParent()->getRegInfo(); + MRI = &MI.getMF()->getRegInfo(); LastTrackedMI = nullptr; MBBEnd = MI.getParent()->end(); NextMI = &MI; @@ -748,9 +753,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, GCNRegPressure TempPressure = CurPressure; for (const VRegMaskOrUnit &Use : RegOpers.Uses) { - Register Reg = Use.RegUnit; - if (!Reg.isVirtual()) + if (!Use.VRegOrUnit.isVirtualReg()) continue; + Register Reg = Use.VRegOrUnit.asVirtualReg(); LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx); if (LastUseMask.none()) continue; @@ -782,9 +787,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, // Generate liveness for defs. for (const VRegMaskOrUnit &Def : RegOpers.Defs) { - Register Reg = Def.RegUnit; - if (!Reg.isVirtual()) + if (!Def.VRegOrUnit.isVirtualReg()) continue; + Register Reg = Def.VRegOrUnit.asVirtualReg(); auto It = LiveRegs.find(Reg); LaneBitmask LiveMask = It != LiveRegs.end() ? 
It->second : LaneBitmask(0); LaneBitmask NewMask = LiveMask | Def.LaneMask; @@ -824,8 +829,7 @@ Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs, Register Reg = Register::index2VirtReg(I); auto It = LiveRegs.find(Reg); if (It != LiveRegs.end() && It->second.any()) - OS << ' ' << printVRegOrUnit(Reg, TRI) << ':' - << PrintLaneMask(It->second); + OS << ' ' << printReg(Reg, TRI) << ':' << PrintLaneMask(It->second); } OS << '\n'; }); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 979a8b0..c55796c 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/RegisterPressure.h" #include <algorithm> +#include <array> namespace llvm { @@ -45,7 +46,7 @@ struct GCNRegPressure { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; } - void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + void clear() { Value.fill(0); } unsigned getNumRegs(RegKind Kind) const { assert(Kind < TOTAL_KINDS); @@ -101,6 +102,29 @@ struct GCNRegPressure { DynamicVGPRBlockSize)); } + unsigned getVGPRSpills(MachineFunction &MF, unsigned ArchVGPRThreshold, + unsigned AGPRThreshold, unsigned CombinedThreshold) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasGFX90AInsts()) + return 0; + + unsigned ArchPressure = getArchVGPRNum(); + unsigned AGPRPressure = getAGPRNum(); + + unsigned ArchSpill = ArchPressure > ArchVGPRThreshold + ? (ArchPressure - ArchVGPRThreshold) + : 0; + unsigned AGPRSpill = + AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0; + + unsigned UnifiedPressure = getVGPRNum(/*UnifiedVGPRFile=*/true); + unsigned UnifiedSpill = UnifiedPressure > CombinedThreshold + ? 
(UnifiedPressure - CombinedThreshold) + : 0; + + return std::max(UnifiedSpill, ArchSpill + AGPRSpill); + } + void inc(unsigned Reg, LaneBitmask PrevMask, LaneBitmask NewMask, @@ -127,9 +151,7 @@ struct GCNRegPressure { bool less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; - bool operator==(const GCNRegPressure &O) const { - return std::equal(&Value[0], &Value[ValueArraySize], O.Value); - } + bool operator==(const GCNRegPressure &O) const { return Value == O.Value; } bool operator!=(const GCNRegPressure &O) const { return !(*this == O); @@ -160,7 +182,7 @@ private: /// Pressure for all register kinds (first all regular registers kinds, then /// all tuple register kinds). - unsigned Value[ValueArraySize]; + std::array<unsigned, ValueArraySize> Value; static unsigned getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI); @@ -235,6 +257,12 @@ public: /// Whether the current RP is at or below the defined pressure target. bool satisfied() const; + bool hasVectorRegisterExcess() const; + + unsigned getMaxSGPRs() const { return MaxSGPRs; } + unsigned getMaxVGPRs() const { + return UnifiedRF ? MaxUnifiedVGPRs : MaxVGPRs; + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) { @@ -293,7 +321,7 @@ protected: /// Mostly copy/paste from CodeGen/RegisterPressure.cpp void bumpDeadDefs(ArrayRef<VRegMaskOrUnit> DeadDefs); - LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; + LaneBitmask getLastUsedLanes(Register Reg, SlotIndex Pos) const; public: // reset tracker and set live register set to the specified value. 
@@ -456,7 +484,7 @@ template <typename Range> DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet> getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { std::vector<SlotIndex> Indexes; - Indexes.reserve(std::distance(R.begin(), R.end())); + Indexes.reserve(llvm::size(R)); auto &SII = *LIS.getSlotIndexes(); for (MachineInstr *I : R) { auto SI = SII.getInstructionIndex(*I); @@ -464,7 +492,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { } llvm::sort(Indexes); - auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo(); + auto &MRI = (*R.begin())->getMF()->getRegInfo(); DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap; SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { @@ -494,13 +522,13 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI, const LiveIntervals &LIS) { return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, - MI.getParent()->getParent()->getRegInfo()); + MI.getMF()->getRegInfo()); } inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS) { return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS, - MI.getParent()->getParent()->getRegInfo()); + MI.getMF()->getRegInfo()); } template <typename Range> diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 9fbf9e5..b044195 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -28,10 +28,19 @@ #include "GCNRegPressure.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include 
"llvm/CodeGen/MachineCycleAnalysis.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "machine-scheduler" @@ -90,6 +99,10 @@ static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler( cl::init(false)); #endif +static cl::opt<bool> DisableRewriteMFMAFormSchedStage( + "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, + cl::desc("Disable rewrie mfma rewrite scheduling stage"), cl::init(true)); + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -145,7 +158,6 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit); SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit); VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit); - LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit << ", VGPRExcessLimit = " << VGPRExcessLimit << ", SGPRCriticalLimit = " << SGPRCriticalLimit @@ -690,6 +702,8 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); + if (!DisableRewriteMFMAFormSchedStage) + SchedStages.push_back(GCNSchedStageID::RewriteMFMAForm); SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); SchedStages.push_back(GCNSchedStageID::PreRARematerialize); @@ -946,6 +960,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { switch (SchedStageID) { case GCNSchedStageID::OccInitialSchedule: return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this); + case 
GCNSchedStageID::RewriteMFMAForm: + return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this); case GCNSchedStageID::UnclusteredHighRPReschedule: return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this); case GCNSchedStageID::ClusteredLowOccupancyReschedule: @@ -970,6 +986,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { + if (Regions[RegionIdx].first == Regions[RegionIdx].second) + return llvm::getRegPressure(MRI, LiveIns[RegionIdx]); GCNDownwardRPTracker RPTracker(*LIS); RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, &LiveIns[RegionIdx]); @@ -978,10 +996,8 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd) { - auto REnd = RegionEnd == RegionBegin->getParent()->end() - ? std::prev(RegionEnd) - : RegionEnd; - return &*skipDebugInstructionsBackward(REnd, RegionBegin); + assert(RegionBegin != RegionEnd && "Region must not be empty"); + return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin); } void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, @@ -1076,9 +1092,12 @@ GCNScheduleDAGMILive::getRegionLiveOutMap() const { assert(!Regions.empty()); std::vector<MachineInstr *> RegionLastMIs; RegionLastMIs.reserve(Regions.size()); - for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) + for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) { + // Skip empty regions. + if (RegionBegin == RegionEnd) + continue; RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); - + } return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); } @@ -1088,10 +1107,12 @@ void RegionPressureMap::buildLiveRegMap() { RegionLiveRegMap = IsLiveOut ? 
DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); for (unsigned I = 0; I < DAG->Regions.size(); I++) { + auto &[RegionBegin, RegionEnd] = DAG->Regions[I]; + // Skip empty regions. + if (RegionBegin == RegionEnd) + continue; MachineInstr *RegionKey = - IsLiveOut - ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second) - : &*DAG->Regions[I].first; + IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin; IdxToInstruction[I] = RegionKey; } } @@ -1160,6 +1181,8 @@ void GCNScheduleDAGMILive::runSchedStages() { ScheduleDAGMILive::schedule(); Stage->finalizeGCNRegion(); + Stage->advanceRegion(); + exitRegion(); } Stage->finalizeGCNSchedStage(); @@ -1180,6 +1203,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { case GCNSchedStageID::OccInitialSchedule: OS << "Max Occupancy Initial Schedule"; break; + case GCNSchedStageID::RewriteMFMAForm: + OS << "Instruction Rewriting Reschedule"; + break; case GCNSchedStageID::UnclusteredHighRPReschedule: OS << "Unclustered High Register Pressure Reschedule"; break; @@ -1213,6 +1239,107 @@ bool GCNSchedStage::initGCNSchedStage() { return true; } +void RewriteMFMAFormStage::findReachingDefs( + MachineOperand &UseMO, LiveIntervals *LIS, + SmallVectorImpl<SlotIndex> &DefIdxs) { + MachineInstr *UseMI = UseMO.getParent(); + LiveInterval &UseLI = LIS->getInterval(UseMO.getReg()); + VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI)); + + // If the def is not a PHI, then it must be the only reaching def. 
+ if (!VNI->isPHIDef()) { + DefIdxs.push_back(VNI->def); + return; + } + + SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()}; + SmallVector<MachineBasicBlock *, 8> Worklist; + + // Mark the predecessor blocks for traversal + for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) { + Worklist.push_back(PredMBB); + Visited.insert(PredMBB); + } + + while (!Worklist.empty()) { + MachineBasicBlock *CurrMBB = Worklist.pop_back_val(); + + SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB); + VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot()); + + MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def); + + // If there is a def in this block, then add it to the list. This is the + // reaching def of this path. + if (!VNI->isPHIDef()) { + DefIdxs.push_back(VNI->def); + continue; + } + + for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) { + if (Visited.insert(PredMBB).second) + Worklist.push_back(PredMBB); + } + } +} + +void RewriteMFMAFormStage::findReachingUses( + MachineInstr *DefMI, LiveIntervals *LIS, + SmallVectorImpl<MachineOperand *> &ReachingUses) { + SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI); + for (MachineOperand &UseMO : + DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) { + SmallVector<SlotIndex, 8> ReachingDefIndexes; + findReachingDefs(UseMO, LIS, ReachingDefIndexes); + + // If we find a use that contains this DefMI in its reachingDefs, then it is + // a reaching use. + if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) { + return SlotIndex::isSameInstr(RDIdx, DefIdx); + })) + ReachingUses.push_back(&UseMO); + } +} + +bool RewriteMFMAFormStage::initGCNSchedStage() { + // We only need to run this pass if the architecture supports AGPRs. + // Additionally, we don't use AGPRs at occupancy levels above 1 so there + // is no need for this pass in that case, either. 
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1) + return false; + + RegionsWithExcessArchVGPR.resize(DAG.Regions.size()); + RegionsWithExcessArchVGPR.reset(); + for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { + GCNRegPressure PressureBefore = DAG.Pressure[Region]; + if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs()) + RegionsWithExcessArchVGPR[Region] = true; + } + + if (RegionsWithExcessArchVGPR.none()) + return false; + + TII = ST.getInstrInfo(); + SRI = ST.getRegisterInfo(); + + std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands; + DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse; + SmallPtrSet<MachineInstr *, 8> CopyForDef; + + if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef)) + return false; + + int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef); + + // If we haven't found the beneficial conditions, prefer the VGPR form which + // may result in less cross RC copies. + if (Cost > 0) + return false; + + return rewrite(RewriteCands); +} + bool UnclusteredHighRPStage::initGCNSchedStage() { if (DisableUnclusterHighRP) return false; @@ -1228,18 +1355,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; - // Aggressivly try to reduce register pressure in the unclustered high RP + // Aggressively try to reduce register pressure in the unclustered high RP // stage. Temporarily increase occupancy target in the region. + TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy + ? InitialOccupancy + 1 + : InitialOccupancy; + IsAnyRegionScheduled = false; S.SGPRLimitBias = S.HighRPSGPRBias; S.VGPRLimitBias = S.HighRPVGPRBias; - if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) - MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); LLVM_DEBUG( dbgs() << "Retrying function scheduling without clustering. 
" - "Aggressivly try to reduce register pressure to achieve occupancy " - << DAG.MinOccupancy << ".\n"); + "Aggressively try to reduce register pressure to achieve occupancy " + << TempTargetOccupancy << ".\n"); return true; } @@ -1267,33 +1396,222 @@ bool ClusteredLowOccStage::initGCNSchedStage() { #define REMAT_PREFIX "[PreRARemat] " #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +Printable PreRARematStage::ScoredRemat::print() const { + return Printable([&](raw_ostream &OS) { + OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')'; + }); +} +#endif + bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for // regions inbetween the defs and region we sinked the def to. Will need to be // fixed if there is another pass after this pass. assert(!S.hasNextStage()); - if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1) + if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1) return false; + // Maps all MIs (except lone terminators, which are not part of any region) to + // their parent region. Non-lone terminators are considered part of the region + // they delimitate. + DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount()); + // Before performing any IR modification record the parent region of each MI // and the parent MBB of each region. 
const unsigned NumRegions = DAG.Regions.size(); - RegionBB.reserve(NumRegions); for (unsigned I = 0; I < NumRegions; ++I) { RegionBoundaries Region = DAG.Regions[I]; for (auto MI = Region.first; MI != Region.second; ++MI) MIRegion.insert({&*MI, I}); - RegionBB.push_back(Region.first->getParent()); + MachineBasicBlock *ParentMBB = Region.first->getParent(); + if (Region.second != ParentMBB->end()) + MIRegion.insert({&*Region.second, I}); + RegionBB.push_back(ParentMBB); } - if (!canIncreaseOccupancyOrReduceSpill()) +#ifndef NDEBUG + auto PrintTargetRegions = [&]() -> void { + if (TargetRegions.none()) { + dbgs() << REMAT_PREFIX << "No target regions\n"; + return; + } + dbgs() << REMAT_PREFIX << "Target regions:\n"; + for (unsigned I : TargetRegions.set_bits()) + dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n'; + }; + auto PrintRematReg = [&](const RematReg &Remat) -> Printable { + return Printable([&, Remat](raw_ostream &OS) { + // Concatenate all region numbers in which the register is unused and + // live-through. + bool HasLiveThroughRegion = false; + OS << '[' << Remat.DefRegion << " -"; + for (unsigned I = 0; I < NumRegions; ++I) { + if (Remat.isUnusedLiveThrough(I)) { + if (HasLiveThroughRegion) { + OS << ','; + } else { + OS << "- "; + HasLiveThroughRegion = true; + } + OS << I; + } + } + if (HasLiveThroughRegion) + OS << " -"; + OS << "-> " << Remat.UseRegion << "] "; + Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false, + /*SkipDebugLoc=*/false, /*AddNewLine=*/false); + }); + }; +#endif + + // Set an objective for the stage based on current RP in each region. 
+ REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + }); + if (!setObjective()) { + LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU() << '\n'); return false; + } + LLVM_DEBUG({ + if (TargetOcc) { + dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n'; + } else { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ")\n"; + } + PrintTargetRegions(); + }); + + if (!collectRematRegs(MIRegion)) { + REMAT_DEBUG(dbgs() << "No rematerializable registers\n"); + return false; + } + const ScoredRemat::FreqInfo FreqInfo(MF, DAG); + REMAT_DEBUG({ + dbgs() << "Rematerializable registers:\n"; + for (const RematReg &Remat : RematRegs) + dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n'; + dbgs() << REMAT_PREFIX << "Region frequencies\n"; + for (auto [I, Freq] : enumerate(FreqInfo.Regions)) { + dbgs() << REMAT_PREFIX << " [" << I << "] "; + if (Freq) + dbgs() << Freq; + else + dbgs() << "unknown "; + dbgs() << " | " << *DAG.Regions[I].first; + } + }); + + SmallVector<ScoredRemat> ScoredRemats; + for (RematReg &Remat : RematRegs) + ScoredRemats.emplace_back(&Remat, FreqInfo, DAG); + +// Rematerialize registers in successive rounds until all RP targets are +// satisifed or until we run out of rematerialization candidates. +#ifndef NDEBUG + unsigned RoundNum = 0; +#endif + BitVector RecomputeRP(NumRegions); + do { + assert(!ScoredRemats.empty() && "no more remat candidates"); + + // (Re-)Score and (re-)sort all remats in increasing score order. 
+ for (ScoredRemat &Remat : ScoredRemats) + Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc); + sort(ScoredRemats); + + REMAT_DEBUG({ + dbgs() << "==== ROUND " << RoundNum++ << " ====\n" + << REMAT_PREFIX + << "Candidates with non-null score, in rematerialization order:\n"; + for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) { + if (RematDecision.hasNullScore()) + break; + dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | " + << *RematDecision.Remat->DefMI; + } + PrintTargetRegions(); + }); + + RecomputeRP.reset(); + unsigned RematIdx = ScoredRemats.size(); + + // Rematerialize registers in decreasing score order until we estimate + // that all RP targets are satisfied or until rematerialization candidates + // are no longer useful to decrease RP. + for (; RematIdx && TargetRegions.any(); --RematIdx) { + const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1]; + // Stop rematerializing on encountering a null score. Since scores + // monotonically decrease as we rematerialize, we know there is nothing + // useful left to do in such cases, even if we were to re-score. + if (Candidate.hasNullScore()) { + RematIdx = 0; + break; + } + + RematReg &Remat = *Candidate.Remat; + // When previous rematerializations in this round have already satisfied + // RP targets in all regions this rematerialization can impact, we have a + // good indication that our scores have diverged significantly from + // reality, in which case we interrupt this round and re-score. This also + // ensures that every rematerialization we perform is possibly impactful + // in at least one target region. + if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) + break; + + REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';); + // Every rematerialization we do here is likely to move the instruction + // into a higher frequency region, increasing the total sum latency of the + // instruction itself. 
This is acceptable if we are eliminating a spill in + // the process, but when the goal is increasing occupancy we get nothing + // out of rematerialization if occupancy is not increased in the end; in + // such cases we want to roll back the rematerialization. + RollbackInfo *Rollback = + TargetOcc ? &Rollbacks.emplace_back(&Remat) : nullptr; + rematerialize(Remat, RecomputeRP, Rollback); + unsetSatisifedRPTargets(Remat.Live); + } + + REMAT_DEBUG({ + if (!TargetRegions.any()) { + dbgs() << "** Interrupt round on all targets achieved\n"; + } else if (RematIdx) { + dbgs() << "** Interrupt round on stale score for " + << *ScoredRemats[RematIdx - 1].Remat->DefMI; + } else { + dbgs() << "** Stop on exhausted rematerialization candidates\n"; + } + }); + + // Peel off registers we already rematerialized from the vector's tail. + ScoredRemats.truncate(RematIdx); + } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) && + !ScoredRemats.empty()); + if (RescheduleRegions.none()) + return false; + + // Commit all pressure changes to the DAG and compute minimum achieved + // occupancy in impacted regions. + REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n"); + unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize(); + for (unsigned I : RescheduleRegions.set_bits()) { + DAG.Pressure[I] = RPTargets[I].getCurrentRP(); + REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy " + << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize) + << " (" << RPTargets[I] << ")\n"); + } + AchievedOcc = MFI.getMaxWavesPerEU(); + for (const GCNRegPressure &RP : DAG.Pressure) { + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); + } - // Rematerialize identified instructions and update scheduler's state. - rematerialize(); - if (GCNTrackers) - DAG.RegionLiveOuts.buildLiveRegMap(); REMAT_DEBUG({ dbgs() << "Retrying function scheduling with new min. 
occupancy of " << AchievedOcc << " from rematerializing (original was " @@ -1303,11 +1621,7 @@ bool PreRARematStage::initGCNSchedStage() { dbgs() << ")\n"; }); - if (AchievedOcc > DAG.MinOccupancy) { - DAG.MinOccupancy = AchievedOcc; - SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - MFI.increaseOccupancy(MF, DAG.MinOccupancy); - } + DAG.setTargetOccupancy(getStageTargetOccupancy()); return true; } @@ -1320,15 +1634,26 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); S.SGPRLimitBias = S.VGPRLimitBias = 0; if (DAG.MinOccupancy > InitialOccupancy) { + assert(IsAnyRegionScheduled); LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " << DAG.MinOccupancy << '\n'); + } else if (!IsAnyRegionScheduled) { + assert(DAG.MinOccupancy == InitialOccupancy); + LLVM_DEBUG(dbgs() << StageID + << ": No regions scheduled, min occupancy stays at " + << DAG.MinOccupancy << ", MFI occupancy stays at " + << MFI.getOccupancy() << ".\n"); } GCNSchedStage::finalizeGCNSchedStage(); } bool GCNSchedStage::initGCNRegion() { + // Skip empty scheduling region. + if (DAG.begin() == DAG.end()) + return false; + // Check whether this new region is also a new block. if (DAG.RegionBegin->getParent() != CurrentMBB) setupNewBlock(); @@ -1336,8 +1661,8 @@ bool GCNSchedStage::initGCNRegion() { unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); - // Skip empty scheduling regions (0 or 1 schedulable instructions). - if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end())) + // Skip regions with 1 schedulable instruction. + if (DAG.begin() == std::prev(DAG.end())) return false; LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); @@ -1396,13 +1721,25 @@ bool UnclusteredHighRPStage::initGCNRegion() { // rescheduling of previous regions did not make occupancy drop back down to // the initial minimum). 
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + // If no region has been scheduled yet, the DAG has not yet been updated with + // the occupancy target. So retrieve it from the temporary. + unsigned CurrentTargetOccupancy = + IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy; if (!DAG.RegionsWithExcessRP[RegionIdx] && - (DAG.MinOccupancy <= InitialOccupancy || + (CurrentTargetOccupancy <= InitialOccupancy || DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != InitialOccupancy)) return false; - return GCNSchedStage::initGCNRegion(); + bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion(); + // If this is the first region scheduled during this stage, make the target + // occupancy changes in the DAG and MFI. + if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { + IsAnyRegionScheduled = true; + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) + DAG.setTargetOccupancy(TempTargetOccupancy); + } + return IsSchedulingThisRegion; } bool ClusteredLowOccStage::initGCNRegion() { @@ -1447,9 +1784,23 @@ void GCNSchedStage::finalizeGCNRegion() { if (DAG.RegionsWithIGLPInstrs[RegionIdx] && StageID != GCNSchedStageID::UnclusteredHighRPReschedule) SavedMutations.swap(DAG.Mutations); +} - DAG.exitRegion(); - advanceRegion(); +void PreRARematStage::finalizeGCNRegion() { + GCNSchedStage::finalizeGCNRegion(); + // When the goal is to increase occupancy, all regions must reach the target + // occupancy for rematerializations to be possibly useful, otherwise we will + // just hurt latency for no benefit. If minimum occupancy drops below the + // target there is no point in trying to re-schedule further regions. 
+ if (!TargetOcc) + return; + RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore); + if (DAG.MinOccupancy < *TargetOcc) { + REMAT_DEBUG(dbgs() << "Region " << RegionIdx + << " cannot meet occupancy target, interrupting " + "re-scheduling in all regions\n"); + RescheduleRegions.reset(); + } } void GCNSchedStage::checkScheduling() { @@ -1518,10 +1869,12 @@ void GCNSchedStage::checkScheduling() { // Revert if this region's schedule would cause a drop in occupancy or // spilling. - if (shouldRevertScheduling(WavesAfter)) - revertScheduling(); - else + if (shouldRevertScheduling(WavesAfter)) { + modifyRegionSchedule(RegionIdx, DAG.BB, Unsched); + std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx]; + } else { DAG.Pressure[RegionIdx] = PressureAfter; + } } unsigned @@ -1723,8 +2076,9 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { } bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { - return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); + // When trying to increase occupancy (TargetOcc == true) the stage manages + // region reverts globally (all or none), so we always return false here. 
+ return !TargetOcc && mayCauseSpilling(WavesAfter); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1749,89 +2103,625 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { return false; } -void GCNSchedStage::revertScheduling() { - LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); - DAG.RegionEnd = DAG.RegionBegin; - int SkippedDebugInstr = 0; - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) { - ++SkippedDebugInstr; - continue; - } +void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx, + MachineBasicBlock *MBB, + ArrayRef<MachineInstr *> MIOrder) { + assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first, + DAG.Regions[RegionIdx].second)) == + MIOrder.size() && + "instruction number mismatch"); + if (MIOrder.empty()) + return; - if (MI->getIterator() != DAG.RegionEnd) { - DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI); - if (!MI->isDebugInstr()) + LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n'); + + // Reconstruct MI sequence by moving instructions in desired order before + // the current region's start. + MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first; + for (MachineInstr *MI : MIOrder) { + // Either move the next MI in order before the end of the region or move the + // region end past the MI if it is at the correct position. + MachineBasicBlock::iterator MII = MI->getIterator(); + if (MII != RegionEnd) { + // Will subsequent splice move MI up past a non-debug instruction? + bool NonDebugReordered = + !MI->isDebugInstr() && + skipDebugInstructionsForward(RegionEnd, MII) != MII; + MBB->splice(RegionEnd, MBB, MI); + // Only update LiveIntervals information if non-debug instructions are + // reordered. Otherwise debug instructions could cause code generation to + // change. 
+ if (NonDebugReordered) DAG.LIS->handleMove(*MI, true); + } else { + ++RegionEnd; + } + if (MI->isDebugInstr()) { + LLVM_DEBUG(dbgs() << "Scheduling " << *MI); + continue; } // Reset read-undef flags and update them later. - for (auto &Op : MI->all_defs()) + for (MachineOperand &Op : MI->all_defs()) Op.setIsUndef(false); RegisterOperands RegOpers; RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); - if (!MI->isDebugInstr()) { - if (DAG.ShouldTrackLaneMasks) { - // Adjust liveness and add missing dead+read-undef flags. - SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); - RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); - } else { - // Adjust for missing dead-def flags. - RegOpers.detectDeadDefs(*MI, *DAG.LIS); - } + if (DAG.ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. + SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *DAG.LIS); } - DAG.RegionEnd = MI->getIterator(); - ++DAG.RegionEnd; LLVM_DEBUG(dbgs() << "Scheduling " << *MI); } - // After reverting schedule, debug instrs will now be at the end of the block - // and RegionEnd will point to the first debug instr. Increment RegionEnd - // pass debug instrs to the actual end of the scheduling region. - while (SkippedDebugInstr-- > 0) - ++DAG.RegionEnd; + // The region end doesn't change throughout scheduling since it itself is + // outside the region (whether that is a MBB end or a terminator MI). 
+ assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch"); + DAG.Regions[RegionIdx].first = MIOrder.front(); +} + +bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const { - // If Unsched.front() instruction is a debug instruction, this will actually - // shrink the region since we moved all debug instructions to the end of the - // block. Find the first instruction that is not a debug instruction. - DAG.RegionBegin = Unsched.front()->getIterator(); - if (DAG.RegionBegin->isDebugInstr()) { - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) + if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI)) + return false; + return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1; +} + +bool RewriteMFMAFormStage::initHeuristics( + std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, + DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, + SmallPtrSetImpl<MachineInstr *> &CopyForDef) { + bool Changed = false; + + // Prepare for the heuristics + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!isRewriteCandidate(&MI)) continue; - DAG.RegionBegin = MI->getIterator(); - break; + + int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()); + assert(ReplacementOp != -1); + + RewriteCands.push_back({&MI, MI.getOpcode()}); + MI.setDesc(TII->get(ReplacementOp)); + + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2->isReg()) { + SmallVector<SlotIndex, 8> Src2ReachingDefs; + findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs); + + // For any definition of the src2 register which is non-MFMA, we + // insert a copy. 
+ for (SlotIndex RDIdx : Src2ReachingDefs) { + MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx); + if (!TII->isMAI(*RD)) + CopyForDef.insert(RD); + } + } + + MachineOperand &Dst = MI.getOperand(0); + SmallVector<MachineOperand *, 8> DstReachingUses; + + findReachingUses(&MI, DAG.LIS, DstReachingUses); + + for (MachineOperand *RUOp : DstReachingUses) { + if (TII->isMAI(*RUOp->getParent())) + continue; + + // For any user of the result of the MFMA which is not an MFMA, we + // insert a copy. For a given register, we will only insert one copy + // per user block. + CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg()); + + SmallVector<SlotIndex, 8> DstUsesReachingDefs; + findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs); + + for (SlotIndex RDIndex : DstUsesReachingDefs) { + MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); + if (TII->isMAI(*RD)) + continue; + + // For any definition of the user of the MFMA which is not an MFMA, + // we insert a copy. We do this to transform all the reaching defs + // of this use to AGPR. By doing this, we can insert a copy from + // AGPR to VGPR at the user rather than after the MFMA. + CopyForDef.insert(RD); + } + } + + // Do the rewrite to allow for updated RP calculation. + const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg()); + const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC); + DAG.MRI.setRegClass(Dst.getReg(), AGPRRC); + if (Src2->isReg()) + DAG.MRI.setRegClass(Src2->getReg(), AGPRRC); + Changed = true; } } - // Then move the debug instructions back into their correct place and set - // RegionBegin and RegionEnd if needed. 
- DAG.placeDebugValues(); + return Changed; +} - DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); +int64_t RewriteMFMAFormStage::getRewriteCost( + const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, + const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, + const SmallPtrSetImpl<MachineInstr *> &CopyForDef) { + MachineBlockFrequencyInfo *MBFI = DAG.MBFI; + + int64_t BestSpillCost = 0; + int64_t Cost = 0; + uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency(); + + std::pair<unsigned, unsigned> MaxVectorRegs = + ST.getMaxNumVectorRegs(MF.getFunction()); + unsigned ArchVGPRThreshold = MaxVectorRegs.first; + unsigned AGPRThreshold = MaxVectorRegs.second; + unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF); + + for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { + if (!RegionsWithExcessArchVGPR[Region]) + continue; + + GCNRegPressure &PressureBefore = DAG.Pressure[Region]; + unsigned SpillCostBefore = PressureBefore.getVGPRSpills( + MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold); + + // For the cases we care about (i.e. ArchVGPR usage is greater than the + // addressable limit), rewriting alone should bring pressure to manageable + // level. If we find any such region, then the rewrite is potentially + // beneficial. + GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region); + unsigned SpillCostAfter = PressureAfter.getVGPRSpills( + MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold); + + uint64_t BlockFreq = + MBFI->getBlockFreq(DAG.Regions[Region].first->getParent()) + .getFrequency(); + + bool RelativeFreqIsDenom = EntryFreq > BlockFreq; + uint64_t RelativeFreq = EntryFreq && BlockFreq + ? (RelativeFreqIsDenom ? 
EntryFreq / BlockFreq + : BlockFreq / EntryFreq) + : 1; + + // This assumes perfect spilling / splitting -- using one spill / copy + // instruction and one restoreFrom / copy for each excess register, + int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2; + + // Also account for the block frequency. + if (RelativeFreqIsDenom) + SpillCost /= (int64_t)RelativeFreq; + else + SpillCost *= (int64_t)RelativeFreq; + + // If we have increased spilling in any block, just bail. + if (SpillCost > 0) + return SpillCost; + + if (SpillCost < BestSpillCost) + BestSpillCost = SpillCost; + } + + // Set the cost to the largest decrease in spill cost in order to not double + // count spill reductions. + Cost = BestSpillCost; + assert(Cost <= 0); + + unsigned CopyCost = 0; + + // For each CopyForDef, increase the cost by the register size while + // accounting for block frequency. + for (MachineInstr *DefMI : CopyForDef) { + Register DefReg = DefMI->getOperand(0).getReg(); + uint64_t DefFreq = + EntryFreq + ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq + : 1; + + const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg); + CopyCost += RC->getCopyCost() * DefFreq; + } + + // Account for CopyForUse copies in each block that the register is used. + for (auto &[UseBlock, UseRegs] : CopyForUse) { + uint64_t UseFreq = + EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1; + + for (Register UseReg : UseRegs) { + const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg); + CopyCost += RC->getCopyCost() * UseFreq; + } + } + + // Reset the classes that were changed to AGPR for better RB analysis. + // We must do rewriting after copy-insertion, as some defs of the register + // may require VGPR. Additionally, if we bail out and don't perform the + // rewrite then these need to be restored anyway. 
+ for (auto &[MI, OriginalOpcode] : RewriteCands) { + assert(TII->isMAI(*MI)); + const TargetRegisterClass *AGPRRC = + DAG.MRI.getRegClass(MI->getOperand(0).getReg()); + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC); + + MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2); + assert(Src2); + + if (Src2->isReg()) + DAG.MRI.setRegClass(Src2->getReg(), VGPRRC); + DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC); + MI->setDesc(TII->get(OriginalOpcode)); + } + + return Cost + CopyCost; } -bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { +bool RewriteMFMAFormStage::rewrite( + const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) { + DenseMap<MachineInstr *, unsigned> FirstMIToRegion; + DenseMap<MachineInstr *, unsigned> LastMIToRegion; + + for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { + RegionBoundaries Entry = DAG.Regions[Region]; + if (Entry.first == Entry.second) + continue; + + FirstMIToRegion[&*Entry.first] = Region; + if (Entry.second != Entry.first->getParent()->end()) + LastMIToRegion[&*Entry.second] = Region; + } + + // Rewrite the MFMAs to AGPR, and insert any copies as needed. + // The general assumption of the algorithm (and the previous cost calculation) + // is that it is better to insert the copies in the MBB of the def of the src2 + // operands, and in the MBB of the user of the dest operands. This is based on + // the assumption that the MFMAs are likely to appear in loop bodies, while + // the src2 and dest operands are live-in / live-out of the loop. Due to this + // design, the algorithm for finding copy insertion points is more + // complicated. + // + // There are three main cases to handle: 1. the reaching defs of the src2 + // operands, 2. the reaching uses of the dst operands, and 3. the reaching + // defs of the reaching uses of the dst operand. + // + // In the first case, we simply insert copies after each of the reaching + // definitions. 
In the second case, we collect all the uses of a given dest + // and organize them by MBB. Then, we insert 1 copy for each MBB before the + // earliest use. Since the use may have multiple reaching defs, and since we + // want to replace the register it is using with the result of the copy, we + // must handle case 3. In the third case, we simply insert a copy after each + // of the reaching defs to connect to the copy of the reaching uses of the dst + // reg. This allows us to avoid inserting copies next to the MFMAs. + // + // While inserting the copies, we maintain a map of operands which will use + // different regs (i.e. the result of the copies). For example, a case 1 src2 + // operand will use the register result of the copies after the reaching defs, + // as opposed to the original register. Now that we have completed our copy + // analysis and placement, we can bulk update the registers. We do this + // separately as to avoid complicating the reachingDef and reachingUse + // queries. + // + // While inserting the copies, we also maintain a list or registers which we + // will want to reclassify as AGPR. After doing the copy insertion and the + // register replacement, we can finally do the reclassification. This uses the + // redef map, as the registers we are interested in reclassifying may be + // replaced by the result of a copy. We must do this after the copy analysis + // and placement as we must have an accurate redef map -- otherwise we may end + // up creating illegal instructions. + + // The original registers of the MFMA that need to be reclassified as AGPR. + DenseSet<Register> RewriteRegs; + // The map of an original register in the MFMA to a new register (result of a + // copy) that it should be replaced with. + DenseMap<Register, Register> RedefMap; + // The map of the original MFMA registers to the relevant MFMA operands. 
+ DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap; + // The map of reaching defs for a given register -- to avoid duplicate copies. + DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap; + // The map of reaching uses for a given register by basic block -- to avoid + // duplicate copies and to calculate per MBB insert pts. + DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>> + ReachingUseTracker; + + for (auto &[MI, OriginalOpcode] : RewriteCands) { + int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()); + if (ReplacementOp == -1) + continue; + MI->setDesc(TII->get(ReplacementOp)); + + // Case 1: insert copies for the reaching defs of the Src2Reg. + MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2); + if (Src2->isReg()) { + Register Src2Reg = Src2->getReg(); + if (!Src2Reg.isVirtual()) + return false; + + Register MappedReg = Src2->getReg(); + SmallVector<SlotIndex, 8> Src2ReachingDefs; + findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs); + SmallSetVector<MachineInstr *, 8> Src2DefsReplace; + + for (SlotIndex RDIndex : Src2ReachingDefs) { + MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); + if (TII->isMAI(*RD)) + continue; + + // If there is a non mai reaching def, then we need a copy. + Src2DefsReplace.insert(RD); + } + + if (!Src2DefsReplace.empty()) { + DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg); + if (RI != RedefMap.end()) { + MappedReg = RI->second; + } else { + assert(!ReachingDefCopyMap.contains(Src2Reg)); + const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg); + const TargetRegisterClass *VGPRRC = + SRI->getEquivalentVGPRClass(Src2RC); + + // Track the mapping of the original register to the new register. + MappedReg = DAG.MRI.createVirtualRegister(VGPRRC); + RedefMap[Src2Reg] = MappedReg; + } + + // If none exists, create a copy from this reaching def. + // We may have inserted a copy already in an earlier iteration. 
+ for (MachineInstr *RD : Src2DefsReplace) { + // Do not create redundant copies. + if (ReachingDefCopyMap[Src2Reg].insert(RD).second) { + MachineInstrBuilder VGPRCopy = + BuildMI(*RD->getParent(), std::next(RD->getIterator()), + RD->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addDef(MappedReg, {}, 0) + .addUse(Src2Reg, {}, 0); + DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); + + // If this reaching def was the last MI in the region, update the + // region boundaries. + if (LastMIToRegion.contains(RD)) { + unsigned UpdateRegion = LastMIToRegion[RD]; + DAG.Regions[UpdateRegion].second = VGPRCopy; + LastMIToRegion.erase(RD); + } + } + } + } + + // Track the register for reclassification + RewriteRegs.insert(Src2Reg); + + // Always insert the operand for replacement. If this corresponds with a + // chain of tied-def we may not see the VGPR requirement until later. + ReplaceMap[Src2Reg].insert(Src2); + } + + // Case 2 and Case 3: insert copies before the reaching uses of the dsts, + // and after the reaching defs of the reaching uses of the dsts. + + MachineOperand *Dst = &MI->getOperand(0); + Register DstReg = Dst->getReg(); + if (!DstReg.isVirtual()) + return false; + + Register MappedReg = DstReg; + SmallVector<MachineOperand *, 8> DstReachingUses; + + SmallVector<MachineOperand *, 8> DstReachingUseCopies; + SmallVector<MachineInstr *, 8> DstUseDefsReplace; + + findReachingUses(MI, DAG.LIS, DstReachingUses); + + for (MachineOperand *RUOp : DstReachingUses) { + if (TII->isMAI(*RUOp->getParent())) + continue; + + // If there is a non mai reaching use, then we need a copy. 
+ if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end()) + DstReachingUseCopies.push_back(RUOp); + SmallVector<SlotIndex, 8> DstUsesReachingDefs; + findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs); + + for (SlotIndex RDIndex : DstUsesReachingDefs) { + MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); + if (TII->isMAI(*RD)) + continue; + + // If there is a non mai reaching def of this reaching use, then we will + // need a copy. + if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end()) + DstUseDefsReplace.push_back(RD); + } + } + + if (!DstUseDefsReplace.empty()) { + DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg); + if (RI != RedefMap.end()) { + MappedReg = RI->second; + } else { + assert(!ReachingDefCopyMap.contains(DstReg)); + const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg); + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); + + // Track the mapping of the original register to the new register. + MappedReg = DAG.MRI.createVirtualRegister(VGPRRC); + RedefMap[DstReg] = MappedReg; + } + + // If none exists, create a copy from this reaching def. + // We may have inserted a copy already in an earlier iteration. + for (MachineInstr *RD : DstUseDefsReplace) { + // Do not create redundant copies. + if (ReachingDefCopyMap[DstReg].insert(RD).second) { + MachineInstrBuilder VGPRCopy = + BuildMI(*RD->getParent(), std::next(RD->getIterator()), + RD->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addDef(MappedReg, {}, 0) + .addUse(DstReg, {}, 0); + DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); + + // If this reaching def was the last MI in the region, update the + // region boundaries. 
+ DenseMap<MachineInstr *, unsigned>::iterator LMI = + LastMIToRegion.find(RD); + if (LMI != LastMIToRegion.end()) { + unsigned UpdateRegion = LMI->second; + DAG.Regions[UpdateRegion].second = VGPRCopy; + LastMIToRegion.erase(RD); + } + } + } + + DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg]; + for (MachineOperand *RU : DstReachingUseCopies) { + MachineBasicBlock *RUBlock = RU->getParent()->getParent(); + // Just keep track of the reaching use of this register by block. After we + // have scanned all the MFMAs we can find optimal insert pts. + if (RUBlock != MI->getParent()) { + ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU); + continue; + } + + // Special case, the use is in the same block as the MFMA. Insert the copy + // just before the use. + const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg); + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); + Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC); + MachineInstr *UseInst = RU->getParent(); + MachineInstrBuilder VGPRCopy = + BuildMI(*UseInst->getParent(), UseInst->getIterator(), + UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addDef(NewUseReg, {}, 0) + .addUse(DstReg, {}, 0); + DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); + // Since we know this use has only one reaching def, we can replace the + // use reg. + RU->setReg(NewUseReg); + // Track the copy source operand for replacement. + DstRegSet.insert(&VGPRCopy->getOperand(1)); + } + + // Track the register for reclassification + RewriteRegs.insert(DstReg); + + // Insert the dst operand for replacement. If this dst is in a chain of + // tied-def MFMAs, and the first src2 needs to be replaced with a new reg, + // all the corresponding operands need to be replaced. + DstRegSet.insert(Dst); + } + + // Handle the copies for dst uses. 
+ using RUBType = + std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>; + for (RUBType RUBlockEntry : ReachingUseTracker) { + using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>; + for (RUDType RUDst : RUBlockEntry.second) { + MachineOperand *OpBegin = *RUDst.second.begin(); + SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent()); + + // Find the earliest use in this block. + for (MachineOperand *User : RUDst.second) { + SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent()); + if (SlotIndex::isEarlierInstr(NewInstPt, InstPt)) + InstPt = NewInstPt; + } + + const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first); + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); + Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC); + MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt); + + MachineInstrBuilder VGPRCopy = + BuildMI(*UseInst->getParent(), UseInst->getIterator(), + UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addDef(NewUseReg, {}, 0) + .addUse(RUDst.first, {}, 0); + DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); + + // If this UseInst was the first MI in the region, update the region + // boundaries. + DenseMap<MachineInstr *, unsigned>::iterator FI = + FirstMIToRegion.find(UseInst); + if (FI != FirstMIToRegion.end()) { + unsigned UpdateRegion = FI->second; + DAG.Regions[UpdateRegion].first = VGPRCopy; + FirstMIToRegion.erase(UseInst); + } + + // Replace the operand for all users. + for (MachineOperand *User : RUDst.second) { + User->setReg(NewUseReg); + } + + // Track the copy source operand for replacement. + ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1)); + } + } + + // We may have needed to insert copies after the reaching defs of the MFMAs. + // Replace the original register with the result of the copy for all relevant + // operands. 
+ for (std::pair<Register, Register> NewDef : RedefMap) { + Register OldReg = NewDef.first; + Register NewReg = NewDef.second; + + // Replace the register for any associated operand in the MFMA chain. + for (MachineOperand *ReplaceOp : ReplaceMap[OldReg]) + ReplaceOp->setReg(NewReg); + } + + // Finally, do the reclassification of the MFMA registers. + for (Register RewriteReg : RewriteRegs) { + Register RegToRewrite = RewriteReg; + + // Be sure to update the replacement register and not the original. + DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg); + if (RI != RedefMap.end()) + RegToRewrite = RI->second; + + const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite); + const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC); + + DAG.MRI.setRegClass(RegToRewrite, AGPRRC); + } + + // Bulk update the LIS. + DAG.LIS->reanalyze(DAG.MF); + // Liveins may have been modified for cross RC copies + RegionPressureMap LiveInUpdater(&DAG, false); + LiveInUpdater.buildLiveRegMap(); + + for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) + DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region); + + DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx); + + return true; +} + +unsigned PreRARematStage::getStageTargetOccupancy() const { + return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU(); +} + +bool PreRARematStage::setObjective() { const Function &F = MF.getFunction(); - // Maps optimizable regions (i.e., regions at minimum and register-limited - // occupancy, or regions with spilling) to the target RP we would like to - // reach. - DenseMap<unsigned, GCNRPTarget> OptRegions; + // Set up "spilling targets" for all regions. 
unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); - auto ResetTargetRegions = [&]() { - OptRegions.clear(); - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - const GCNRegPressure &RP = DAG.Pressure[I]; - GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - }; + bool HasVectorRegisterExcess = false; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + TargetRegions.set(I); + HasVectorRegisterExcess |= Target.hasVectorRegisterExcess(); + } - ResetTargetRegions(); - if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { // In addition to register usage being above addressable limits, occupancy // below the minimum is considered like "spilling" as well. TargetOcc = std::nullopt; @@ -1839,94 +2729,68 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { // There is no spilling and room to improve occupancy; set up "increased // occupancy targets" for all regions. 
TargetOcc = DAG.MinOccupancy + 1; - unsigned VGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize(); MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); - ResetTargetRegions(); - } - REMAT_DEBUG({ - dbgs() << "Analyzing "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << ": "; - if (OptRegions.empty()) { - dbgs() << "no objective to achieve, occupancy is maximal at " - << MFI.getMaxWavesPerEU(); - } else if (!TargetOcc) { - dbgs() << "reduce spilling (minimum target occupancy is " - << MFI.getMinWavesPerEU() << ')'; - } else { - dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " - << TargetOcc; - } - dbgs() << '\n'; - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { - dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() - << '\n'; - } + for (auto [I, Target] : enumerate(RPTargets)) { + Target.setTarget(MaxSGPRs, MaxVGPRs); + if (!Target.satisfied()) + TargetRegions.set(I); } - }); - if (OptRegions.empty()) - return false; + } - // Accounts for a reduction in RP in an optimizable region. Returns whether we - // estimate that we have identified enough rematerialization opportunities to - // achieve our goal, and sets Progress to true when this particular reduction - // in pressure was helpful toward that goal. 
- auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
- bool &Progress) -> bool {
- GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg))
- return false;
- Progress = true;
- Target.saveReg(Reg, Mask, DAG.MRI);
- if (Target.satisfied())
- OptRegions.erase(OptIt->getFirst());
- return OptRegions.empty();
- };
+ return TargetRegions.any();
+}
+bool PreRARematStage::collectRematRegs(
+ const DenseMap<MachineInstr *, unsigned> &MIRegion) {
// We need up-to-date live-out info. to query live-out register masks in
// regions containing rematerializable instructions.
DAG.RegionLiveOuts.buildLiveRegMap();
- // Cache set of registers that are going to be rematerialized.
- DenseSet<unsigned> RematRegs;
+ // Set of registers already marked for potential rematerialization; used to
+ // avoid rematerialization chains.
+ SmallSet<Register, 4> MarkedRegs;
+ auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
+ return MO.isReg() && MarkedRegs.contains(MO.getReg());
+ };
// Identify rematerializable instructions in the function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- auto Region = DAG.Regions[I];
- for (auto MI = Region.first; MI != Region.second; ++MI) {
+ RegionBoundaries Bounds = DAG.Regions[I];
+ for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
// The instruction must be rematerializable.
MachineInstr &DefMI = *MI;
if (!isReMaterializable(DefMI))
continue;
- // We only support rematerializing virtual registers with one definition.
+ // We only support rematerializing virtual registers with one
+ // definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
- // non-debug user in a different region. The using MI may not belong to a
- // region if it is a lone region terminator.
+ // non-debug user in a different region.
+ // FIXME: Allow rematerializations with multiple uses. This should be + // relatively easy to support using the current cost model. MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg); if (!UseMI) continue; auto UseRegion = MIRegion.find(UseMI); - if (UseRegion != MIRegion.end() && UseRegion->second == I) + if (UseRegion == MIRegion.end() || UseRegion->second == I) continue; // Do not rematerialize an instruction if it uses or is used by an // instruction that we have designated for rematerialization. // FIXME: Allow for rematerialization chains: this requires 1. updating - // remat points to account for uses that are rematerialized, and 2. either - // rematerializing the candidates in careful ordering, or deferring the - // MBB RP walk until the entire chain has been rematerialized. - if (Rematerializations.contains(UseMI) || - llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) { - return MO.isReg() && RematRegs.contains(MO.getReg()); - })) + // remat points to account for uses that are rematerialized, and 2. + // either rematerializing the candidates in careful ordering, or + // deferring the MBB RP walk until the entire chain has been + // rematerialized. 
+ const MachineOperand &UseMO = UseMI->getOperand(0); + if (IsMarkedForRemat(UseMO) || + llvm::any_of(DefMI.operands(), IsMarkedForRemat)) continue; // Do not rematerialize an instruction it it uses registers that aren't @@ -1937,188 +2801,257 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { *DAG.TII)) continue; - REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); - RematInstruction &Remat = - Rematerializations.try_emplace(&DefMI, UseMI).first->second; - - bool RematUseful = false; - if (auto It = OptRegions.find(I); It != OptRegions.end()) { - // Optimistically consider that moving the instruction out of its - // defining region will reduce RP in the latter; this assumes that - // maximum RP in the region is reached somewhere between the defining - // instruction and the end of the region. - REMAT_DEBUG(dbgs() << " Defining region is optimizable\n"); - LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg]; - if (ReduceRPInRegion(It, Reg, Mask, RematUseful)) - return true; - } - - for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) { - // We are only collecting regions in which the register is a live-in - // (and may be live-through). - auto It = DAG.LiveIns[LIRegion].find(Reg); - if (It == DAG.LiveIns[LIRegion].end() || It->second.none()) - continue; - Remat.LiveInRegions.insert(LIRegion); - - // Account for the reduction in RP due to the rematerialization in an - // optimizable region in which the defined register is a live-in. This - // is exact for live-through region but optimistic in the using region, - // where RP is actually reduced only if maximum RP is reached somewhere - // between the beginning of the region and the rematerializable - // instruction's use. 
- if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) { - REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n'); - if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg], - RematUseful)) - return true; - } - } - - // If the instruction is not a live-in or live-out in any optimizable - // region then there is no point in rematerializing it. - if (!RematUseful) { - Rematerializations.pop_back(); - REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n"); - } else { - RematRegs.insert(Reg); - } + // Add the instruction to the rematerializable list. + MarkedRegs.insert(Reg); + RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion); } } - if (TargetOcc) { - // We were trying to increase occupancy but failed, abort the stage. - REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); - Rematerializations.clear(); - return false; + return !RematRegs.empty(); +} + +PreRARematStage::RematReg::RematReg( + MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG, + const DenseMap<MachineInstr *, unsigned> &MIRegion) + : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()), + LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()), + DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) { + + // Mark regions in which the rematerializable register is live. 
+ Register Reg = getReg(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + auto LiveInIt = DAG.LiveIns[I].find(Reg); + if (LiveInIt != DAG.LiveIns[I].end()) + LiveIn.set(I); + const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I); + if (auto LiveOutIt = LiveOuts.find(Reg); LiveOutIt != LiveOuts.end()) + LiveOut.set(I); + } + Live |= LiveIn; + Live |= LiveOut; + Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(Reg); +} + +bool PreRARematStage::RematReg::maybeBeneficial( + const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const { + Register Reg = getReg(); + for (unsigned I : TargetRegions.set_bits()) { + if (Live[I] && RPTargets[I].isSaveBeneficial(Reg)) + return true; } - REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n"); - return !Rematerializations.empty(); + return false; } -void PreRARematStage::rematerialize() { - const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); +void PreRARematStage::RematReg::insertMI(unsigned RegionIdx, + MachineInstr *RematMI, + GCNScheduleDAGMILive &DAG) const { + RegionBoundaries &Bounds = DAG.Regions[RegionIdx]; + if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI))) + Bounds.first = RematMI; + DAG.LIS->InsertMachineInstrInMaps(*RematMI); + DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg()); +} + +PreRARematStage::ScoredRemat::FreqInfo::FreqInfo( + MachineFunction &MF, const GCNScheduleDAGMILive &DAG) { + assert(DAG.MLI && "MLI not defined in DAG"); + MachineBranchProbabilityInfo MBPI; + MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI); - // Collect regions whose RP changes in unpredictable way; we will have to - // fully recompute their RP after all rematerailizations. - DenseSet<unsigned> RecomputeRP; - - // Rematerialize all instructions. 
- for (auto &[DefMI, Remat] : Rematerializations) { - MachineBasicBlock::iterator InsertPos(Remat.UseMI); - Register Reg = DefMI->getOperand(0).getReg(); - unsigned DefRegion = MIRegion.at(DefMI); - - // Rematerialize DefMI to its use block. - TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); - Remat.RematMI = &*std::prev(InsertPos); - DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); - - // Update region boundaries in regions we sinked from (remove defining MI) - // and to (insert MI rematerialized in use block). Only then we can erase - // the original MI. - DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr); - auto UseRegion = MIRegion.find(Remat.UseMI); - if (UseRegion != MIRegion.end()) { - DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos, - Remat.RematMI); + const unsigned NumRegions = DAG.Regions.size(); + MinFreq = MBFI.getEntryFreq().getFrequency(); + MaxFreq = 0; + Regions.reserve(NumRegions); + for (unsigned I = 0; I < NumRegions; ++I) { + MachineBasicBlock *MBB = DAG.Regions[I].first->getParent(); + uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency(); + Regions.push_back(BlockFreq); + if (BlockFreq && BlockFreq < MinFreq) + MinFreq = BlockFreq; + else if (BlockFreq > MaxFreq) + MaxFreq = BlockFreq; + } + if (!MinFreq) + return; + + // Scale everything down if frequencies are high. 
+ if (MinFreq >= ScaleFactor * ScaleFactor) { + for (uint64_t &Freq : Regions) + Freq /= ScaleFactor; + MinFreq /= ScaleFactor; + MaxFreq /= ScaleFactor; + } +} + +PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq, + const GCNScheduleDAGMILive &DAG) + : Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {} + +unsigned PreRARematStage::ScoredRemat::getNumRegs( + const GCNScheduleDAGMILive &DAG) const { + const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Remat->getReg()); + unsigned RegSize = DAG.TRI->getRegSizeInBits(RC); + if (unsigned SubIdx = Remat->DefMI->getOperand(0).getSubReg()) { + // The following may return -1 (i.e., a large unsigned number) on indices + // that may be used to access subregisters of multiple sizes; in such cases + // fallback on the size derived from the register class. + unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(SubIdx); + if (SubRegSize < RegSize) + RegSize = SubRegSize; + } + return divideCeil(RegSize, 32); +} + +int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const { + // Get frequencies of defining and using regions. A rematerialization from the + // least frequent region to the most frequent region will yield the greatest + // latency penalty and therefore should get minimum score. Reciprocally, a + // rematerialization in the other direction should get maximum score. Default + // to values that will yield the worst possible score given known frequencies + // in order to penalize rematerializations from or into regions whose + // frequency is unknown. 
+ int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq); + int64_t UseOrMax = Freq.Regions[Remat->UseRegion]; + if (!UseOrMax) + UseOrMax = Freq.MaxFreq; + return DefOrMin - UseOrMax; +} + +void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions, + ArrayRef<GCNRPTarget> RPTargets, + const FreqInfo &FreqInfo, + bool ReduceSpill) { + MaxFreq = 0; + RegionImpact = 0; + for (unsigned I : TargetRegions.set_bits()) { + if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Remat->getReg())) + continue; + bool UnusedLT = Remat->isUnusedLiveThrough(I); + + // Regions in which RP is guaranteed to decrease have more weight. + RegionImpact += UnusedLT ? 2 : 1; + + if (ReduceSpill) { + uint64_t Freq = FreqInfo.Regions[I]; + if (!UnusedLT) { + // Apply a frequency penalty in regions in which we are not sure that RP + // will decrease. + Freq /= 2; + } + MaxFreq = std::max(MaxFreq, Freq); } - DAG.LIS->RemoveMachineInstrFromMaps(*DefMI); - DefMI->eraseFromParent(); + } + RegionImpact *= NumRegs; +} - // Collect all regions impacted by the rematerialization and update their - // live-in/RP information. - for (unsigned I : Remat.LiveInRegions) { - ImpactedRegions.insert({I, DAG.Pressure[I]}); - GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I]; +void PreRARematStage::rematerialize(const RematReg &Remat, + BitVector &RecomputeRP, + RollbackInfo *Rollback) { + const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); + MachineInstr &DefMI = *Remat.DefMI; + Register Reg = DefMI.getOperand(0).getReg(); + Register NewReg = DAG.MRI.cloneVirtualRegister(Reg); + + // Rematerialize the register in the region where it is used. 
+ MachineBasicBlock::iterator InsertPos = Remat.UseMI; + TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI); + MachineInstr *RematMI = &*std::prev(InsertPos); + Remat.UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI); + Remat.insertMI(Remat.UseRegion, RematMI, DAG); + if (Rollback) { + Rollback->RematMI = RematMI; + // Make the original MI a debug value so that it does not influence + // scheduling and replace all read registers with a sentinel register to + // prevent operands to appear in use-lists of other MIs during LIS + // updates. Store mappings between operand indices and original registers + // for potential rollback. + DefMI.setDesc(TII->get(TargetOpcode::DBG_VALUE)); + for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) { + if (MO.isReg() && MO.readsReg()) { + Rollback->RegMap.insert({Idx, MO.getReg()}); + MO.setReg(Register()); + } + } + } else { + // Just delete the original instruction if it cannot be rolled back. + DAG.deleteMI(Remat.DefRegion, &DefMI); + } #ifdef EXPENSIVE_CHECKS - // All uses are known to be available / live at the remat point. Thus, the - // uses should already be live in to the region. - for (MachineOperand &MO : DefMI->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) - continue; + // All uses are known to be available / live at the remat point. Thus, + // the uses should already be live in to the using region. 
+ for (MachineOperand &MO : DefMI.operands()) { + if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) + continue; - Register UseReg = MO.getReg(); - if (!UseReg.isVirtual()) - continue; + Register UseReg = MO.getReg(); + if (!UseReg.isVirtual()) + continue; - LiveInterval &LI = DAG.LIS->getInterval(UseReg); - LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); - if (LI.hasSubRanges() && MO.getSubReg()) - LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); - - LaneBitmask LiveInMask = RegionLiveIns.at(UseReg); - LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); - // If this register has lanes not covered by the LiveIns, be sure they - // do not map to any subrange. ref: - // machine-scheduler-sink-trivial-remats.mir::omitted_subrange - if (UncoveredLanes.any()) { - assert(LI.hasSubRanges()); - for (LiveInterval::SubRange &SR : LI.subranges()) - assert((SR.LaneMask & UncoveredLanes).none()); - } - } + LiveInterval &LI = DAG.LIS->getInterval(UseReg); + LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); + if (LI.hasSubRanges() && MO.getSubReg()) + LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); + + LaneBitmask LiveInMask = DAG.LiveIns[Remat.UseRegion].at(UseReg); + LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); + // If this register has lanes not covered by the LiveIns, be sure they + // do not map to any subrange. ref: + // machine-scheduler-sink-trivial-remats.mir::omitted_subrange + if (UncoveredLanes.any()) { + assert(LI.hasSubRanges()); + for (LiveInterval::SubRange &SR : LI.subranges()) + assert((SR.LaneMask & UncoveredLanes).none()); + } + } #endif - // The register is no longer a live-in in all regions but the one that - // contains the single use. In live-through regions, maximum register - // pressure decreases predictably so we can directly update it. In the - // using region, maximum RP may or may not decrease, so we will mark it - // for re-computation after all materializations have taken place. 
- LaneBitmask PrevMask = RegionLiveIns[Reg];
- RegionLiveIns.erase(Reg);
- RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
- if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
- DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
- else
- RecomputeRP.insert(I);
+ // Remove the register from all regions where it is a live-in or live-out
+ // and adjust RP targets. The save is guaranteed in regions in which the
+ // register is live-through and unused but optimistic in all other regions
+ // where the register is live.
+ for (unsigned I : Remat.Live.set_bits()) {
+ RPTargets[I].saveReg(Reg, Remat.Mask, DAG.MRI);
+ DAG.LiveIns[I].erase(Reg);
+ DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
+ if (!Remat.isUnusedLiveThrough(I))
+ RecomputeRP.set(I);
+ }
+
+ RescheduleRegions |= Remat.Live;
+}
+
+void PreRARematStage::commitRematerializations() const {
+ REMAT_DEBUG(dbgs() << "Committing all rematerializations\n");
+ for (const RollbackInfo &Rollback : Rollbacks)
+ DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI);
+}
+
+void PreRARematStage::unsetSatisifedRPTargets(const BitVector &Regions) {
+ for (unsigned I : Regions.set_bits()) {
+ if (TargetRegions[I] && RPTargets[I].satisfied()) {
+ REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
+ TargetRegions.reset(I);
}
- // RP in the region from which the instruction was rematerialized may or may
- // not decrease.
- ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
- RecomputeRP.insert(DefRegion);
-
- // Recompute live interval to reflect the register's rematerialization.
- Register RematReg = Remat.RematMI->getOperand(0).getReg();
- DAG.LIS->removeInterval(RematReg);
- DAG.LIS->createAndComputeVirtRegInterval(RematReg);
- }
-
- // All regions impacted by at least one rematerialization must be rescheduled.
- // Maximum pressure must also be recomputed for all regions where it changed - // non-predictably and checked against the target occupancy. - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - AchievedOcc = MFI.getMaxWavesPerEU(); - for (auto &[I, OriginalRP] : ImpactedRegions) { - bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; - RescheduleRegions[I] = !IsEmptyRegion; - if (!RecomputeRP.contains(I)) - continue; + } +} - GCNRegPressure RP; - if (IsEmptyRegion) { - RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]); - } else { - GCNDownwardRPTracker RPT(*DAG.LIS); - auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first, - DAG.Regions[I].second); - if (NonDbgMI == DAG.Regions[I].second) { - // Region is non-empty but contains only debug instructions. - RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]); - } else { - RPT.reset(*NonDbgMI, &DAG.LiveIns[I]); - RPT.advance(DAG.Regions[I].second); - RP = RPT.moveMaxPressure(); - } +bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) { + bool TooOptimistic = false; + for (unsigned I : Regions.set_bits()) { + GCNRPTarget &Target = RPTargets[I]; + Target.setRP(DAG.getRealRegPressure(I)); + + // Since we were optimistic in assessing RP decreases in these regions, we + // may need to remark the target as a target region if RP didn't decrease + // as expected. 
+ if (!TargetRegions[I] && !Target.satisfied()) { + REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n"); + TooOptimistic = true; + TargetRegions.set(I); } - DAG.Pressure[I] = RP; - AchievedOcc = - std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } - REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); + return TooOptimistic; } // Copied from MachineLICM @@ -2141,80 +3074,116 @@ bool PreRARematStage::isReMaterializable(const MachineInstr &MI) { void PreRARematStage::finalizeGCNSchedStage() { // We consider that reducing spilling is always beneficial so we never - // rollback rematerializations in such cases. It's also possible that - // rescheduling lowers occupancy over the one achieved just through remats, in - // which case we do not want to rollback either (the rescheduling was already - // reverted in PreRARematStage::shouldRevertScheduling in such cases). - unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!TargetOcc || MaxOcc >= *TargetOcc) + // rollback rematerializations or revert scheduling in such cases. + if (!TargetOcc) return; - REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); - const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); + // When increasing occupancy, it is possible that re-scheduling is not able to + // achieve the target occupancy in all regions, in which case re-scheduling in + // all regions should be reverted. + if (DAG.MinOccupancy >= *TargetOcc) { + commitRematerializations(); + return; + } - // Rollback the rematerializations. - for (const auto &[DefMI, Remat] : Rematerializations) { - MachineInstr &RematMI = *Remat.RematMI; - unsigned DefRegion = MIRegion.at(DefMI); - MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); - MachineBasicBlock *MBB = RegionBB[DefRegion]; - Register Reg = RematMI.getOperand(0).getReg(); - - // Re-rematerialize MI at the end of its original region. 
Note that it may - // not be rematerialized exactly in the same position as originally within - // the region, but it should not matter much. - TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, - *DAG.TRI); - MachineInstr *NewMI = &*std::prev(InsertPos); - DAG.LIS->InsertMachineInstrInMaps(*NewMI); - - auto UseRegion = MIRegion.find(Remat.UseMI); - if (UseRegion != MIRegion.end()) { - DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI, - nullptr); + // It is possible that re-scheduling lowers occupancy over the one achieved + // just through rematerializations, in which case we revert re-scheduling in + // all regions but do not roll back rematerializations. + const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc; + + // When we both need to revert re-scheduling and rollback rematerializations, + // restore rematerialized MIs' original state before reverting so that they + // are treated as non-debug instructions by the revert logic. + if (ShouldRollbackRemats) { + for (const RollbackInfo &Rollback : Rollbacks) { + const auto &[Remat, RematMI, RegMap] = Rollback; + Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode())); + for (const auto &[MOIdx, Reg] : RegMap) + Remat->DefMI->getOperand(MOIdx).setReg(Reg); } - DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI); + } - // Erase rematerialized MI. - DAG.LIS->RemoveMachineInstrFromMaps(RematMI); - RematMI.eraseFromParent(); + // Revert re-scheduling in all affected regions. 
+ for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) { + REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx + << '\n'); + DAG.Pressure[RegionIdx] = MaxPressure; + modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder); + } - // Recompute live interval for the re-rematerialized register + if (!ShouldRollbackRemats) { + commitRematerializations(); + DAG.setTargetOccupancy(AchievedOcc); + return; + } + + // Reset the target occupancy to what it was pre-rematerialization. + DAG.setTargetOccupancy(*TargetOcc - 1); + + // Finish rolling back rematerializations, then recompute pressure in all + // affected regions. + REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n"); + BitVector RecomputeRP(DAG.Regions.size()); + DenseSet<Register> RecomputeLI; + for (const RollbackInfo &Rollback : Rollbacks) { + const auto &[Remat, RematMI, RegMap] = Rollback; + + // Switch back to using the original register and delete the + // rematerialization. + Register Reg = RematMI->getOperand(0).getReg(); + Register OriginalReg = Remat->DefMI->getOperand(0).getReg(); + Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI); + REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion + << "] Deleting rematerialization " << *RematMI); + DAG.deleteMI(Remat->UseRegion, RematMI); + + // Re-add the defined register as a live-in/live-out in all regions it used + // to be one in. + std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask); + for (unsigned I : Remat->LiveIn.set_bits()) + DAG.LiveIns[I].insert(LiveReg); + for (unsigned I : Remat->LiveOut.set_bits()) + DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg); + + RecomputeRP |= Rollback.Remat->Live; + // Regenerate intervals for all register operands of rematerialized MIs as + // slot indices may have changed slightly from before re-scheduling. 
+ for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) { + if (MO.isReg() && MO.getReg().isVirtual()) + RecomputeLI.insert(MO.getReg()); + } + } + for (Register Reg : RecomputeLI) { DAG.LIS->removeInterval(Reg); DAG.LIS->createAndComputeVirtRegInterval(Reg); - - // Re-add the register as a live-in in all regions it used to be one in. - for (unsigned LIRegion : Remat.LiveInRegions) - DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})}); } - - // Reset RP in all impacted regions. - for (auto &[I, OriginalRP] : ImpactedRegions) - DAG.Pressure[I] = OriginalRP; +#ifdef EXPENSIVE_CHECKS + // In particular, we want to check for coherent MI/slot order in regions in + // which reverts and/or rollbacks may have happened. + MF.verify(); +#endif + for (unsigned I : RecomputeRP.set_bits()) + DAG.Pressure[I] = DAG.getRealRegPressure(I); GCNSchedStage::finalizeGCNSchedStage(); } -void GCNScheduleDAGMILive::updateRegionBoundaries( - RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, - MachineInstr *NewMI) { - assert((!NewMI || NewMI != RegionBounds.second) && - "cannot remove at region end"); - - if (RegionBounds.first == RegionBounds.second) { - assert(NewMI && "cannot remove from an empty region"); - RegionBounds.first = NewMI; - return; - } +void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) { + // It's not possible for the deleted instruction to be upper region boundary + // since we don't delete region terminators. + if (Regions[RegionIdx].first == MI) + Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI)); + LIS->removeInterval(MI->getOperand(0).getReg()); + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); +} - // We only care for modifications at the beginning of a non-empty region since - // the upper region boundary is exclusive. 
- if (MI != RegionBounds.first) - return; - if (!NewMI) - RegionBounds.first = std::next(MI); // Removal +void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) { + MinOccupancy = TargetOccupancy; + if (MFI.getOccupancy() < TargetOccupancy) + MFI.increaseOccupancy(MF, MinOccupancy); else - RegionBounds.first = NewMI; // Insertion + MFI.limitOccupancy(MinOccupancy); } static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 975781f..6b6a403 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -16,6 +16,9 @@ #include "GCNRegPressure.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -28,11 +31,12 @@ class GCNSchedStage; enum class GCNSchedStageID : unsigned { OccInitialSchedule = 0, - UnclusteredHighRPReschedule = 1, - ClusteredLowOccupancyReschedule = 2, - PreRARematerialize = 3, - ILPInitialSchedule = 4, - MemoryClauseInitialSchedule = 5 + RewriteMFMAForm = 1, + UnclusteredHighRPReschedule = 2, + ClusteredLowOccupancyReschedule = 3, + PreRARematerialize = 4, + ILPInitialSchedule = 5, + MemoryClauseInitialSchedule = 6 }; #ifndef NDEBUG @@ -183,7 +187,7 @@ class ScheduleMetrics { unsigned BubbleCycles; public: - ScheduleMetrics() {} + ScheduleMetrics() = default; ScheduleMetrics(unsigned L, unsigned BC) : ScheduleLength(L), BubbleCycles(BC) {} unsigned getLength() const { return ScheduleLength; } @@ -198,8 +202,7 @@ public: }; inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) { - dbgs() << "\n Schedule Metric (scaled by " - << ScheduleMetrics::ScaleFactor + dbgs() << "\n Schedule Metric (scaled by " << 
ScheduleMetrics::ScaleFactor << " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/" << Sm.getLength() << " ]\n"; return OS; @@ -217,7 +220,7 @@ class RegionPressureMap { bool IsLiveOut; public: - RegionPressureMap() {} + RegionPressureMap() = default; RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) : DAG(GCNDAG), IsLiveOut(LiveOut) {} // Build the Instr->LiveReg and RegionIdx->Instr maps @@ -239,6 +242,7 @@ using RegionBoundaries = class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class GCNSchedStage; friend class OccInitialScheduleStage; + friend class RewriteMFMAFormStage; friend class UnclusteredHighRPStage; friend class ClusteredLowOccStage; friend class PreRARematStage; @@ -300,18 +304,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Compute and cache live-ins and pressure for all regions in block. void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB); - /// If necessary, updates a region's boundaries following insertion ( \p NewMI - /// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region. - /// For an MI removal, this must be called before the MI is actually erased - /// from its parent MBB. - void updateRegionBoundaries(RegionBoundaries &RegionBounds, - MachineBasicBlock::iterator MI, - MachineInstr *NewMI); + /// Makes the scheduler try to achieve an occupancy of \p TargetOccupancy. + void setTargetOccupancy(unsigned TargetOccupancy); void runSchedStages(); std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID); + void deleteMI(unsigned RegionIdx, MachineInstr *MI); + public: GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S); @@ -367,12 +368,12 @@ public: // be skipped. virtual bool initGCNRegion(); + // Finalize state after scheduling a region. + virtual void finalizeGCNRegion(); + // Track whether a new region is also a new MBB. 
void setupNewBlock(); - // Finalize state after scheudling a region. - void finalizeGCNRegion(); - // Check result of scheduling. void checkScheduling(); @@ -397,8 +398,12 @@ public: // Returns true if the new schedule may result in more spilling. bool mayCauseSpilling(unsigned WavesAfter); - // Attempt to revert scheduling for this region. - void revertScheduling(); + /// Sets the schedule of region \p RegionIdx in block \p MBB to \p MIOrder. + /// The MIs in \p MIOrder must be exactly the same as the ones currently + /// existing inside the region, only in a different order that honors def-use + /// chains. + void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB, + ArrayRef<MachineInstr *> MIOrder); void advanceRegion() { RegionIdx++; } @@ -413,10 +418,67 @@ public: : GCNSchedStage(StageID, DAG) {} }; +class RewriteMFMAFormStage : public GCNSchedStage { +private: + // Record regions with excess archvgpr register pressure over the physical + // register limit. Register pressure in these regions usually will result in + // spilling. + BitVector RegionsWithExcessArchVGPR; + + const SIInstrInfo *TII; + const SIRegisterInfo *SRI; + + /// Do a speculative rewrite and collect copy locations. The speculative + /// rewrite allows us to calculate the RP of the code after the rewrite, and + /// the copy locations allow us to calculate the total cost of copies required + /// for the rewrite. Stores the rewritten instructions in \p RewriteCands , + /// the copy locations for uses (of the MFMA result) in \p CopyForUse and the + /// copy locations for defs (of the MFMA operands) in \p CopyForDef + bool + initHeuristics(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, + DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, + SmallPtrSetImpl<MachineInstr *> &CopyForDef); + + /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done + /// in initHeuristics. 
Uses \p CopyForUse and \p CopyForDef to calculate copy + /// costs, and \p RewriteCands to undo rewriting. + int64_t getRewriteCost( + const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, + const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, + const SmallPtrSetImpl<MachineInstr *> &CopyForDef); + + /// Do the final rewrite on \p RewriteCands and insert any needed copies. + bool + rewrite(const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands); + + /// \returns true if this MI is a rewrite candidate. + bool isRewriteCandidate(MachineInstr *MI) const; + + /// Finds all the reaching defs of \p UseMO and stores the SlotIndexes into \p + /// DefIdxs + void findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS, + SmallVectorImpl<SlotIndex> &DefIdxs); + + /// Finds all the reaching uses of \p DefMI and stores the use operands in \p + /// ReachingUses + void findReachingUses(MachineInstr *DefMI, LiveIntervals *LIS, + SmallVectorImpl<MachineOperand *> &ReachingUses); + +public: + bool initGCNSchedStage() override; + + RewriteMFMAFormStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + class UnclusteredHighRPStage : public GCNSchedStage { private: // Save the initial occupancy before starting this stage. unsigned InitialOccupancy; + // Save the temporary target occupancy before starting this stage. + unsigned TempTargetOccupancy; + // Track whether any region was scheduled by this stage. + bool IsAnyRegionScheduled; public: bool initGCNSchedStage() override; @@ -447,65 +509,242 @@ public: }; /// Attempts to reduce function spilling or, if there is no spilling, to -/// increase function occupancy by one with respect to ArchVGPR usage by sinking -/// rematerializable instructions to their use. 
When the stage -/// estimates reducing spilling or increasing occupancy is possible, as few -/// instructions as possible are rematerialized to reduce potential negative +/// increase function occupancy by one with respect to register usage by sinking +/// rematerializable instructions to their use. When the stage estimates that +/// reducing spilling or increasing occupancy is possible, it tries to +/// rematerialize as few registers as possible to reduce potential negative /// effects on function latency. +/// +/// The stage only supports rematerializing registers that meet all of the +/// following constraints. +/// 1. The register is virtual and has a single defining instruction. +/// 2. The single defining instruction is either deemed rematerializable by the +/// target-independent logic, or if not, has no non-constant and +/// non-ignorable physical register use. +/// 3 The register has no virtual register use whose live range would be +/// extended by the rematerialization. +/// 4. The register has a single non-debug user in a different region from its +/// defining region. +/// 5. The register is not used by or using another register that is going to be +/// rematerialized. class PreRARematStage : public GCNSchedStage { private: - /// Useful information about a rematerializable instruction. - struct RematInstruction { - /// Single use of the rematerializable instruction's defined register, - /// located in a different block. + /// A rematerializable register. + struct RematReg { + /// Single MI defining the rematerializable register. + MachineInstr *DefMI; + /// Single user of the rematerializable register. MachineInstr *UseMI; - /// Rematerialized version of \p DefMI, set in - /// PreRARematStage::rematerialize. Used for reverting rematerializations. - MachineInstr *RematMI; - /// Set of regions in which the rematerializable instruction's defined - /// register is a live-in. 
- SmallDenseSet<unsigned, 4> LiveInRegions; + /// Regions in which the register is live-in/live-out/live anywhere. + BitVector LiveIn, LiveOut, Live; + /// The rematerializable register's lane bitmask. + LaneBitmask Mask; + /// Defining and using regions. + unsigned DefRegion, UseRegion; + + RematReg(MachineInstr *DefMI, MachineInstr *UseMI, + GCNScheduleDAGMILive &DAG, + const DenseMap<MachineInstr *, unsigned> &MIRegion); + + /// Returns the rematerializable register. Do not call after deleting the + /// original defining instruction. + Register getReg() const { return DefMI->getOperand(0).getReg(); } + + /// Determines whether this rematerialization may be beneficial in at least + /// one target region. + bool maybeBeneficial(const BitVector &TargetRegions, + ArrayRef<GCNRPTarget> RPTargets) const; + + /// Determines if the register is both unused and live-through in region \p + /// I. This guarantees that rematerializing it will reduce RP in the region. + bool isUnusedLiveThrough(unsigned I) const { + assert(I < Live.size() && "region index out of range"); + return LiveIn[I] && LiveOut[I] && I != UseRegion; + } + + /// Updates internal structures following a MI rematerialization. Part of + /// the stage instead of the DAG because it makes assumptions that are + /// specific to the rematerialization process. + void insertMI(unsigned RegionIdx, MachineInstr *RematMI, + GCNScheduleDAGMILive &DAG) const; + }; + + /// A scored rematerialization candidate. Higher scores indicate more + /// beneficial rematerializations. A null score indicate the rematerialization + /// is not helpful to reduce RP in target regions. + struct ScoredRemat { + /// The rematerializable register under consideration. + RematReg *Remat; + + /// Execution frequency information required by scoring heuristics. + /// Frequencies are scaled down if they are high to avoid overflow/underflow + /// when combining them. + struct FreqInfo { + /// Per-region execution frequencies. 0 when unknown. 
+ SmallVector<uint64_t> Regions; + /// Minimum and maximum observed frequencies. + uint64_t MinFreq, MaxFreq; + + FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG); + + private: + static const uint64_t ScaleFactor = 1024; + }; + + /// This only initializes state-independent characteristics of \p Remat, not + /// the actual score. + ScoredRemat(RematReg *Remat, const FreqInfo &Freq, + const GCNScheduleDAGMILive &DAG); + + /// Updates the rematerialization's score w.r.t. the current \p RPTargets. + /// \p RegionFreq indicates the frequency of each region + void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets, + const FreqInfo &Freq, bool ReduceSpill); + + /// Returns whether the current score is null, indicating the + /// rematerialization is useless. + bool hasNullScore() const { return !RegionImpact; } + + /// Compare score components of non-null scores pair-wise. A null score is + /// always strictly lesser than another non-null score. + bool operator<(const ScoredRemat &O) const { + if (hasNullScore()) + return !O.hasNullScore(); + if (O.hasNullScore()) + return false; + if (MaxFreq != O.MaxFreq) + return MaxFreq < O.MaxFreq; + if (FreqDiff != O.FreqDiff) + return FreqDiff < O.FreqDiff; + if (RegionImpact != O.RegionImpact) + return RegionImpact < O.RegionImpact; + // Break ties using pointer to rematerializable register. Rematerializable + // registers are collected in instruction order so, within the same + // region, this will prefer registers defined earlier that have longer + // live ranges in their defining region (since the registers we consider + // are always live-out in their defining region). + return Remat > O.Remat; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + Printable print() const; +#endif + + private: + /// Number of 32-bit registers this rematerialization covers. 
+ unsigned NumRegs; + + // The three members below are the scoring components, top to bottom from + // most important to least important when comparing candidates. - RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {} + /// Frequency of impacted target region with highest known frequency. This + /// only matters when the stage is trying to reduce spilling, so it is + /// always 0 when it is not. + uint64_t MaxFreq; + /// Frequency difference between defining and using regions. Negative values + /// indicate we are rematerializing to higher frequency regions; positive + /// values indicate the contrary. + int64_t FreqDiff; + /// Expected number of target regions impacted by the rematerialization, + /// scaled by the size of the register being rematerialized. + unsigned RegionImpact; + + unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const; + + int64_t getFreqDiff(const FreqInfo &Freq) const; }; - /// Maps all MIs to their parent region. MI terminators are considered to be - /// outside the region they delimitate, and as such are not stored in the map. - DenseMap<MachineInstr *, unsigned> MIRegion; /// Parent MBB to each region, in region order. SmallVector<MachineBasicBlock *> RegionBB; - /// Collects instructions to rematerialize. - MapVector<MachineInstr *, RematInstruction> Rematerializations; - /// Collects regions whose live-ins or register pressure will change due to - /// rematerializations. - DenseMap<unsigned, GCNRegPressure> ImpactedRegions; - /// In case we need to rollback rematerializations, save lane masks for all - /// rematerialized registers in all regions in which they are live-ins. - DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks; - /// After successful stage initialization, indicates which regions should be - /// rescheduled. - BitVector RescheduleRegions; - /// The target occupancy the stage is trying to achieve. Empty when the + /// Register pressure targets for all regions. 
+ SmallVector<GCNRPTarget> RPTargets; + /// Regions which are above the stage's RP target. + BitVector TargetRegions; + /// The target occupancy the set is trying to achieve. Empty when the /// objective is spilling reduction. std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). - /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; + /// After successful stage initialization, indicates which regions should be + /// rescheduled. + BitVector RescheduleRegions; - /// Returns whether remat can reduce spilling or increase function occupancy - /// by 1 through rematerialization. If it can do one, collects instructions in - /// PreRARematStage::Rematerializations and sets the target occupancy in - /// PreRARematStage::TargetOccupancy. - bool canIncreaseOccupancyOrReduceSpill(); + /// List of rematerializable registers. + SmallVector<RematReg> RematRegs; + + /// Holds enough information to rollback a rematerialization decision post + /// re-scheduling. + struct RollbackInfo { + /// The rematerializable register under consideration. + const RematReg *Remat; + /// The rematerialized MI replacing the original defining MI. + MachineInstr *RematMI; + /// Maps register machine operand indices to their original register. + SmallDenseMap<unsigned, Register, 4> RegMap; + + RollbackInfo(const RematReg *Remat) : Remat(Remat) {} + }; + /// List of rematerializations to rollback if rematerialization does not end + /// up being beneficial. + SmallVector<RollbackInfo> Rollbacks; + + /// State of a region pre-re-scheduling but post-rematerializations that we + /// must keep to be able to revert re-scheduling effects. + struct RegionSchedRevert { + /// Region number; + unsigned RegionIdx; + /// Original instruction order (both debug and non-debug MIs). + std::vector<MachineInstr *> OrigMIOrder; + /// Maximum pressure recorded in the region. 
+ GCNRegPressure MaxPressure; + + RegionSchedRevert(unsigned RegionIdx, ArrayRef<MachineInstr *> OrigMIOrder, + const GCNRegPressure &MaxPressure) + : RegionIdx(RegionIdx), OrigMIOrder(OrigMIOrder), + MaxPressure(MaxPressure) {} + }; + /// After re-scheduling, contains pre-re-scheduling data for all re-scheduled + /// regions. + SmallVector<RegionSchedRevert> RegionReverts; + + /// Returns the occupancy the stage is trying to achieve. + unsigned getStageTargetOccupancy() const; + + /// Determines the stage's objective (increasing occupancy or reducing + /// spilling, set in \ref TargetOcc). Defines \ref RPTargets in all regions to + /// achieve that objective and mark those that don't achieve it in \ref + /// TargetRegions. Returns whether there is any target region. + bool setObjective(); + + /// Unsets target regions in \p Regions whose RP target has been reached. + void unsetSatisifedRPTargets(const BitVector &Regions); + + /// Fully recomputes RP from the DAG in \p Regions. Among those regions, sets + /// again all \ref TargetRegions that were optimistically marked as satisfied + /// but are actually not, and returns whether there were any such regions. + bool updateAndVerifyRPTargets(const BitVector &Regions); + + /// Collects all rematerializable registers and appends them to \ref + /// RematRegs. \p MIRegion maps MIs to their region. Returns whether any + /// rematerializable register was found. + bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion); + + /// Rematerializes \p Remat. This removes the rematerialized register from + /// live-in/out lists in the DAG and updates RP targets in all affected + /// regions, which are also marked in \ref RescheduleRegions. Regions in which + /// RP savings are not guaranteed are set in \p RecomputeRP. When \p Rollback + /// is non-null, fills it with required information to be able to rollback the + /// rematerialization post-rescheduling. 
+ void rematerialize(const RematReg &Remat, BitVector &RecomputeRP, + RollbackInfo *Rollback); + + /// Deletes all rematerialized MIs from the MIR when they were kept around for + /// potential rollback. + void commitRematerializations() const; /// Whether the MI is rematerializable bool isReMaterializable(const MachineInstr &MI); - /// Rematerializes all instructions in PreRARematStage::Rematerializations - /// and stores the achieved occupancy after remat in - /// PreRARematStage::AchievedOcc. - void rematerialize(); - /// If remat alone did not increase occupancy to the target one, rollbacks all /// rematerializations and resets live-ins/RP in all regions impacted by the /// stage to their pre-stage values. @@ -516,10 +755,17 @@ public: bool initGCNRegion() override; + void finalizeGCNRegion() override; + bool shouldRevertScheduling(unsigned WavesAfter) override; PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) - : GCNSchedStage(StageID, DAG), RescheduleRegions(DAG.Regions.size()) {} + : GCNSchedStage(StageID, DAG), TargetRegions(DAG.Regions.size()), + RescheduleRegions(DAG.Regions.size()) { + const unsigned NumRegions = DAG.Regions.size(); + RPTargets.reserve(NumRegions); + RegionBB.reserve(NumRegions); + } }; class ILPInitialScheduleStage : public GCNSchedStage { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index f291e37..da63628 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -119,15 +119,15 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // that do not support ADDR64 variants of MUBUF instructions. 
Such targets // cannot use a 64 bit offset with a MUBUF instruction to access the global // address space - if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = true; + if (!hasAddr64() && !FS.contains("flat-for-global") && !UseFlatForGlobal) { + ToggleFeature(AMDGPU::FeatureUseFlatForGlobal); + UseFlatForGlobal = true; } // Unless +-flat-for-global is specified, use MUBUF instructions for global // address space access if flat operations are not available. - if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = false; + if (!hasFlat() && !FS.contains("flat-for-global") && UseFlatForGlobal) { + ToggleFeature(AMDGPU::FeatureUseFlatForGlobal); + UseFlatForGlobal = false; } // Set defaults if needed. @@ -169,7 +169,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, : // clang-format off AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), - TargetTriple(TT), TargetID(*this), InstrItins(getInstrItineraryForCPU(GPU)), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), @@ -645,6 +644,8 @@ void GCNSubtarget::adjustSchedDependency( MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); unsigned Lat = 0; for (++I; I != E && I->isBundledWithPred(); ++I) { + if (I->isMetaInstruction()) + continue; if (I->modifiesRegister(Reg, TRI)) Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); else if (Lat) @@ -658,6 +659,8 @@ void GCNSubtarget::adjustSchedDependency( MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { + if (I->isMetaInstruction()) + continue; if (I->readsRegister(Reg, TRI)) break; --Lat; @@ -699,7 +702,7 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, 
KernargSegmentPtr = true; bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) + if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled()) PrivateSegmentBuffer = true; else if (ST.isMesaGfxShader(F)) ImplicitBufferPtr = true; @@ -717,13 +720,13 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, } if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && - (IsAmdHsaOrMesa || ST.enableFlatScratch()) && - // FlatScratchInit cannot be true for graphics CC if enableFlatScratch() - // is false. - (ST.enableFlatScratch() || + (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) && + // FlatScratchInit cannot be true for graphics CC if + // hasFlatScratchEnabled() is false. + (ST.hasFlatScratchEnabled() || (!AMDGPU::isGraphics(CC) && !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) && - !ST.flatScratchIsArchitected()) { + !ST.hasArchitectedFlatScratch()) { FlatScratchInit = true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index c2e6078..b308e0d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -60,238 +60,25 @@ private: protected: // Basic subtarget description. - Triple TargetTriple; AMDGPU::IsaInfo::AMDGPUTargetID TargetID; unsigned Gen = INVALID; InstrItineraryData InstrItins; int LDSBankCount = 0; unsigned MaxPrivateElementSize = 0; - // Possibly statically set by tablegen, but may want to be overridden. - bool FastDenormalF32 = false; - bool HalfRate64Ops = false; - bool FullRate64Ops = false; - // Dynamically set bits that enable features. - bool FlatForGlobal = false; - bool AutoWaitcntBeforeBarrier = false; - bool BackOffBarrier = false; - bool UnalignedScratchAccess = false; - bool UnalignedAccessMode = false; - bool RelaxedBufferOOBMode = false; - bool HasApertureRegs = false; - bool SupportsXNACK = false; - bool KernargPreload = false; - - // This should not be used directly. 
'TargetID' tracks the dynamic settings - // for XNACK. - bool EnableXNACK = false; - - bool EnableTgSplit = false; - bool EnableCuMode = false; - bool TrapHandler = false; - bool EnablePreciseMemory = false; - - // Used as options. - bool EnableLoadStoreOpt = false; - bool EnableUnsafeDSOffsetFolding = false; - bool EnableSIScheduler = false; - bool EnableDS128 = false; - bool EnablePRTStrictNull = false; - bool DumpCode = false; - bool AssemblerPermissiveWavesize = false; - - // Subtarget statically properties set by tablegen - bool FP64 = false; - bool FMA = false; - bool MIMG_R128 = false; - bool CIInsts = false; - bool GFX8Insts = false; - bool GFX9Insts = false; - bool GFX90AInsts = false; - bool GFX940Insts = false; - bool GFX950Insts = false; - bool GFX10Insts = false; - bool GFX11Insts = false; - bool GFX12Insts = false; - bool GFX1250Insts = false; - bool GFX10_3Insts = false; - bool GFX7GFX8GFX9Insts = false; - bool SGPRInitBug = false; - bool UserSGPRInit16Bug = false; - bool NegativeScratchOffsetBug = false; - bool NegativeUnalignedScratchOffsetBug = false; - bool HasSMemRealTime = false; - bool HasIntClamp = false; - bool HasFmaMixInsts = false; - bool HasFmaMixBF16Insts = false; - bool HasMovrel = false; - bool HasVGPRIndexMode = false; - bool HasScalarDwordx3Loads = false; - bool HasScalarStores = false; - bool HasScalarAtomics = false; - bool HasSDWAOmod = false; - bool HasSDWAScalar = false; - bool HasSDWASdst = false; - bool HasSDWAMac = false; - bool HasSDWAOutModsVOPC = false; - bool HasDPP = false; - bool HasDPP8 = false; - bool HasDPALU_DPP = false; - bool HasDPPSrc1SGPR = false; - bool HasPackedFP32Ops = false; - bool HasImageInsts = false; - bool HasExtendedImageInsts = false; - bool HasR128A16 = false; - bool HasA16 = false; - bool HasG16 = false; - bool HasNSAEncoding = false; - bool HasPartialNSAEncoding = false; - bool GFX10_AEncoding = false; - bool GFX10_BEncoding = false; - bool HasDLInsts = false; - bool HasFmacF64Inst = false; - 
bool HasDot1Insts = false; - bool HasDot2Insts = false; - bool HasDot3Insts = false; - bool HasDot4Insts = false; - bool HasDot5Insts = false; - bool HasDot6Insts = false; - bool HasDot7Insts = false; - bool HasDot8Insts = false; - bool HasDot9Insts = false; - bool HasDot10Insts = false; - bool HasDot11Insts = false; - bool HasDot12Insts = false; - bool HasDot13Insts = false; - bool HasMAIInsts = false; - bool HasFP8Insts = false; - bool HasFP8ConversionInsts = false; - bool HasFP8E5M3Insts = false; - bool HasCvtFP8Vop1Bug = false; - bool HasPkFmacF16Inst = false; - bool HasAtomicFMinFMaxF32GlobalInsts = false; - bool HasAtomicFMinFMaxF64GlobalInsts = false; - bool HasAtomicFMinFMaxF32FlatInsts = false; - bool HasAtomicFMinFMaxF64FlatInsts = false; - bool HasAtomicDsPkAdd16Insts = false; - bool HasAtomicFlatPkAdd16Insts = false; - bool HasAtomicFaddRtnInsts = false; - bool HasAtomicFaddNoRtnInsts = false; - bool HasMemoryAtomicFaddF32DenormalSupport = false; - bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false; - bool HasAtomicBufferGlobalPkAddF16Insts = false; - bool HasAtomicCSubNoRtnInsts = false; - bool HasAtomicGlobalPkAddBF16Inst = false; - bool HasAtomicBufferPkAddBF16Inst = false; - bool HasFlatAtomicFaddF32Inst = false; - bool HasFlatBufferGlobalAtomicFaddF64Inst = false; - bool HasDefaultComponentZero = false; - bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false; - bool HasEmulatedSystemScopeAtomics = false; - bool HasDefaultComponentBroadcast = false; - bool HasXF32Insts = false; + bool DynamicVGPR = false; + bool DynamicVGPRBlockSize32 = false; + bool ScalarizeGlobal = false; + /// The maximum number of instructions that may be placed within an S_CLAUSE, /// which is one greater than the maximum argument to S_CLAUSE. A value of 0 /// indicates a lack of S_CLAUSE support. 
unsigned MaxHardClauseLength = 0; - bool SupportsSRAMECC = false; - bool DynamicVGPR = false; - bool DynamicVGPRBlockSize32 = false; - bool HasVMemToLDSLoad = false; - bool RequiresAlignVGPR = false; - - // This should not be used directly. 'TargetID' tracks the dynamic settings - // for SRAMECC. - bool EnableSRAMECC = false; - - bool HasNoSdstCMPX = false; - bool HasVscnt = false; - bool HasWaitXcnt = false; - bool HasGetWaveIdInst = false; - bool HasSMemTimeInst = false; - bool HasShaderCyclesRegister = false; - bool HasShaderCyclesHiLoRegisters = false; - bool HasVOP3Literal = false; - bool HasNoDataDepHazard = false; - bool FlatAddressSpace = false; - bool FlatInstOffsets = false; - bool FlatGlobalInsts = false; - bool FlatScratchInsts = false; - bool FlatGVSMode = false; - bool ScalarFlatScratchInsts = false; - bool HasArchitectedFlatScratch = false; - bool EnableFlatScratch = false; - bool HasArchitectedSGPRs = false; - bool HasGDS = false; - bool HasGWS = false; - bool AddNoCarryInsts = false; - bool HasUnpackedD16VMem = false; - bool LDSMisalignedBug = false; - bool HasMFMAInlineLiteralBug = false; - bool UnalignedBufferAccess = false; - bool UnalignedDSAccess = false; - bool HasPackedTID = false; - bool ScalarizeGlobal = false; - bool HasSALUFloatInsts = false; - bool HasPseudoScalarTrans = false; - bool HasRestrictedSOffset = false; - bool Has64BitLiterals = false; - bool Has1024AddressableVGPRs = false; - bool HasBitOp3Insts = false; - bool HasTanhInsts = false; - bool HasTensorCvtLutInsts = false; - bool HasTransposeLoadF4F6Insts = false; - bool HasPrngInst = false; - bool HasBVHDualAndBVH8Insts = false; - bool HasPermlane16Swap = false; - bool HasPermlane32Swap = false; - bool HasVcmpxPermlaneHazard = false; - bool HasVMEMtoScalarWriteHazard = false; - bool HasSMEMtoVectorWriteHazard = false; - bool HasInstFwdPrefetchBug = false; - bool HasVmemPrefInsts = false; - bool HasSafeSmemPrefetch = false; - bool HasSafeCUPrefetch = false; - bool 
HasVcmpxExecWARHazard = false; - bool HasLdsBranchVmemWARHazard = false; - bool HasNSAtoVMEMBug = false; - bool HasNSAClauseBug = false; - bool HasOffset3fBug = false; - bool HasFlatSegmentOffsetBug = false; - bool HasImageStoreD16Bug = false; - bool HasImageGather4D16Bug = false; - bool HasMSAALoadDstSelBug = false; - bool HasPrivEnabledTrap2NopBug = false; - bool Has1_5xVGPRs = false; - bool HasMADIntraFwdBug = false; - bool HasVOPDInsts = false; - bool HasVALUTransUseHazard = false; - bool HasRequiredExportPriority = false; - bool HasVmemWriteVgprInOrder = false; - bool HasAshrPkInsts = false; - bool HasIEEEMinimumMaximumInsts = false; - bool HasMinimum3Maximum3F32 = false; - bool HasMinimum3Maximum3F16 = false; - bool HasMin3Max3PKF16 = false; - bool HasMinimum3Maximum3PKF16 = false; - bool HasLshlAddU64Inst = false; - bool HasAddSubU64Insts = false; - bool HasMadU32Inst = false; - bool HasPointSampleAccel = false; - bool HasLdsBarrierArriveAtomic = false; - bool HasSetPrioIncWgInst = false; - - bool RequiresCOV6 = false; - bool UseBlockVGPROpsForCSR = false; - bool HasGloballyAddressableScratch = false; - - bool Has45BitNumRecordsBufferResource = false; - - bool HasClusters = false; - - // Dummy feature to use for assembler in tablegen. - bool FeatureDisable = false; + +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "AMDGPUGenSubtargetInfo.inc" private: SIInstrInfo InstrInfo; @@ -303,24 +90,20 @@ public: const GCNTargetMachine &TM); ~GCNSubtarget() override; - GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS); + GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, + StringRef FS); /// Diagnose inconsistent subtarget features before attempting to codegen /// function \p F. 
void checkSubtargetFeatures(const Function &F) const; - const SIInstrInfo *getInstrInfo() const override { - return &InstrInfo; - } + const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; } const SIFrameLowering *getFrameLowering() const override { return &FrameLowering; } - const SITargetLowering *getTargetLowering() const override { - return &TLInfo; - } + const SITargetLowering *getTargetLowering() const override { return &TLInfo; } const SIRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); @@ -358,9 +141,13 @@ public: void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); - Generation getGeneration() const { - return (Generation)Gen; - } + Generation getGeneration() const { return (Generation)Gen; } + + bool isGFX11Plus() const { return getGeneration() >= GFX11; } + +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool GETTER() const override { return ATTRIBUTE; } +#include "AMDGPUGenSubtargetInfo.inc" unsigned getMaxWaveScratchSize() const { // See COMPUTE_TMPRING_SIZE.WAVESIZE. @@ -381,12 +168,11 @@ public: return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); } - int getLDSBankCount() const { - return LDSBankCount; - } + int getLDSBankCount() const { return LDSBankCount; } unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { - return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; + return (ForBufferRSrc || !hasFlatScratchEnabled()) ? 
MaxPrivateElementSize + : 16; } unsigned getConstantBusLimit(unsigned Opcode) const; @@ -397,34 +183,12 @@ public: bool zeroesHigh16BitsOfDest(unsigned Opcode) const; bool supportsWGP() const { - if (GFX1250Insts) + if (HasGFX1250Insts) return false; return getGeneration() >= GFX10; } - bool hasIntClamp() const { - return HasIntClamp; - } - - bool hasFP64() const { - return FP64; - } - - bool hasMIMG_R128() const { - return MIMG_R128; - } - - bool hasHWFP64() const { - return FP64; - } - - bool hasHalfRate64Ops() const { - return HalfRate64Ops; - } - - bool hasFullRate64Ops() const { - return FullRate64Ops; - } + bool hasHWFP64() const { return HasFP64; } bool hasAddr64() const { return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); @@ -440,67 +204,19 @@ public: return getGeneration() >= VOLCANIC_ISLANDS; } - bool hasFractBug() const { - return getGeneration() == SOUTHERN_ISLANDS; - } - - bool hasBFE() const { - return true; - } - - bool hasBFI() const { - return true; - } - - bool hasBFM() const { - return hasBFE(); - } - - bool hasBCNT(unsigned Size) const { - return true; - } + bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; } - bool hasFFBL() const { - return true; - } - - bool hasFFBH() const { - return true; - } - - bool hasMed3_16() const { - return getGeneration() >= AMDGPUSubtarget::GFX9; - } + bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } bool hasMin3Max3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasFmaMixInsts() const { - return HasFmaMixInsts; - } + bool hasSwap() const { return HasGFX9Insts; } - bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; } + bool hasScalarPackInsts() const { return HasGFX9Insts; } - bool hasCARRY() const { - return true; - } - - bool hasFMA() const { - return FMA; - } - - bool hasSwap() const { - return GFX9Insts; - } - - bool hasScalarPackInsts() const { - return GFX9Insts; - } - - bool hasScalarMulHiInsts() const { - return 
GFX9Insts; - } + bool hasScalarMulHiInsts() const { return HasGFX9Insts; } bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; } @@ -515,9 +231,7 @@ public: /// True if the offset field of DS instructions works as expected. On SI, the /// offset uses a 16-bit adder and does not always wrap properly. - bool hasUsableDSOffset() const { - return getGeneration() >= SEA_ISLANDS; - } + bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; } bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; @@ -530,14 +244,10 @@ public: /// Extra wait hazard is needed in some cases before /// s_cbranch_vccnz/s_cbranch_vccz. - bool hasReadVCCZBug() const { - return getGeneration() <= SEA_ISLANDS; - } + bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; } /// Writes to VCC_LO/VCC_HI update the VCCZ flag. - bool partialVCCWritesUpdateVCCZ() const { - return getGeneration() >= GFX10; - } + bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; } /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR /// was written by a VALU instruction. @@ -551,19 +261,13 @@ public: return getGeneration() >= VOLCANIC_ISLANDS; } - bool hasRFEHazards() const { - return getGeneration() >= VOLCANIC_ISLANDS; - } + bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; } /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. unsigned getSetRegWaitStates() const { return getGeneration() <= SEA_ISLANDS ? 1 : 2; } - bool dumpCode() const { - return DumpCode; - } - /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. 
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, @@ -578,25 +282,15 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX10; } - bool useFlatForGlobal() const { - return FlatForGlobal; - } - /// \returns If target supports ds_read/write_b128 and user enables generation /// of ds_read/write_b128. - bool useDS128() const { - return CIInsts && EnableDS128; - } + bool useDS128() const { return HasCIInsts && EnableDS128; } /// \return If target supports ds_read/write_b96/128. - bool hasDS96AndDS128() const { - return CIInsts; - } + bool hasDS96AndDS128() const { return HasCIInsts; } /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 - bool haveRoundOpsF64() const { - return CIInsts; - } + bool haveRoundOpsF64() const { return HasCIInsts; } /// \returns If MUBUF instructions always perform range checking, even for /// buffer resources used for private memory access. @@ -606,91 +300,29 @@ public: /// \returns If target requires PRT Struct NULL support (zero result registers /// for sparse texture support). - bool usePRTStrictNull() const { - return EnablePRTStrictNull; - } - - bool hasAutoWaitcntBeforeBarrier() const { - return AutoWaitcntBeforeBarrier; - } - - /// \returns true if the target supports backing off of s_barrier instructions - /// when an exception is raised. 
- bool supportsBackOffBarrier() const { - return BackOffBarrier; - } - - bool hasUnalignedBufferAccess() const { - return UnalignedBufferAccess; - } + bool usePRTStrictNull() const { return EnablePRTStrictNull; } bool hasUnalignedBufferAccessEnabled() const { - return UnalignedBufferAccess && UnalignedAccessMode; - } - - bool hasUnalignedDSAccess() const { - return UnalignedDSAccess; + return HasUnalignedBufferAccess && HasUnalignedAccessMode; } bool hasUnalignedDSAccessEnabled() const { - return UnalignedDSAccess && UnalignedAccessMode; - } - - bool hasUnalignedScratchAccess() const { - return UnalignedScratchAccess; + return HasUnalignedDSAccess && HasUnalignedAccessMode; } bool hasUnalignedScratchAccessEnabled() const { - return UnalignedScratchAccess && UnalignedAccessMode; + return HasUnalignedScratchAccess && HasUnalignedAccessMode; } - bool hasUnalignedAccessMode() const { - return UnalignedAccessMode; - } - - bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; } - - bool hasApertureRegs() const { - return HasApertureRegs; - } + bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); } - bool isTrapHandlerEnabled() const { - return TrapHandler; - } + bool isTgSplitEnabled() const { return EnableTgSplit; } - bool isXNACKEnabled() const { - return TargetID.isXnackOnOrAny(); - } - - bool isTgSplitEnabled() const { - return EnableTgSplit; - } - - bool isCuModeEnabled() const { - return EnableCuMode; - } + bool isCuModeEnabled() const { return EnableCuMode; } bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } - bool hasFlatAddressSpace() const { - return FlatAddressSpace; - } - - bool hasFlatScrRegister() const { - return hasFlatAddressSpace(); - } - - bool hasFlatInstOffsets() const { - return FlatInstOffsets; - } - - bool hasFlatGlobalInsts() const { - return FlatGlobalInsts; - } - - bool hasFlatScratchInsts() const { - return FlatScratchInsts; - } + bool hasFlatScrRegister() const { return hasFlatAddressSpace(); } // 
Check if target supports ST addressing mode with FLAT scratch instructions. // The ST addressing mode means no registers are used, either VGPR or SGPR, @@ -699,24 +331,16 @@ public: return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); } - bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } + bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; } - bool hasScalarFlatScratchInsts() const { - return ScalarFlatScratchInsts; - } - - bool enableFlatScratch() const { - return flatScratchIsArchitected() || + bool hasFlatScratchEnabled() const { + return hasArchitectedFlatScratch() || (EnableFlatScratch && hasFlatScratchInsts()); } - bool hasGlobalAddTidInsts() const { - return GFX10_BEncoding; - } + bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; } - bool hasAtomicCSub() const { - return GFX10_BEncoding; - } + bool hasAtomicCSub() const { return HasGFX10_BEncoding; } bool hasMTBUFInsts() const { return !hasGFX1250Insts(); } @@ -726,7 +350,9 @@ public: return !hasGFX940Insts() && !hasGFX1250Insts(); } - bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } + bool hasVINTERPEncoding() const { + return HasGFX11Insts && !hasGFX1250Insts(); + } // DS_ADD_F64/DS_ADD_RTN_F64 bool hasLdsAtomicAddF64() const { @@ -737,274 +363,45 @@ public: return getGeneration() >= GFX9; } - bool hasFlatSegmentOffsetBug() const { - return HasFlatSegmentOffsetBug; - } + bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; } - bool hasFlatLgkmVMemCountInOrder() const { - return getGeneration() > GFX9; - } - - bool hasD16LoadStore() const { - return getGeneration() >= GFX9; - } + bool hasD16LoadStore() const { return getGeneration() >= GFX9; } bool d16PreservesUnusedBits() const { return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); } - bool hasD16Images() const { - return getGeneration() >= VOLCANIC_ISLANDS; - } + bool hasD16Images() const { return getGeneration() >= 
VOLCANIC_ISLANDS; } /// Return if most LDS instructions have an m0 use that require m0 to be /// initialized. - bool ldsRequiresM0Init() const { - return getGeneration() < GFX9; - } + bool ldsRequiresM0Init() const { return getGeneration() < GFX9; } // True if the hardware rewinds and replays GWS operations if a wave is // preempted. // // If this is false, a GWS operation requires testing if a nack set the // MEM_VIOL bit, and repeating if so. - bool hasGWSAutoReplay() const { - return getGeneration() >= GFX9; - } + bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; } /// \returns if target has ds_gws_sema_release_all instruction. - bool hasGWSSemaReleaseAll() const { - return CIInsts; - } - - /// \returns true if the target has integer add/sub instructions that do not - /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, - /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier - /// for saturation. - bool hasAddNoCarry() const { - return AddNoCarryInsts; - } + bool hasGWSSemaReleaseAll() const { return HasCIInsts; } bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } - bool hasUnpackedD16VMem() const { - return HasUnpackedD16VMem; - } - // Covers VS/PS/CS graphics shaders bool isMesaGfxShader(const Function &F) const { return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); } - bool hasMad64_32() const { - return getGeneration() >= SEA_ISLANDS; - } - - bool hasSDWAOmod() const { - return HasSDWAOmod; - } - - bool hasSDWAScalar() const { - return HasSDWAScalar; - } - - bool hasSDWASdst() const { - return HasSDWASdst; - } - - bool hasSDWAMac() const { - return HasSDWAMac; - } - - bool hasSDWAOutModsVOPC() const { - return HasSDWAOutModsVOPC; - } - - bool hasDLInsts() const { - return HasDLInsts; - } - - bool hasFmacF64Inst() const { return HasFmacF64Inst; } - - bool hasDot1Insts() const { - return HasDot1Insts; - } - - bool 
hasDot2Insts() const { - return HasDot2Insts; - } - - bool hasDot3Insts() const { - return HasDot3Insts; - } - - bool hasDot4Insts() const { - return HasDot4Insts; - } - - bool hasDot5Insts() const { - return HasDot5Insts; - } - - bool hasDot6Insts() const { - return HasDot6Insts; - } - - bool hasDot7Insts() const { - return HasDot7Insts; - } - - bool hasDot8Insts() const { - return HasDot8Insts; - } - - bool hasDot9Insts() const { - return HasDot9Insts; - } - - bool hasDot10Insts() const { - return HasDot10Insts; - } - - bool hasDot11Insts() const { - return HasDot11Insts; - } - - bool hasDot12Insts() const { - return HasDot12Insts; - } - - bool hasDot13Insts() const { - return HasDot13Insts; - } - - bool hasMAIInsts() const { - return HasMAIInsts; - } - - bool hasFP8Insts() const { - return HasFP8Insts; - } - - bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } - - bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; } - - bool hasPkFmacF16Inst() const { - return HasPkFmacF16Inst; - } - - bool hasAtomicFMinFMaxF32GlobalInsts() const { - return HasAtomicFMinFMaxF32GlobalInsts; - } - - bool hasAtomicFMinFMaxF64GlobalInsts() const { - return HasAtomicFMinFMaxF64GlobalInsts; - } - - bool hasAtomicFMinFMaxF32FlatInsts() const { - return HasAtomicFMinFMaxF32FlatInsts; - } - - bool hasAtomicFMinFMaxF64FlatInsts() const { - return HasAtomicFMinFMaxF64FlatInsts; - } - - bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } - - bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } + bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; } bool hasAtomicFaddInsts() const { return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; } - bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } - - bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } - - bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { - return HasAtomicBufferGlobalPkAddF16NoRtnInsts; - } - - bool 
hasAtomicBufferGlobalPkAddF16Insts() const { - return HasAtomicBufferGlobalPkAddF16Insts; - } - - bool hasAtomicGlobalPkAddBF16Inst() const { - return HasAtomicGlobalPkAddBF16Inst; - } - - bool hasAtomicBufferPkAddBF16Inst() const { - return HasAtomicBufferPkAddBF16Inst; - } - - bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } - - /// \return true if the target has flat, global, and buffer atomic fadd for - /// double. - bool hasFlatBufferGlobalAtomicFaddF64Inst() const { - return HasFlatBufferGlobalAtomicFaddF64Inst; - } - - /// \return true if the target's flat, global, and buffer atomic fadd for - /// float supports denormal handling. - bool hasMemoryAtomicFaddF32DenormalSupport() const { - return HasMemoryAtomicFaddF32DenormalSupport; - } - - /// \return true if atomic operations targeting fine-grained memory work - /// correctly at device scope, in allocations in host or peer PCIe device - /// memory. - bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const { - return HasAgentScopeFineGrainedRemoteMemoryAtomics; - } - - /// \return true is HW emulates system scope atomics unsupported by the PCI-e - /// via CAS loop. 
- bool hasEmulatedSystemScopeAtomics() const { - return HasEmulatedSystemScopeAtomics; - } - - bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } - - bool hasDefaultComponentBroadcast() const { - return HasDefaultComponentBroadcast; - } - - bool hasNoSdstCMPX() const { - return HasNoSdstCMPX; - } - - bool hasVscnt() const { - return HasVscnt; - } - - bool hasGetWaveIdInst() const { - return HasGetWaveIdInst; - } - - bool hasSMemTimeInst() const { - return HasSMemTimeInst; - } - - bool hasShaderCyclesRegister() const { - return HasShaderCyclesRegister; - } - - bool hasShaderCyclesHiLoRegisters() const { - return HasShaderCyclesHiLoRegisters; - } - - bool hasVOP3Literal() const { - return HasVOP3Literal; - } - - bool hasNoDataDepHazard() const { - return HasNoDataDepHazard; - } - bool vmemWriteNeedsExpWaitcnt() const { return getGeneration() < SEA_ISLANDS; } @@ -1013,13 +410,7 @@ public: return getGeneration() == GFX10 || getGeneration() == GFX11; } - bool hasPrefetch() const { return GFX12Insts; } - - bool hasVmemPrefInsts() const { return HasVmemPrefInsts; } - - bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; } - - bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + bool hasPrefetch() const { return HasGFX12Insts; } // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1034,15 +425,11 @@ public: // dynamic realignment in common cases. 
Align getStackAlignment() const { return Align(16); } - bool enableMachineScheduler() const override { - return true; - } + bool enableMachineScheduler() const override { return true; } bool useAA() const override; - bool enableSubRegLiveness() const override { - return true; - } + bool enableSubRegLiveness() const override { return true; } void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } @@ -1051,9 +438,7 @@ public: static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); // XXX - Why is this here if it isn't in the default pass set? - bool enableEarlyIfConversion() const override { - return true; - } + bool enableEarlyIfConversion() const override { return true; } void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override; @@ -1067,73 +452,35 @@ public: return AMDGPU::getMaxNumUserSGPRs(*this); } - bool hasSMemRealTime() const { - return HasSMemRealTime; - } - - bool hasMovrel() const { - return HasMovrel; - } - - bool hasVGPRIndexMode() const { - return HasVGPRIndexMode; - } - bool useVGPRIndexMode() const; bool hasScalarCompareEq64() const { return getGeneration() >= VOLCANIC_ISLANDS; } - bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; } - - bool hasScalarStores() const { - return HasScalarStores; - } - - bool hasScalarAtomics() const { - return HasScalarAtomics; + bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; } + bool hasLDSFPAtomicAddF64() const { + return HasGFX90AInsts || HasGFX1250Insts; } - bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } - bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; } - /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
bool hasPermLane64() const { return getGeneration() >= GFX11; } - bool hasDPP() const { - return HasDPP; - } - - bool hasDPPBroadcasts() const { - return HasDPP && getGeneration() < GFX10; - } + bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; } bool hasDPPWavefrontShifts() const { return HasDPP && getGeneration() < GFX10; } - bool hasDPP8() const { - return HasDPP8; - } - - bool hasDPALU_DPP() const { - return HasDPALU_DPP; - } - - bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; } - - bool hasPackedFP32Ops() const { - return HasPackedFP32Ops; + bool hasDPPRowShare() const { + return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10); } // Has V_PK_MOV_B32 opcode - bool hasPkMovB32() const { - return GFX90AInsts; - } + bool hasPkMovB32() const { return HasGFX90AInsts; } bool hasFmaakFmamkF32Insts() const { return getGeneration() >= GFX10 || hasGFX940Insts(); @@ -1141,96 +488,26 @@ public: bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); } - bool hasImageInsts() const { - return HasImageInsts; - } - - bool hasExtendedImageInsts() const { - return HasExtendedImageInsts; - } - - bool hasR128A16() const { - return HasR128A16; - } - - bool hasA16() const { return HasA16; } - - bool hasG16() const { return HasG16; } - - bool hasOffset3fBug() const { - return HasOffset3fBug; - } - - bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } - - bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } - - bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } - - bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } - - bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; } - - bool hasNSAEncoding() const { return HasNSAEncoding; } - bool hasNonNSAEncoding() const { return getGeneration() < GFX12; } - bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } - unsigned getNSAMaxSize(bool HasSampler = false) const { return AMDGPU::getNSAMaxSize(*this, 
HasSampler); } - bool hasGFX10_AEncoding() const { - return GFX10_AEncoding; - } - - bool hasGFX10_BEncoding() const { - return GFX10_BEncoding; - } - - bool hasGFX10_3Insts() const { - return GFX10_3Insts; - } - bool hasMadF16() const; - bool hasMovB64() const { return GFX940Insts || GFX1250Insts; } - - bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; } // Scalar and global loads support scale_offset bit. - bool hasScaleOffset() const { return GFX1250Insts; } - - bool hasFlatGVSMode() const { return FlatGVSMode; } + bool hasScaleOffset() const { return HasGFX1250Insts; } // FLAT GLOBAL VOffset is signed - bool hasSignedGVSOffset() const { return GFX1250Insts; } + bool hasSignedGVSOffset() const { return HasGFX1250Insts; } - bool enableSIScheduler() const { - return EnableSIScheduler; - } + bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; } - bool loadStoreOptEnabled() const { - return EnableLoadStoreOpt; - } - - bool hasSGPRInitBug() const { - return SGPRInitBug; - } - - bool hasUserSGPRInit16Bug() const { - return UserSGPRInit16Bug && isWave32(); - } - - bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } - - bool hasNegativeUnalignedScratchOffsetBug() const { - return NegativeUnalignedScratchOffsetBug; - } - - bool hasMFMAInlineLiteralBug() const { - return HasMFMAInlineLiteralBug; + bool hasUserSGPRInit16BugInWave32() const { + return HasUserSGPRInit16Bug && isWave32(); } bool has12DWordStoreHazard() const { @@ -1238,9 +515,7 @@ public: } // \returns true if the subtarget supports DWORDX3 load/store instructions. 
- bool hasDwordx3LoadStores() const { - return CIInsts; - } + bool hasDwordx3LoadStores() const { return HasCIInsts; } bool hasReadM0MovRelInterpHazard() const { return getGeneration() == AMDGPUSubtarget::GFX9; @@ -1259,66 +534,32 @@ public: return getGeneration() == AMDGPUSubtarget::GFX9; } - bool hasVcmpxPermlaneHazard() const { - return HasVcmpxPermlaneHazard; - } - - bool hasVMEMtoScalarWriteHazard() const { - return HasVMEMtoScalarWriteHazard; - } - - bool hasSMEMtoVectorWriteHazard() const { - return HasSMEMtoVectorWriteHazard; - } - - bool hasLDSMisalignedBug() const { - return LDSMisalignedBug && !EnableCuMode; - } - - bool hasInstFwdPrefetchBug() const { - return HasInstFwdPrefetchBug; - } - - bool hasVcmpxExecWARHazard() const { - return HasVcmpxExecWARHazard; - } - - bool hasLdsBranchVmemWARHazard() const { - return HasLdsBranchVmemWARHazard; + bool hasLDSMisalignedBugInWGPMode() const { + return HasLDSMisalignedBug && !EnableCuMode; } // Shift amount of a 64 bit shift cannot be a highest allocated register // if also at the end of the allocation block. bool hasShift64HighRegBug() const { - return GFX90AInsts && !GFX940Insts; + return HasGFX90AInsts && !HasGFX940Insts; } // Has one cycle hazard on transcendental instruction feeding a // non transcendental VALU. - bool hasTransForwardingHazard() const { return GFX940Insts; } + bool hasTransForwardingHazard() const { return HasGFX940Insts; } // Has one cycle hazard on a VALU instruction partially writing dst with // a shift of result bits feeding another VALU instruction. - bool hasDstSelForwardingHazard() const { return GFX940Insts; } + bool hasDstSelForwardingHazard() const { return HasGFX940Insts; } // Cannot use op_sel with v_dot instructions. - bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } + bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; } // Does not have HW interlocs for VALU writing and then reading SGPRs. 
- bool hasVDecCoExecHazard() const { - return GFX940Insts; - } - - bool hasNSAtoVMEMBug() const { - return HasNSAtoVMEMBug; - } - - bool hasNSAClauseBug() const { return HasNSAClauseBug; } + bool hasVDecCoExecHazard() const { return HasGFX940Insts; } bool hasHardClauses() const { return MaxHardClauseLength > 0; } - bool hasGFX90AInsts() const { return GFX90AInsts; } - bool hasFPAtomicToDenormModeHazard() const { return getGeneration() == GFX10; } @@ -1333,77 +574,45 @@ public: return getGeneration() == GFX11; } - bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } - - bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } + bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; } bool requiresCodeObjectV6() const { return RequiresCOV6; } bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; } - bool hasGloballyAddressableScratch() const { - return HasGloballyAddressableScratch; - } - bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } - bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } + bool hasVALUReadSGPRHazard() const { + return HasGFX12Insts && !HasGFX1250Insts; + } bool setRegModeNeedsVNOPs() const { - return GFX1250Insts && getGeneration() == GFX12; + return HasGFX1250Insts && getGeneration() == GFX12; } /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return RequiresAlignVGPR; } /// Return true if the target has the S_PACK_HL_B32_B16 instruction. - bool hasSPackHL() const { return GFX11Insts; } + bool hasSPackHL() const { return HasGFX11Insts; } /// Return true if the target's EXP instruction has the COMPR flag, which /// affects the meaning of the EN (enable) bits. - bool hasCompressedExport() const { return !GFX11Insts; } + bool hasCompressedExport() const { return !HasGFX11Insts; } /// Return true if the target's EXP instruction supports the NULL export /// target. 
- bool hasNullExportTarget() const { return !GFX11Insts; } - - bool has1_5xVGPRs() const { return Has1_5xVGPRs; } - - bool hasVOPDInsts() const { return HasVOPDInsts; } + bool hasNullExportTarget() const { return !HasGFX11Insts; } bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } /// Return true if the target has the S_DELAY_ALU instruction. - bool hasDelayAlu() const { return GFX11Insts; } - - bool hasPackedTID() const { return HasPackedTID; } - - // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that - // hasGFX90AInsts is also true. - bool hasGFX940Insts() const { return GFX940Insts; } - - // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that - // hasGFX940Insts and hasGFX90AInsts are also true. - bool hasGFX950Insts() const { return GFX950Insts; } + bool hasDelayAlu() const { return HasGFX11Insts; } /// Returns true if the target supports /// global_load_lds_dwordx3/global_load_lds_dwordx4 or /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit. - bool hasLDSLoadB96_B128() const { - return hasGFX950Insts(); - } - - bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; } - - bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } - - bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } - - bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } - - bool hasRequiredExportPriority() const { return HasRequiredExportPriority; } - - bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; } + bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); } /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. @@ -1415,59 +624,25 @@ public: return getGeneration() == GFX12; } - /// \returns true if the target has instructions with xf32 format support. 
- bool hasXF32Insts() const { return HasXF32Insts; } - - bool hasBitOp3Insts() const { return HasBitOp3Insts; } - - bool hasPermlane16Swap() const { return HasPermlane16Swap; } - bool hasPermlane32Swap() const { return HasPermlane32Swap; } - bool hasAshrPkInsts() const { return HasAshrPkInsts; } - - bool hasMinimum3Maximum3F32() const { - return HasMinimum3Maximum3F32; - } - - bool hasMinimum3Maximum3F16() const { - return HasMinimum3Maximum3F16; + /// \returns true if the target has packed f32 instructions that only read 32 + /// bits from a scalar operand (SGPR or literal) and replicates the bits to + /// both channels. + bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const { + return getGeneration() == GFX12 && HasGFX1250Insts; } - bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; } - - bool hasTanhInsts() const { return HasTanhInsts; } - - bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; } - - bool hasAddPC64Inst() const { return GFX1250Insts; } - - bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; } - - bool hasMinimum3Maximum3PKF16() const { - return HasMinimum3Maximum3PKF16; - } - - bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; } - - /// \returns true if the target has s_wait_xcnt insertion. Supported for - /// GFX1250. - bool hasWaitXCnt() const { return HasWaitXcnt; } + bool hasAddPC64Inst() const { return HasGFX1250Insts; } - // A single DWORD instructions can use a 64-bit literal. - bool has64BitLiterals() const { return Has64BitLiterals; } - - bool hasPointSampleAccel() const { return HasPointSampleAccel; } - - bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; } + /// \returns true if the target supports expert scheduling mode 2 which relies + /// on the compiler to insert waits to avoid hazards between VMEM and VALU + /// instructions in some instances. 
+ bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; } /// \returns The maximum number of instructions that can be enclosed in an /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that /// instruction. unsigned maxHardClauseLength() const { return MaxHardClauseLength; } - bool hasPrngInst() const { return HasPrngInst; } - - bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; } - /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -1496,50 +671,22 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } - /// \returns true if the flat_scratch register is initialized by the HW. - /// In this case it is readonly. - bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } - - /// \returns true if the architected SGPRs are enabled. - bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } - - /// \returns true if Global Data Share is supported. - bool hasGDS() const { return HasGDS; } - - /// \returns true if Global Wave Sync is supported. - bool hasGWS() const { return HasGWS; } - /// \returns true if the machine has merged shaders in which s0-s7 are /// reserved by the hardware and user SGPRs start at s8 - bool hasMergedShaders() const { - return getGeneration() >= GFX9; - } + bool hasMergedShaders() const { return getGeneration() >= GFX9; } // \returns true if the target supports the pre-NGG legacy geometry path. bool hasLegacyGeometry() const { return getGeneration() < GFX11; } - // \returns true if preloading kernel arguments is supported. - bool hasKernargPreload() const { return KernargPreload; } - // \returns true if the target has split barriers feature bool hasSplitBarriers() const { return getGeneration() >= GFX12; } - // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. 
- bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; } - - // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a - // no-return form. - bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; } - // \returns true if the target has DX10_CLAMP kernel descriptor mode bit bool hasDX10ClampMode() const { return getGeneration() < GFX12; } // \returns true if the target has IEEE kernel descriptor mode bit bool hasIEEEMode() const { return getGeneration() < GFX12; } - // \returns true if the target has IEEE fminimum/fmaximum instructions - bool hasIEEEMinimumMaximumInsts() const { return HasIEEEMinimumMaximumInsts; } - // \returns true if the target has WG_RR_MODE kernel descriptor mode bit bool hasRrWGMode() const { return getGeneration() >= GFX12; } @@ -1547,52 +694,43 @@ public: /// values. bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; } - bool hasGFX1250Insts() const { return GFX1250Insts; } + bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; } - bool hasVOPD3() const { return GFX1250Insts; } - - // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. - bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } - - // \returns true if the target has V_MAD_U32 instruction. - bool hasMadU32Inst() const { return HasMadU32Inst; } + bool hasVOPD3() const { return HasGFX1250Insts; } // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. - bool hasVectorMulU64() const { return GFX1250Insts; } + bool hasVectorMulU64() const { return HasGFX1250Insts; } // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 // instructions. - bool hasMadU64U32NoCarry() const { return GFX1250Insts; } + bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; } // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions. - bool hasIntMinMax64() const { return GFX1250Insts; } - - // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions. 
- bool hasAddMinMaxInsts() const { return GFX1250Insts; } - - // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. - bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } + bool hasIntMinMax64() const { return HasGFX1250Insts; } // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions. - bool hasPkMinMax3Insts() const { return GFX1250Insts; } + bool hasPkMinMax3Insts() const { return HasGFX1250Insts; } // \returns ture if target has S_GET_SHADER_CYCLES_U64 instruction. - bool hasSGetShaderCyclesInst() const { return GFX1250Insts; } - - // \returns true if target has S_SETPRIO_INC_WG instruction. - bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } + bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; } // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead // of sign-extending. Note that GFX1250 has not only fixed the bug but also // extended VA to 57 bits. - bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; } + bool hasGetPCZeroExtension() const { + return HasGFX12Insts && !HasGFX1250Insts; + } // \returns true if the target needs to create a prolog for backward // compatibility when preloading kernel arguments. bool needsKernArgPreloadProlog() const { - return hasKernargPreload() && !GFX1250Insts; + return hasKernargPreload() && !HasGFX1250Insts; } + bool hasCondSubInsts() const { return HasGFX12Insts; } + + bool hasSubClampInsts() const { return hasGFX10_3Insts(); } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); @@ -1724,9 +862,7 @@ public: /// unit requirement. 
unsigned getMaxNumVGPRs(const Function &F) const; - unsigned getMaxNumAGPRs(const Function &F) const { - return getMaxNumVGPRs(F); - } + unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); } /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number /// of waves per execution unit required for the function \p MF. @@ -1746,13 +882,9 @@ public: bool supportsWave64() const { return !hasGFX1250Insts(); } - bool isWave32() const { - return getWavefrontSize() == 32; - } + bool isWave32() const { return getWavefrontSize() == 32; } - bool isWave64() const { - return getWavefrontSize() == 64; - } + bool isWave64() const { return getWavefrontSize() == 64; } /// Returns if the wavesize of this subtarget is known reliable. This is false /// only for the a default target-cpu that does not have an explicit @@ -1809,11 +941,11 @@ public: // \returns true if the subtarget has a hazard requiring an "s_nop 0" // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". - bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; } + bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; } // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER. - bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; } + bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; } bool isDynamicVGPREnabled() const { return DynamicVGPR; } unsigned getDynamicVGPRBlockSize() const { @@ -1835,15 +967,21 @@ public: // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base // read. bool hasScratchBaseForwardingHazard() const { - return GFX1250Insts && getGeneration() == GFX12; + return HasGFX1250Insts && getGeneration() == GFX12; } - /// \returns true if the subtarget supports clusters of workgroups. 
- bool hasClusters() const { return HasClusters; } + // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit + // result. + bool hasFlatScratchHiInB64InstHazard() const { + return HasGFX1250Insts && getGeneration() == GFX12; + } - /// \returns true if the subtarget requires a wait for xcnt before atomic - /// flat/global stores & rmw. - bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } + /// \returns true if the subtarget requires a wait for xcnt before VMEM + /// accesses that must never be repeated in the event of a page fault/re-try. + /// Atomic stores/rmw and all volatile accesses fall under this criteria. + bool requiresWaitXCntForSingleAccessInstructions() const { + return HasGFX1250Insts; + } /// \returns the number of significant bits in the immediate field of the /// S_NOP instruction. @@ -1855,10 +993,28 @@ public: return 3; } - /// \returns true if the sub-target supports buffer resource (V#) with 45-bit - /// num_records. - bool has45BitNumRecordsBufferResource() const { - return Has45BitNumRecordsBufferResource; + bool supportsBPermute() const { + return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + } + + bool supportsWaveWideBPermute() const { + return (getGeneration() <= AMDGPUSubtarget::GFX9 || + getGeneration() == AMDGPUSubtarget::GFX12) || + isWave32(); + } + + /// Return true if real (non-fake) variants of True16 instructions using + /// 16-bit registers should be code-generated. Fake True16 instructions are + /// identical to non-fake ones except that they take 32-bit registers as + /// operands and always use their low halves. + // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully + // supported and the support for fake True16 instructions is removed. 
+ bool useRealTrue16Insts() const { + return hasTrue16BitInsts() && EnableRealTrue16Insts; + } + + bool requiresWaitOnWorkgroupReleaseFence() const { + return getGeneration() >= GFX10 || isTgSplitEnabled(); } }; diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp index 9e66909..663f538 100644 --- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -35,18 +35,18 @@ using namespace llvm; #define DEBUG_TYPE "gcn-vopd-utils" bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, - const MachineInstr &FirstMI, - const MachineInstr &SecondMI, bool IsVOPD3) { + const MachineInstr &MIX, + const MachineInstr &MIY, bool IsVOPD3) { namespace VOPD = AMDGPU::VOPD; - const MachineFunction *MF = FirstMI.getMF(); + const MachineFunction *MF = MIX.getMF(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); if (IsVOPD3 && !ST.hasVOPD3()) return false; - if (!IsVOPD3 && (TII.isVOP3(FirstMI) || TII.isVOP3(SecondMI))) + if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY))) return false; - if (TII.isDPP(FirstMI) || TII.isDPP(SecondMI)) + if (TII.isDPP(MIX) || TII.isDPP(MIY)) return false; const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo()); @@ -61,32 +61,24 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, UniqueLiterals.push_back(&Op); }; SmallVector<Register> UniqueScalarRegs; - assert([&]() -> bool { - for (auto MII = MachineBasicBlock::const_iterator(&FirstMI); - MII != FirstMI.getParent()->instr_end(); ++MII) { - if (&*MII == &SecondMI) - return true; - } - return false; - }() && "Expected FirstMI to precede SecondMI"); - // Cannot pair dependent instructions - for (const auto &Use : SecondMI.uses()) - if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), TRI)) + + // MIX must not modify any registers used by MIY. 
+ for (const auto &Use : MIY.uses()) + if (Use.isReg() && MIX.modifiesRegister(Use.getReg(), TRI)) return false; auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) { - const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? FirstMI : SecondMI; + const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY; const MachineOperand &Operand = MI.getOperand(OperandIdx); if (Operand.isReg() && TRI->isVectorRegister(MRI, Operand.getReg())) return Operand.getReg(); return Register(); }; - auto InstInfo = - AMDGPU::getVOPDInstInfo(FirstMI.getDesc(), SecondMI.getDesc()); + auto InstInfo = AMDGPU::getVOPDInstInfo(MIX.getDesc(), MIY.getDesc()); for (auto CompIdx : VOPD::COMPONENTS) { - const MachineInstr &MI = (CompIdx == VOPD::X) ? FirstMI : SecondMI; + const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY; const MachineOperand &Src0 = *TII.getNamedOperand(MI, AMDGPU::OpName::src0); if (Src0.isReg()) { @@ -153,8 +145,8 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 // source-cache. bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 && - FirstMI.getOpcode() == AMDGPU::V_MOV_B32_e32 && - SecondMI.getOpcode() == AMDGPU::V_MOV_B32_e32; + MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 && + MIY.getOpcode() == AMDGPU::V_MOV_B32_e32; bool AllowSameVGPR = ST.hasGFX1250Insts(); if (InstInfo.hasInvalidOperand(getVRegIdx, *TRI, SkipSrc, AllowSameVGPR, @@ -163,22 +155,23 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, if (IsVOPD3) { // BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero. - if (AMDGPU::hasNamedOperand(SecondMI.getOpcode(), AMDGPU::OpName::bitop3)) { + // MIX check is only relevant to scheduling? 
+ if (AMDGPU::hasNamedOperand(MIX.getOpcode(), AMDGPU::OpName::bitop3)) { const MachineOperand &Src2 = - *TII.getNamedOperand(SecondMI, AMDGPU::OpName::src2); + *TII.getNamedOperand(MIX, AMDGPU::OpName::src2); if (!Src2.isImm() || Src2.getImm()) return false; } - if (AMDGPU::hasNamedOperand(FirstMI.getOpcode(), AMDGPU::OpName::bitop3)) { + if (AMDGPU::hasNamedOperand(MIY.getOpcode(), AMDGPU::OpName::bitop3)) { const MachineOperand &Src2 = - *TII.getNamedOperand(FirstMI, AMDGPU::OpName::src2); + *TII.getNamedOperand(MIY, AMDGPU::OpName::src2); if (!Src2.isImm() || Src2.getImm()) return false; } } - LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI - << "\n\tY: " << SecondMI << "\n"); + LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX + << "\n\tY: " << MIY << "\n"); return true; } @@ -208,6 +201,15 @@ static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) return false; + assert([&]() -> bool { + for (auto MII = MachineBasicBlock::const_iterator(FirstMI); + MII != FirstMI->getParent()->instr_end(); ++MII) { + if (&*MII == &SecondMI) + return true; + } + return false; + }() && "Expected FirstMI to precede SecondMI"); + return checkVOPDRegConstraints(STII, *FirstMI, SecondMI, VOPD3); }; diff --git a/llvm/lib/Target/AMDGPU/InstCombineTables.td b/llvm/lib/Target/AMDGPU/InstCombineTables.td deleted file mode 100644 index 3d62641..0000000 --- a/llvm/lib/Target/AMDGPU/InstCombineTables.td +++ /dev/null @@ -1,10 +0,0 @@ -include "AMDGPU.td" - -def AMDGPUImageDMaskIntrinsicTable : GenericTable { - let FilterClass = "AMDGPUImageDMaskIntrinsic"; - let Fields = ["Intr"]; - - let PrimaryKey = ["Intr"]; - let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic"; - let PrimaryKeyEarlyOut = 1; -} diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index afaa190..9ec1213 100644 --- 
a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -322,13 +322,13 @@ bool AMDGPUCustomBehaviour::hasModifiersSet( } // taken from SIInstrInfo::isGWS() -bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const { +bool AMDGPUCustomBehaviour::isGWS(uint32_t Opcode) const { const MCInstrDesc &MCID = MCII.get(Opcode); return MCID.TSFlags & SIInstrFlags::GWS; } // taken from SIInstrInfo::isAlwaysGDS() -bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const { +bool AMDGPUCustomBehaviour::isAlwaysGDS(uint32_t Opcode) const { return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_ADD_GS_REG_RTN || Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index cbc7427..aeb5c03 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -32,7 +32,7 @@ public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~AMDGPUInstrPostProcess() = default; + ~AMDGPUInstrPostProcess() override = default; void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; @@ -68,9 +68,9 @@ class AMDGPUCustomBehaviour : public CustomBehaviour { bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const; /// Helper function used in generateWaitCntInfo() - bool isGWS(uint16_t Opcode) const; + bool isGWS(uint32_t Opcode) const; /// Helper function used in generateWaitCntInfo() - bool isAlwaysGDS(uint16_t Opcode) const; + bool isAlwaysGDS(uint32_t Opcode) const; /// Helper function used in generateWaitCntInfo() bool isVMEM(const MCInstrDesc &MCID); /// This method gets called from checkCustomHazard when mca is attempting to @@ -88,7 +88,7 @@ public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, 
const MCInstrInfo &MCII); - ~AMDGPUCustomBehaviour() = default; + ~AMDGPUCustomBehaviour() override = default; /// This method is used to determine if an instruction /// should be allowed to be dispatched. The return value is /// how many cycles until the instruction can be dispatched. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 703ec0a..4aa4083 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -12,6 +12,7 @@ #include "SIDefines.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -336,7 +337,7 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, // \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or // \p Reg itself otherwise. -static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { +static MCRegister getRegForPrinting(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx < 0x100) @@ -355,10 +356,10 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { } // Restore MSBs of a VGPR above 255 from the MCInstrAnalysis. 
-static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo, - const MCInstrDesc &Desc, - const MCRegisterInfo &MRI, - const AMDGPUMCInstrAnalysis &MIA) { +static MCRegister getRegFromMIA(MCRegister Reg, unsigned OpNo, + const MCInstrDesc &Desc, + const MCRegisterInfo &MRI, + const AMDGPUMCInstrAnalysis &MIA) { unsigned VgprMSBs = MIA.getVgprMSBs(); if (!VgprMSBs) return Reg; @@ -403,10 +404,10 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, } #endif - unsigned PrintReg = getRegForPrinting(Reg, MRI); + MCRegister PrintReg = getRegForPrinting(Reg, MRI); O << getRegisterName(PrintReg); - if (PrintReg != Reg.id()) + if (PrintReg != Reg) O << " /*" << getRegisterName(Reg) << "*/"; } @@ -490,6 +491,18 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, printRegularOperand(MI, OpNo, STI, O); } +void AMDGPUInstPrinter::printAVLdSt32Align2RegOp(const MCInst *MI, + unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + MCRegister Reg = MI->getOperand(OpNo).getReg(); + + // On targets with an even alignment requirement + if (MCRegister SubReg = MRI.getSubReg(Reg, AMDGPU::sub0)) + Reg = SubReg; + printRegOperand(Reg, O, MRI); +} + void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -610,6 +623,25 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, printImmediateFP16(static_cast<uint16_t>(Imm), STI, O)) return; break; + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: { + if (AMDGPU::isGFX11Plus(STI)) { + // For GFX11+, the inline constant is duplicated to both channels, so we + // need to check if the low and high 16 bits are the same, and then if + // they can be printed as inline constant values. 
+ uint16_t Lo16 = static_cast<uint16_t>(Imm & 0xFFFF); + uint16_t Hi16 = static_cast<uint16_t>((Imm >> 16) & 0xFFFF); + if (Lo16 == Hi16 && + printImmediateFP16(static_cast<uint16_t>(Imm), STI, O)) + return; + } else { + // For pre-GFX11, the inline constant is in the low 16 bits, so we need + // to check if it can be printed as inline constant value. + if (isUInt<16>(Imm) && + printImmediateFP16(static_cast<uint16_t>(Imm), STI, O)) + return; + } + break; + } case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: if (isUInt<16>(Imm) && @@ -795,14 +827,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int16_t RCID = MII.getOpRegClassID( - OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); - if (RCID != -1) { + if (OpInfo.RegClass != -1) { + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { - O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) - << "\' register class*/"; + bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() && + (OpInfo.RegClass == AMDGPU::SReg_1 || + OpInfo.RegClass == AMDGPU::SReg_1_XEXEC); + // Suppress this comment for a mismatched wavesize. Some users expect to + // be able to assemble and disassemble modules with mixed wavesizes, but + // we do not know the subtarget in different functions in MC. + // + // TODO: Should probably print it anyway, maybe a more specific version. 
+ if (!IsWaveSizeOp) { + O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) + << "\' register class*/"; + } } } } else if (Op.isImm()) { @@ -844,6 +886,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: @@ -1331,12 +1374,9 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, return; O << Name; - for (int I = 0; I < NumOps; ++I) { - if (I != 0) - O << ','; - - O << !!(Ops[I] & Mod); - } + ListSeparator Sep(","); + for (int I = 0; I < NumOps; ++I) + O << Sep << !!(Ops[I] & Mod); if (HasDstSel) { O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL); @@ -1428,26 +1468,10 @@ void AMDGPUInstPrinter::printMatrixFMT(const MCInst *MI, unsigned OpNo, return; O << " matrix_" << AorB << "_fmt:"; - switch (Imm) { - default: + if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixFmt))) + O << WMMAMods::ModMatrixFmt[Imm]; + else O << Imm; - break; - case WMMA::MatrixFMT::MATRIX_FMT_FP8: - O << "MATRIX_FMT_FP8"; - break; - case WMMA::MatrixFMT::MATRIX_FMT_BF8: - O << "MATRIX_FMT_BF8"; - break; - case WMMA::MatrixFMT::MATRIX_FMT_FP6: - O << "MATRIX_FMT_FP6"; - break; - case WMMA::MatrixFMT::MATRIX_FMT_BF6: - O << "MATRIX_FMT_BF6"; - break; - case WMMA::MatrixFMT::MATRIX_FMT_FP4: - O << "MATRIX_FMT_FP4"; - break; - } } void AMDGPUInstPrinter::printMatrixAFMT(const MCInst *MI, unsigned OpNo, @@ -1470,17 +1494,10 @@ void AMDGPUInstPrinter::printMatrixScale(const MCInst *MI, unsigned OpNo, return; O << " matrix_" << AorB << "_scale:"; - switch (Imm) { - default: + if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixScale))) + O << WMMAMods::ModMatrixScale[Imm]; + else O << Imm; - break; - case WMMA::MatrixScale::MATRIX_SCALE_ROW0: - 
O << "MATRIX_SCALE_ROW0"; - break; - case WMMA::MatrixScale::MATRIX_SCALE_ROW1: - O << "MATRIX_SCALE_ROW1"; - break; - } } void AMDGPUInstPrinter::printMatrixAScale(const MCInst *MI, unsigned OpNo, @@ -1503,20 +1520,10 @@ void AMDGPUInstPrinter::printMatrixScaleFmt(const MCInst *MI, unsigned OpNo, return; O << " matrix_" << AorB << "_scale_fmt:"; - switch (Imm) { - default: + if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixScaleFmt))) + O << WMMAMods::ModMatrixScaleFmt[Imm]; + else O << Imm; - break; - case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E8: - O << "MATRIX_SCALE_FMT_E8"; - break; - case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E5M3: - O << "MATRIX_SCALE_FMT_E5M3"; - break; - case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E4M3: - O << "MATRIX_SCALE_FMT_E4M3"; - break; - } } void AMDGPUInstPrinter::printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo, @@ -1574,14 +1581,10 @@ void AMDGPUInstPrinter::printGPRIdxMode(const MCInst *MI, unsigned OpNo, O << formatHex(static_cast<uint64_t>(Val)); } else { O << "gpr_idx("; - bool NeedComma = false; + ListSeparator Sep(","); for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) { - if (Val & (1 << ModeId)) { - if (NeedComma) - O << ','; - O << IdSymbolic[ModeId]; - NeedComma = true; - } + if (Val & (1 << ModeId)) + O << Sep << IdSymbolic[ModeId]; } O << ')'; } @@ -1658,6 +1661,19 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printWaitEvent(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::WaitEvent; + const uint16_t Imm16 = static_cast<uint16_t>(MI->getOperand(OpNo).getImm()); + + StringRef EventName = getWaitEventMaskName(Imm16, STI); + if (EventName.empty()) + O << formatHex(static_cast<uint64_t>(Imm16)); + else + O << EventName; +} + static void printSwizzleBitmask(const uint16_t AndMask, const uint16_t OrMask, const uint16_t XorMask, @@ -1788,25 +1804,16 @@ void 
AMDGPUInstPrinter::printSWaitCnt(const MCInst *MI, unsigned OpNo, bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA); bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt; - bool NeedSpace = false; + ListSeparator Sep(" "); - if (!IsDefaultVmcnt || PrintAll) { - O << "vmcnt(" << Vmcnt << ')'; - NeedSpace = true; - } + if (!IsDefaultVmcnt || PrintAll) + O << Sep << "vmcnt(" << Vmcnt << ')'; - if (!IsDefaultExpcnt || PrintAll) { - if (NeedSpace) - O << ' '; - O << "expcnt(" << Expcnt << ')'; - NeedSpace = true; - } + if (!IsDefaultExpcnt || PrintAll) + O << Sep << "expcnt(" << Expcnt << ')'; - if (!IsDefaultLgkmcnt || PrintAll) { - if (NeedSpace) - O << ' '; - O << "lgkmcnt(" << Lgkmcnt << ')'; - } + if (!IsDefaultLgkmcnt || PrintAll) + O << Sep << "lgkmcnt(" << Lgkmcnt << ')'; } void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, @@ -1822,14 +1829,10 @@ void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, StringRef Name; unsigned Val; bool IsDefault; - bool NeedSpace = false; + ListSeparator Sep(" "); while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) { - if (!IsDefault || !HasNonDefaultVal) { - if (NeedSpace) - O << ' '; - O << Name << '(' << Val << ')'; - NeedSpace = true; - } + if (!IsDefault || !HasNonDefaultVal) + O << Sep << Name << '(' << Val << ')'; } } else { O << formatHex(Imm16); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index b27295e..5e9ebc6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -77,6 +77,9 @@ private: raw_ostream &O); void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printAVLdSt32Align2RegOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); 
void printImmediateBF16(uint32_t Imm, const MCSubtargetInfo &STI, @@ -232,6 +235,8 @@ protected: raw_ostream &O); void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printWaitEvent(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSWaitCnt(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index ea758bb..029d2ea 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -95,6 +95,13 @@ private: void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, APInt &Inst, APInt &Scratch, const MCSubtargetInfo &STI) const; + + template <bool HasSrc0, bool HasSrc1, bool HasSrc2> + APInt postEncodeVOP3(const MCInst &MI, APInt EncodedValue, + const MCSubtargetInfo &STI) const; + + APInt postEncodeVOPCX(const MCInst &MI, APInt EncodedValue, + const MCSubtargetInfo &STI) const; }; } // end anonymous namespace @@ -343,6 +350,14 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm)) .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: + // V_PK_FMAC_F16 has different inline constant behavior on pre-GFX11 vs + // GFX11+: pre-GFX11 produces (f16, 0), GFX11+ duplicates f16 to both + // halves. 
+ return AMDGPU::getPKFMACF16InlineEncoding(static_cast<uint32_t>(Imm), + AMDGPU::isGFX11Plus(STI)) + .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm)) @@ -374,11 +389,6 @@ uint64_t AMDGPUMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2; } -static bool isVCMPX64(const MCInstrDesc &Desc) { - return (Desc.TSFlags & SIInstrFlags::VOP3) && - Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC); -} - void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB, SmallVectorImpl<MCFixup> &Fixups, @@ -403,18 +413,6 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, Encoding |= getImplicitOpSelHiEncoding(Opcode); } - // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC. - // Documentation requires dst to be encoded as EXEC (0x7E), - // but it looks like the actual value encoded for dst operand - // is ignored by HW. It was decided to define dst as "do not care" - // in td files to allow disassembler accept any dst value. - // However, dst is encoded as EXEC for compatibility with SP3. 
- if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) { - assert((Encoding & 0xFF) == 0); - Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) & - AMDGPU::HWEncoding::LO256_REG_IDX_MASK; - } - for (unsigned i = 0; i < bytes; i++) { CB.push_back((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i)); } @@ -733,4 +731,37 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon( llvm_unreachable("Encoding of this operand type is not supported yet."); } +template <bool HasSrc0, bool HasSrc1, bool HasSrc2> +APInt AMDGPUMCCodeEmitter::postEncodeVOP3(const MCInst &MI, APInt EncodedValue, + const MCSubtargetInfo &STI) const { + if (!AMDGPU::isGFX10Plus(STI)) + return EncodedValue; + // Set unused source fields in VOP3 encodings to inline immediate 0 to avoid + // hardware conservatively assuming the instruction reads SGPRs. + constexpr uint64_t InlineImmediate0 = 0x80; + if (!HasSrc0) + EncodedValue |= InlineImmediate0 << 32; + if (!HasSrc1) + EncodedValue |= InlineImmediate0 << 41; + if (!HasSrc2) + EncodedValue |= InlineImmediate0 << 50; + return EncodedValue; +} + +APInt AMDGPUMCCodeEmitter::postEncodeVOPCX(const MCInst &MI, APInt EncodedValue, + const MCSubtargetInfo &STI) const { + // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC. + // Documentation requires dst to be encoded as EXEC (0x7E), + // but it looks like the actual value encoded for dst operand + // is ignored by HW. It was decided to define dst as "do not care" + // in td files to allow disassembler accept any dst value. + // However, dst is encoded as EXEC for compatibility with SP3. 
+ [[maybe_unused]] const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + assert((Desc.TSFlags & SIInstrFlags::VOP3) && + Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC)); + EncodedValue |= MRI.getEncodingValue(AMDGPU::EXEC_LO) & + AMDGPU::HWEncoding::LO256_REG_IDX_MASK; + return postEncodeVOP3<true, true, false>(MI, EncodedValue, STI); +} + #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index c27be02..63437779 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCExpr.h" -#include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -317,30 +315,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } -/// Mimics GCNSubtarget::computeOccupancy for MCExpr. -/// -/// Remove dependency on GCNSubtarget and depend only only the necessary values -/// for said occupancy computation. Should match computeOccupancy implementation -/// without passing \p STM on. 
-const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy( - unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, - unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) { - unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); - unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); - unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); - unsigned Generation = STM.getGeneration(); - - auto CreateExpr = [&Ctx](unsigned Value) { - return MCConstantExpr::create(Value, Ctx); - }; - - return create(AGVK_Occupancy, - {CreateExpr(MaxWaves), CreateExpr(Granule), - CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation), - CreateExpr(InitOcc), NumSGPRs, NumVGPRs}, - Ctx); -} - const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, MCContext &Ctx) { assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); @@ -481,7 +455,7 @@ static void unaryOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, return; case MCUnaryExpr::Opcode::Minus: { KB.makeNegative(); - KBM[Expr] = KB; + KBM[Expr] = std::move(KB); return; } case MCUnaryExpr::Opcode::Not: { @@ -492,7 +466,7 @@ static void unaryOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, } case MCUnaryExpr::Opcode::Plus: { KB.makeNonNegative(); - KBM[Expr] = KB; + KBM[Expr] = std::move(KB); return; } } @@ -514,7 +488,7 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, knownBitsMapHelper(Arg, KBM, Depth + 1); KB |= KBM[Arg]; } - KBM[Expr] = KB; + KBM[Expr] = std::move(KB); return; } case AMDGPUMCExpr::VariantKind::AGVK_Max: { @@ -524,7 +498,7 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, knownBitsMapHelper(Arg, KBM, Depth + 1); KB = KnownBits::umax(KB, KBM[Arg]); } - KBM[Expr] = KB; + KBM[Expr] = std::move(KB); return; } case AMDGPUMCExpr::VariantKind::AGVK_ExtraSGPRs: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 54fcd2a..bf7b40b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -64,7 +64,7 @@ private: ArrayRef<const MCExpr *> Args; AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); - ~AMDGPUMCExpr(); + ~AMDGPUMCExpr() override; bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const; bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const; @@ -98,11 +98,6 @@ public: return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - static const AMDGPUMCExpr * - createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, - const GCNSubtarget &STM, MCContext &Ctx); - static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, MCContext &Ctx); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 013cfeb..28b4da8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) - VgprMSBs = Inst.getOperand(0).getImm(); + VgprMSBs = Inst.getOperand(0).getImm() & 0xff; else if (isTerminator(Inst)) VgprMSBs = 0; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 5a08573..86c5d1c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -114,10 +114,12 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = 
GK_GFX1151; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: AK = GK_GFX1152; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1153: AK = GK_GFX1153; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1170: AK = GK_GFX1170; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250: AK = GK_GFX1250; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1251: AK = GK_GFX1251; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1310: AK = GK_GFX1310; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC: AK = GK_GFX9_4_GENERIC; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: AK = GK_GFX10_1_GENERIC; break; @@ -201,10 +203,12 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151; case GK_GFX1152: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152; case GK_GFX1153: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1153; + case GK_GFX1170: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1170; case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200; case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201; case GK_GFX1250: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250; case GK_GFX1251: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1251; + case GK_GFX1310: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1310; case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC; case GK_GFX9_4_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC; case GK_GFX10_1_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC; @@ -302,9 +306,9 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( #undef PRINT_RES_INFO } -void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR, - const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) { +void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums( + const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, + const 
MCSymbol *MaxNamedBarrier) { #define PRINT_RES_INFO(ARG) \ OS << "\t.set "; \ ARG->print(OS, getContext().getAsmInfo()); \ @@ -315,6 +319,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR, PRINT_RES_INFO(MaxVGPR); PRINT_RES_INFO(MaxAGPR); PRINT_RES_INFO(MaxSGPR); + PRINT_RES_INFO(MaxNamedBarrier); #undef PRINT_RES_INFO } @@ -398,7 +403,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( EmitMCExpr(KD.kernarg_size); OS << '\n'; - if (isGFX1250(STI)) { + if (isGFX1250Plus(STI)) { PrintField(KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_SHIFT, amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT, @@ -512,7 +517,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( OS << '\n'; } - if (AMDGPU::isGFX1250(STI)) + if (isGFX1250Plus(STI)) PrintField(KD.compute_pgm_rsrc3, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 22afcde..3a0d8dc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -69,7 +69,8 @@ public: virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) {}; + const MCSymbol *MaxSGPR, + const MCSymbol *MaxNamedBarrier) {}; /// \returns True on success, false on failure. virtual bool EmitISAVersion() { return true; } @@ -149,7 +150,8 @@ public: const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override; void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) override; + const MCSymbol *MaxSGPR, + const MCSymbol *MaxNamedBarrier) override; /// \returns True on success, false on failure. 
bool EmitISAVersion() override; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 5f6d742..b023c96 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { } class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit 
enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> { let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; } class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10<mimgopc op, string 
opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, class MIMG_Atomic_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string 
opcode, } class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, - int num_addrs, string renamed, bit enableDisasm = 0> - : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs, + int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0> + : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX12", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, @@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterOperand data_rc, bit enableDasm = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - FPAtomic = isFP in { + FPAtomic = isFP, IsAtomicNoRet = noRtn in { let VAddrDwords = 1 in { let ssamp = 0 in { if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, 
data_rc, 1, renamed>; + def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>; } } let VAddrDwords = 2 in { let ssamp = 0 in { if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>; } if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>; + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>; } if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>; + def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>; } } let VAddrDwords = 3 in { let ssamp = 0 in { if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>; } if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>; + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, 
asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>; + def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>; } } let VAddrDwords = 4 in { let ssamp = 0 in { if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>; } if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>; + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, 
asm, data_rc, 4, renamed, enableDasm>; + def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>; } } } @@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } } -multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, - string renamed = ""> { // 64-bit atomics - let IsAtomicRet = 1 in { +multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { // 64-bit atomics + let IsAtomicRet = !not(noRtn) in { def "" : MIMGBaseOpcode { let Atomic = 1; let AtomicX2 = isCmpSwap; + let NoReturn = noRtn; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { @@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, // Other variants are reconstructed by disassembler using dmask and tfe. if !not(isCmpSwap) then { let VDataDwords = 1 in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>; } let VDataDwords = 2 in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>; + defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>; + defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>; + defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>; } } - } // End IsAtomicRet = 1 + } +} + +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit 
isFP = 0, + string renamed = ""> { + defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>; + defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>; } multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, @@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in { class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode; AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; @@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } +class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I> + : ImageDimIntrinsicInfo<I> { + MIMGBaseOpcode AtomicNoRetBaseOpcode = + !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN")); +} + def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", - "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", + let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", + "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", + "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; + string 
TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Intr"]; @@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex { let Key = ["BaseOpcode", "Dim"]; } -foreach intr = !listconcat(AMDGPUImageDimIntrinsics, - AMDGPUImageDimAtomicIntrinsics) in { +foreach intr = AMDGPUImageDimIntrinsics in { def : ImageDimIntrinsicInfo<intr>; } +foreach intr = AMDGPUImageDimAtomicIntrinsics in { + def : ImageDimAtomicIntrinsicInfo<intr>; +} + // L to LZ Optimization Mapping def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>; def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>; @@ -2057,12 +2076,12 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> : string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol"; } -let SubtargetPredicate = isGFX1250Plus in { +let SubtargetPredicate = isGFX125xOnly in { def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">; def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">; def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>; def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>; -} // End SubtargetPredicate = isGFX1250Plus. +} // End SubtargetPredicate = isGFX125xOnly. 
class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat < (node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)), @@ -2074,12 +2093,12 @@ class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat < (inst $vaddr0, $vaddr1, 0, $cpol) >; -let SubtargetPredicate = isGFX1250Plus in { +let SubtargetPredicate = isGFX125xOnly in { def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>; def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>; def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>; def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>; -} +} // End SubtargetPredicate = isGFX125xOnly. class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> : InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>, @@ -2097,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); + // Set VADDR4 to NULL + let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + // set to 0 based on SPG. 
- let vaddr4 = 0; let rsrc = 0; let vdata = 0; let d16 = 0; @@ -2109,7 +2130,7 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p } multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> { - let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in { + let AssemblerPredicate = isGFX125xOnly, DecoderNamespace = "GFX1250" in { foreach DSuffix = ["_D2", ""] in { defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix); def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>, diff --git a/llvm/lib/Target/AMDGPU/R600.td b/llvm/lib/Target/AMDGPU/R600.td index 9148edb..bdfaac9 100644 --- a/llvm/lib/Target/AMDGPU/R600.td +++ b/llvm/lib/Target/AMDGPU/R600.td @@ -8,15 +8,6 @@ include "llvm/Target/Target.td" -def R600InstrInfo : InstrInfo { - let guessInstructionProperties = 1; -} - -def R600 : Target { - let InstructionSet = R600InstrInfo; - let AllowRegisterRenaming = 1; -} - let Namespace = "R600" in { foreach Index = 0-15 in { @@ -27,6 +18,18 @@ include "R600RegisterInfo.td" } +defm : RemapAllTargetPseudoPointerOperands<R600_Addr>; + +def R600InstrInfo : InstrInfo { + let guessInstructionProperties = 1; +} + +def R600 : Target { + let InstructionSet = R600InstrInfo; + let AllowRegisterRenaming = 1; +} + + def NullALU : InstrItinClass; def ALU_NULL : FuncUnit; diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 81b142e..248d734 100644 --- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -68,7 +68,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { getLoopDepth() > 1) return true; - if (!ST->hasCFAluBug()) + if (!ST->hasCFALUBug()) return false; switch(Opcode) { diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..90c09fe 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ 
b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -13,6 +13,7 @@ #include "R600ISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUSelectionDAGInfo.h" #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" @@ -29,7 +30,8 @@ using namespace llvm; R600TargetLowering::R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI) - : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { + : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI), + Gen(STI.getGeneration()) { addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); @@ -1129,12 +1131,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? - SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); @@ -1481,6 +1480,9 @@ SDValue R600TargetLowering::LowerFormalArguments( MemVT = MemVT.getVectorElementType(); } + if (VT.isInteger() && !MemVT.isInteger()) + MemVT = MemVT.changeTypeToInteger(); + if (AMDGPU::isShader(CallConv)) { Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); @@ -1497,11 +1499,15 @@ SDValue R600TargetLowering::LowerFormalArguments( // thread group and global sizes. 
ISD::LoadExtType Ext = ISD::NON_EXTLOAD; if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { - // FIXME: This should really check the extload type, but the handling of - // extload vector parameters seems to be broken. + if (VT.isFloatingPoint()) { + Ext = ISD::EXTLOAD; + } else { + // FIXME: This should really check the extload type, but the handling of + // extload vector parameters seems to be broken. - // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; - Ext = ISD::SEXTLOAD; + // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + Ext = ISD::SEXTLOAD; + } } // Compute the offset from the value. @@ -2179,18 +2185,20 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, } TargetLowering::AtomicExpansionKind -R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { +R600TargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const { switch (RMW->getOperation()) { case AtomicRMWInst::Nand: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: case AtomicRMWInst::FMax: case AtomicRMWInst::FMin: + case AtomicRMWInst::USubCond: + case AtomicRMWInst::USubSat: return AtomicExpansionKind::CmpXChg; case AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: // FIXME: Cayman at least appears to have instructions for this, but the - // instruction defintions appear to be missing. + // instruction definitions appear to be missing. 
return AtomicExpansionKind::CmpXChg; case AtomicRMWInst::Xchg: { const DataLayout &DL = RMW->getFunction()->getDataLayout(); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index fc361c01..661efb8 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -116,7 +116,7 @@ private: SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; TargetLowering::AtomicExpansionKind - shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override; + shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const override; }; } // End namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 3e256cc..7f805e6 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; @@ -176,7 +176,7 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { } bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); return !AMDGPU::isCompute(MF->getFunction().getCallingConv()) && usesVertexCache(MI.getOpcode()); } @@ -186,7 +186,7 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { } bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); return (AMDGPU::isCompute(MF->getFunction().getCallingConv()) && usesVertexCache(MI.getOpcode())) || usesTextureCache(MI.getOpcode()); @@ -948,7 +948,7 @@ bool 
R600InstrInfo::PredicateInstruction(MachineInstr &MI, .setReg(Pred[2].getReg()); MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W)) .setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getMF(), MI); MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -956,7 +956,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, if (PIdx != -1) { MachineOperand &PMO = MI.getOperand(PIdx); PMO.setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getMF(), MI); MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h index 68bbac1..b96c17e 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -326,7 +326,7 @@ public: namespace R600 { -int getLDSNoRetOp(uint16_t Opcode); +int64_t getLDSNoRetOp(uint32_t Opcode); } //End namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index dda0cf6..6d7cc8b 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -736,22 +736,22 @@ def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; // XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. 
def SETE : R600_2OP < 0x08, "SETE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OEQ))] >; def SGT : R600_2OP < 0x09, "SETGT", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OGT))] >; def SGE : R600_2OP < 0xA, "SETGE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OGE))] >; def SNE : R600_2OP < 0xB, "SETNE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_UNE_NE))] >; def SETE_DX10 : R600_2OP < @@ -1004,19 +1004,19 @@ class FMA_Common <bits<5> inst> : R600_3OP < class CNDE_Common <bits<5> inst> : R600_3OP < inst, "CNDE", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] + [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OEQ))] >; class CNDGT_Common <bits<5> inst> : R600_3OP < inst, "CNDGT", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))] + [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OGT))] > { let Itinerary = VecALU; } class CNDGE_Common <bits<5> inst> : R600_3OP < inst, "CNDGE", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))] + [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OGE))] > { let Itinerary = VecALU; } diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp index 48b4e7f..ac6508c 100644 --- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp @@ -55,7 +55,7 @@ void R600AsmPrinter::emitInstruction(const MachineInstr *MI) { StringRef Err; if 
(!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("Illegal instruction detected: " + Err); MI->print(errs()); } diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index d9902e1..56d1a19 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -464,7 +464,7 @@ void R600MachineCFGStructurizer::insertCondBranchBefore( MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); MBB->insert(I, NewMI); MachineInstrBuilder MIB(*MF, NewMI); - MIB.addReg(OldMI->getOperand(1).getReg(), false); + MIB.addReg(OldMI->getOperand(1).getReg()); SHOWNEWINSTR(NewMI); //erase later oldInstr->eraseFromParent(); } @@ -476,7 +476,7 @@ void R600MachineCFGStructurizer::insertCondBranchBefore( MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); //insert before blk->insert(I, NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum); SHOWNEWINSTR(NewInstr); } @@ -1401,7 +1401,7 @@ void R600MachineCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingM << LandMBB->getNumber() << "\n";); MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); assert(BranchMI && isCondBranch(BranchMI)); - DebugLoc DL = BranchMI->getDebugLoc(); + const DebugLoc &DL = BranchMI->getDebugLoc(); MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); MachineBasicBlock::iterator I = BranchMI; if (TrueBranch != LandMBB) @@ -1427,7 +1427,7 @@ void R600MachineCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingM MachineBasicBlock::iterator I = MI; MachineBasicBlock *TrueBranch = getTrueBranch(MI); int OldOpcode = MI->getOpcode(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = 
MI->getDebugLoc(); bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 7f75f27..9e1a97e 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -181,7 +181,7 @@ MachineInstr *R600VectorRegMerger::RebuildVector( Register Reg = RSI->Instr->getOperand(0).getReg(); MachineBasicBlock::iterator Pos = RSI->Instr; MachineBasicBlock &MBB = *Pos->getParent(); - DebugLoc DL = Pos->getDebugLoc(); + const DebugLoc &DL = Pos->getDebugLoc(); Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; @@ -222,8 +222,8 @@ MachineInstr *R600VectorRegMerger::RebuildVector( // Update RSI RSI->Instr = NewMI; - RSI->RegToChan = UpdatedRegToChan; - RSI->UndefReg = UpdatedUndef; + RSI->RegToChan = std::move(UpdatedRegToChan); + RSI->UndefReg = std::move(UpdatedUndef); return NewMI; } diff --git a/llvm/lib/Target/AMDGPU/R600Processors.td b/llvm/lib/Target/AMDGPU/R600Processors.td index 0265a97..dc21eb9 100644 --- a/llvm/lib/Target/AMDGPU/R600Processors.td +++ b/llvm/lib/Target/AMDGPU/R600Processors.td @@ -14,7 +14,7 @@ class SubtargetFeatureFetchLimit <string Value> : >; def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", + "HasR600ALUInst", "false", "Older version of ALU instructions encoding" >; @@ -29,37 +29,43 @@ def FeatureVertexCache : SubtargetFeature<"HasVertexCache", >; def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", + "HasCaymanISA", "true", "Use Cayman ISA" >; def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", + "HasCFALUBug", "true", "GPU has CF_ALU bug" >; +def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts", + "HasMadMacF32Insts", + "true", + "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions" +>; + class 
R600SubtargetFeatureGeneration <string Value, string FeatureName, list<SubtargetFeature> Implies> : SubtargetFeatureGeneration <Value, FeatureName, "R600Subtarget", Implies>; def FeatureR600 : R600SubtargetFeatureGeneration<"R600", "r600", - [FeatureR600ALUInst, FeatureFetchLimit8] + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureMadMacF32Insts] >; def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700", - [FeatureFetchLimit16] + [FeatureFetchLimit16, FeatureMadMacF32Insts] >; def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen", - [FeatureFetchLimit16, FeatureAddressableLocalMemorySize32768] + [FeatureFetchLimit16, FeatureAddressableLocalMemorySize32768, FeatureMadMacF32Insts] >; def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS", "northern-islands", [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureAddressableLocalMemorySize32768] + FeatureAddressableLocalMemorySize32768, FeatureMadMacF32Insts] >; diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h index 22e56b6..71398ce 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.h +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h @@ -27,15 +27,14 @@ namespace llvm { class R600Subtarget final : public R600GenSubtargetInfo, public AMDGPUSubtarget { + +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "R600GenSubtargetInfo.inc" + private: R600InstrInfo InstrInfo; R600FrameLowering FrameLowering; - bool FMA = false; - bool CaymanISA = false; - bool CFALUBug = false; - bool HasVertexCache = false; - bool R600ALUInst = false; - bool FP64 = false; short TexVTXClauseSize = 0; Generation Gen = R600; R600TargetLowering TLInfo; @@ -102,9 +101,7 @@ public: return (getGeneration() >= EVERGREEN); } - bool hasCaymanISA() const { - return CaymanISA; - } + bool hasCaymanISA() const { return HasCaymanISA; } bool hasFFBL() const { return (getGeneration() >= EVERGREEN); @@ -114,9 +111,15 @@ 
public: return (getGeneration() >= EVERGREEN); } - bool hasFMA() const { return FMA; } + bool hasFMA() const override { return HasFMA; } + + bool hasMadMacF32Insts() const override { return HasMadMacF32Insts; } + + bool enablePromoteAlloca() const override { return EnablePromoteAlloca; } + + bool hasFP64() const override { return HasFP64; } - bool hasCFAluBug() const { return CFALUBug; } + bool hasCFALUBug() const { return HasCFALUBug; } bool hasVertexCache() const { return HasVertexCache; } diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index c20487e..4771967 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -57,9 +57,9 @@ public: R600CodeGenPassBuilder(R600TargetMachine &TM, const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC); - void addPreISel(AddIRPass &addPass) const; - void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; - Error addInstSelector(AddMachinePass &) const; + void addPreISel(PassManagerWrapper &PMW) const; + void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const; + Error addInstSelector(PassManagerWrapper &PMW) const; }; //===----------------------------------------------------------------------===// @@ -188,16 +188,16 @@ R600CodeGenPassBuilder::R600CodeGenPassBuilder( Opt.RequiresCodeGenSCCOrder = true; } -void R600CodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { +void R600CodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const { // TODO: Add passes pre instruction selection. } -void R600CodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass, +void R600CodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const { // TODO: Add AsmPrinter. } -Error R600CodeGenPassBuilder::addInstSelector(AddMachinePass &) const { +Error R600CodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const { // TODO: Add instruction selector. 
return Error::success(); } diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp index 3093227..c08edc1 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp @@ -108,19 +108,17 @@ InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode, } } -InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - TTI::TargetCostKind CostKind, - unsigned Index, - const Value *Op0, - const Value *Op1) const { +InstructionCost R600TTIImpl::getVectorInstrCost( + unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, + const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { unsigned EltSize = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); if (EltSize < 32) { - return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, - Op1); + return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1, + VIC); } // Extracts are just reads of a subregister, so are free. Inserts are @@ -131,7 +129,8 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, return Index == ~0u ? 
2 : 0; } default: - return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1); + return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1, + VIC); } } diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h index 3deae69..ade1b15 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h @@ -62,10 +62,11 @@ public: InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) const override; using BaseT::getVectorInstrCost; - InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, - TTI::TargetCostKind CostKind, - unsigned Index, const Value *Op0, - const Value *Op1) const override; + InstructionCost + getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, + unsigned Index, const Value *Op0, const Value *Op1, + TTI::VectorInstrContext VIC = + TTI::VectorInstrContext::None) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index ecc2824..0c7c642 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -46,6 +46,7 @@ enum { GFX11 = 10, GFX12 = 11, GFX1250 = 12, + GFX13 = 13, }; } @@ -207,6 +208,7 @@ enum OperandType : unsigned { OPERAND_REG_IMM_FP16, OPERAND_REG_IMM_V2BF16, OPERAND_REG_IMM_V2FP16, + OPERAND_REG_IMM_V2FP16_SPLAT, OPERAND_REG_IMM_V2INT16, OPERAND_REG_IMM_NOINLINE_V2FP16, OPERAND_REG_IMM_V2INT32, @@ -423,6 +425,9 @@ enum CPol { // Volatile (used to preserve/signal operation volatility for buffer // operations not a real instruction bit) VOLATILE = 1 << 31, + // The set of "cache policy" bits used for compiler features that + // do not correspond to hardware features. + VIRTUAL_BITS = VOLATILE, }; } // namespace CPol @@ -445,7 +450,6 @@ enum Id { // Message ID, width(4) [3:0]. 
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 - ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250 ID_GET_DDID = 11, // added in GFX10, removed in GFX11 ID_SYSMSG = 15, @@ -459,6 +463,7 @@ enum Id { // Message ID, width(4) [3:0]. ID_RTN_GET_SE_AID_ID = 135, ID_RTN_GET_CLUSTER_BARRIER_STATE = 136, // added in GFX1250 + ID_RTN_SAVE_WAVE_HAS_TDM = 152, // added in GFX1250 ID_MASK_PreGFX11_ = 0xF, ID_MASK_GFX11Plus_ = 0xFF @@ -496,6 +501,14 @@ enum StreamId : unsigned { // Stream ID, (2) [9:8]. } // namespace SendMsg +namespace WaitEvent { // Encoding of SIMM16 used in s_wait_event +enum Id { + DONT_WAIT_EXPORT_READY = 1 << 0, // Only used in gfx11 + EXPORT_READY = 1 << 1, // gfx12+ +}; + +} // namespace WaitEvent + namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns. enum Id { // HwRegCode, (6) [5:0] @@ -520,6 +533,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID1 = 23, ID_HW_ID2 = 24, ID_POPS_PACKER = 25, + ID_SCHED_MODE = 26, ID_PERF_SNAPSHOT_DATA_gfx11 = 27, ID_IB_STS2 = 28, ID_SHADER_CYCLES = 29, @@ -578,11 +592,11 @@ enum ModeRegisterMasks : uint32_t { CSP_MASK = 0x7u << 29, // Bits 29..31 // GFX1250 - DST_VGPR_MSB = 1 << 12, - SRC0_VGPR_MSB = 1 << 13, - SRC1_VGPR_MSB = 1 << 14, - SRC2_VGPR_MSB = 1 << 15, - VGPR_MSB_MASK = 0xf << 12, // Bits 12..15 + DST_VGPR_MSB = 0x3 << 12, + SRC0_VGPR_MSB = 0x3 << 14, + SRC1_VGPR_MSB = 0x3 << 16, + SRC2_VGPR_MSB = 0x3 << 18, + VGPR_MSB_MASK = 0xff << 12, // Bits 12..19 REPLAY_MODE = 1 << 25, FLAT_SCRATCH_IS_NV = 1 << 26, diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 7793907..8782fc5 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -111,7 +111,7 @@ public: V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; #if !defined(NDEBUG) || 
defined(LLVM_ENABLE_DUMP) - void dump() { + void dump() const { dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty << "\nScore: " << Score << "\n"; @@ -238,7 +238,7 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, const SIRegisterInfo *TRI, const SIInstrInfo *TII) { - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); auto &Src = MI.getOperand(1); Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = Src.getReg(); @@ -856,8 +856,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { } } - if (TRI->isVectorRegister(*MRI, PHIRes) || - RC0 == &AMDGPU::VReg_1RegClass) { + if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) || + RC0 == &AMDGPU::VReg_1RegClass) { LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); TII->legalizeOperands(MI, MDT); } @@ -902,14 +902,28 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, // really much we can do to fix this. // Some special instructions use M0 as an input. Some even only use // the first lane. Insert a readfirstlane and hope for the best. 
- if (DstReg == AMDGPU::M0 && - TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) { + const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); + if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) + + const MCInstrDesc &ReadFirstLaneDesc = + TII->get(AMDGPU::V_READFIRSTLANE_B32); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), ReadFirstLaneDesc, TmpReg) .add(MI.getOperand(1)); + + unsigned SubReg = MI.getOperand(1).getSubReg(); MI.getOperand(1).setReg(TmpReg); + MI.getOperand(1).setSubReg(AMDGPU::NoSubRegister); + + const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1); + const TargetRegisterClass *ConstrainRC = + SubReg == AMDGPU::NoSubRegister + ? OpRC + : TRI->getMatchingSuperRegClass(SrcRC, OpRC, SubReg); + + if (!MRI->constrainRegClass(SrcReg, ConstrainRC)) + llvm_unreachable("failed to constrain register"); } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI, MI.getDebugLoc())) { I = std::next(I); @@ -930,7 +944,7 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, // s_mov_b32. 
if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) { MI.getOperand(1).ChangeToImmediate(Imm); - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + MI.addImplicitDefUseOperands(*MI.getMF()); MI.setDesc(TII->get(SMovOp)); return true; } @@ -999,7 +1013,7 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { AnalysisWorklist.push_back(U); } } - V2SCopies[Info.ID] = Info; + V2SCopies[Info.ID] = std::move(Info); } // The main function that computes the VGPR to SGPR copy score @@ -1058,7 +1072,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { unsigned CurID = LoweringWorklist.pop_back_val(); auto *CurInfoIt = V2SCopies.find(CurID); if (CurInfoIt != V2SCopies.end()) { - V2SCopyInfo C = CurInfoIt->second; + const V2SCopyInfo &C = CurInfoIt->second; LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump()); for (auto S : C.Siblings) { auto *SibInfoIt = V2SCopies.find(S); @@ -1075,10 +1089,10 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { } LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy << " is being turned to VALU\n"); + Copies.insert(C.Copy); // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if // instead. 
V2SCopies.erase(C.ID); - Copies.insert(C.Copy); } } @@ -1115,16 +1129,27 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32) - .addReg(SrcReg, 0, SubReg) + .addReg(SrcReg, {}, SubReg) .addImm(AMDGPU::lo16) .addReg(Undef) .addImm(AMDGPU::hi16); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) .addReg(VReg32); } else if (SrcSize == 32) { - auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); - MIB.addReg(SrcReg, 0, SubReg); + const MCInstrDesc &ReadFirstLaneDesc = + TII->get(AMDGPU::V_READFIRSTLANE_B32); + const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1); + BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg) + .addReg(SrcReg, {}, SubReg); + + const TargetRegisterClass *ConstrainRC = + SubReg == AMDGPU::NoSubRegister + ? 
OpRC + : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC, + SubReg); + + if (!MRI->constrainRegClass(SrcReg, ConstrainRC)) + llvm_unreachable("failed to constrain register"); } else { auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), DstReg); diff --git a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp index d0d6792..b368e20 100644 --- a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -27,9 +27,7 @@ class SIFixVGPRCopiesLegacy : public MachineFunctionPass { public: static char ID; - SIFixVGPRCopiesLegacy() : MachineFunctionPass(ID) { - initializeSIFixVGPRCopiesLegacyPass(*PassRegistry::getPassRegistry()); - } + SIFixVGPRCopiesLegacy() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 6616b30..a2fe31b 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -187,7 +187,7 @@ public: unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const { switch (Opc) { case AMDGPU::S_ADD_I32: { - if (ST->hasAddNoCarry()) + if (ST->hasAddNoCarryInsts()) return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32; return UseVOP3 ? 
AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; } @@ -242,7 +242,6 @@ public: SmallVectorImpl<FoldCandidate> &FoldList, SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; - std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const; bool tryConstantFoldOp(MachineInstr *MI) const; bool tryFoldCndMask(MachineInstr &MI) const; bool tryFoldZeroHighBits(MachineInstr &MI) const; @@ -681,6 +680,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { return false; MI->setDesc(TII->get(NewMFMAOpc)); MI->untieRegOperand(0); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned I = 0; I < MI->getNumDefs(); ++I) + if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1) + MI->getOperand(I).setIsEarlyClobber(true); } // TODO: Should we try to avoid adding this to the candidate list? @@ -709,7 +712,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // Verify the register is compatible with the operand. if (const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { + TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) { const TargetRegisterClass *NewRC = TRI->getRegClassForReg(*MRI, New->getReg()); @@ -762,6 +765,29 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList, FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); } +// Returns true if the instruction is a packed F32 instruction and the +// corresponding scalar operand reads 32 bits and replicates the bits to both +// channels. +static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand( + const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) { + if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput()) + return false; + const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo]; + return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; +} + +// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or +// literal) and replicates the bits to both channels. 
Therefore, if the hi and +// lo are not same, we can't fold it. +static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand( + const FoldableDef &OpToFold) { + assert(OpToFold.isImm() && "Expected immediate operand"); + uint64_t ImmVal = OpToFold.getEffectiveImmVal().value(); + uint32_t Lo = Lo_32(ImmVal); + uint32_t Hi = Hi_32(ImmVal); + return Lo == Hi; +} + bool SIFoldOperandsImpl::tryAddToFoldList( SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, const FoldableDef &OpToFold) const { @@ -915,6 +941,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList( return true; } + // Special case for PK_F32 instructions if we are trying to fold an imm to + // src0 or src1. + if (OpToFold.isImm() && + isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) + return false; + appendFoldCandidate(FoldList, MI, OpNo, OpToFold); return true; } @@ -1129,40 +1162,14 @@ bool SIFoldOperandsImpl::tryToFoldACImm( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { + if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) + return false; appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; } - // TODO: Verify the following code handles subregisters correctly. - // TODO: Handle extract of global reference - if (UseOp.getSubReg()) - return false; - - if (!OpToFold.isReg()) - return false; - - Register UseReg = OpToFold.getReg(); - if (!UseReg.isVirtual()) - return false; - - // Maybe it is just a COPY of an immediate itself. - - // FIXME: Remove this handling. There is already special case folding of - // immediate into copy in foldOperand. 
This is looking for the def of the - // value the folding started from in the first place. - MachineInstr *Def = MRI->getVRegDef(UseReg); - if (Def && TII->isFoldableCopy(*Def)) { - MachineOperand &DefOp = Def->getOperand(1); - if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { - FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC, - OpToFold.DefSubReg); - appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm); - return true; - } - } - return false; } @@ -1309,10 +1316,11 @@ void SIFoldOperandsImpl::foldOperand( continue; const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1; - const TargetRegisterClass *MovSrcRC = - TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx])); - if (MovSrcRC) { + int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]); + if (RegClassID != -1) { + const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID); + if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); @@ -1351,7 +1359,7 @@ void SIFoldOperandsImpl::foldOperand( if (MovOp == AMDGPU::V_MOV_B16_t16_e64) { const auto &SrcOp = UseMI->getOperand(UseOpIdx); MachineOperand NewSrcOp(SrcOp); - MachineFunction *MF = UseMI->getParent()->getParent(); + MachineFunction *MF = UseMI->getMF(); UseMI->removeOperand(1); UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers UseMI->addOperand(NewSrcOp); // src0 @@ -1382,7 +1390,7 @@ void SIFoldOperandsImpl::foldOperand( // Remove this if 16-bit SGPRs (i.e. 
SGPR_LO16) are added to the // VS_16RegClass // - // Excerpt from AMDGPUGenRegisterInfo.inc + // Excerpt from AMDGPUGenRegisterInfoEnums.inc // NoSubRegister, //0 // hi16, // 1 // lo16, // 2 @@ -1437,6 +1445,7 @@ void SIFoldOperandsImpl::foldOperand( return; UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + UseMI->clearFlag(MachineInstr::NoConvergent); if (OpToFold.isImm()) { UseMI->getOperand(1).ChangeToImmediate( @@ -1468,6 +1477,7 @@ void SIFoldOperandsImpl::foldOperand( UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); UseMI->removeOperand(2); // Remove exec read (or src1 for readlane) + UseMI->clearFlag(MachineInstr::NoConvergent); return; } } @@ -1558,38 +1568,6 @@ static unsigned getMovOpc(bool IsScalar) { return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } -static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { - MI.setDesc(NewDesc); - - // Remove any leftover implicit operands from mutating the instruction. e.g. - // if we replace an s_and_b32 with a copy, we don't need the implicit scc def - // anymore. - const MCInstrDesc &Desc = MI.getDesc(); - unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + - Desc.implicit_defs().size(); - - for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) - MI.removeOperand(I); -} - -std::optional<int64_t> -SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const { - if (Op.isImm()) - return Op.getImm(); - - if (!Op.isReg() || !Op.getReg().isVirtual()) - return std::nullopt; - - const MachineInstr *Def = MRI->getVRegDef(Op.getReg()); - if (Def && Def->isMoveImmediate()) { - const MachineOperand &ImmSrc = Def->getOperand(1); - if (ImmSrc.isImm()) - return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg()); - } - - return std::nullopt; -} - // Try to simplify operations with a constant that may appear after instruction // selection. // TODO: See if a frame index with a fixed offset can fold. 
@@ -1604,13 +1582,14 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { return false; MachineOperand *Src0 = &MI->getOperand(Src0Idx); - std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0); + std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0); if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || Opc == AMDGPU::S_NOT_B32) && Src0Imm) { MI->getOperand(1).ChangeToImmediate(~*Src0Imm); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); return true; } @@ -1619,7 +1598,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { return false; MachineOperand *Src1 = &MI->getOperand(Src1Idx); - std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1); + std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1); if (!Src0Imm && !Src1Imm) return false; @@ -1638,7 +1617,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { // instruction. 
MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); + TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR))); return true; } @@ -1658,11 +1637,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = or x, 0 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); } else if (Src1Val == -1) { // y = or x, -1 => y = v_mov_b32 -1 MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); } else return false; @@ -1674,11 +1654,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 MI->removeOperand(Src0Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); } else if (Src1Val == -1) { // y = and x, -1 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); } else return false; @@ -1690,7 +1671,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = xor x, 0 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); return true; } } @@ -1708,11 +1689,11 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (!Src1->isIdenticalTo(*Src0)) { - std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1); + std::optional<int64_t> Src1Imm = 
TII->getImmOrMaterializedImm(*Src1); if (!Src1Imm) return false; - std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0); + std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0); if (!Src0Imm || *Src0Imm != *Src1Imm) return false; } @@ -1736,7 +1717,7 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const { MI.removeOperand(Src1ModIdx); if (Src0ModIdx != -1) MI.removeOperand(Src0ModIdx); - mutateCopyOp(MI, NewDesc); + TII->mutateAndCleanupImplicit(MI, NewDesc); LLVM_DEBUG(dbgs() << MI); return true; } @@ -1746,7 +1727,8 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const { MI.getOpcode() != AMDGPU::V_AND_B32_e32) return false; - std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1)); + std::optional<int64_t> Src0Imm = + TII->getImmOrMaterializedImm(MI.getOperand(1)); if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg()) return false; @@ -1804,7 +1786,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, if (CopiesToReplace.empty() && FoldList.empty()) return Changed; - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); // Make sure we add EXEC uses to any new v_mov instructions created. 
for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); @@ -2419,7 +2401,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx); if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; @@ -2435,7 +2417,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { } else { // This is a copy MachineInstr *SubDef = MRI->getVRegDef(Def->getReg()); SubDef->getOperand(1).setIsKill(false); - RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg()); + RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg()); } RS.addImm(SubIdx); } @@ -2759,7 +2741,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) { MachineInstr *VGPRCopy = BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(), TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR) - .addReg(Reg, /* flags */ 0, SubReg); + .addReg(Reg, /* flags */ {}, SubReg); // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs. Register TempAGPR = MRI->createVirtualRegister(ARC); @@ -2793,7 +2775,6 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) { // // FIXME: Also need to check strictfp bool IsIEEEMode = MFI->getMode().IEEE; - bool HasNSZ = MFI->hasNoSignedZerosFPMath(); bool Changed = false; for (MachineBasicBlock *MBB : depth_first(&MF)) { @@ -2832,8 +2813,7 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) { // TODO: Omod might be OK if there is NSZ only on the source // instruction, and not the omod multiply. 
- if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || - !tryFoldOMod(MI)) + if (IsIEEEMode || !MI.getFlag(MachineInstr::FmNsz) || !tryFoldOMod(MI)) Changed |= tryFoldClamp(MI); } diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 6b13b06..9820341 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -33,7 +33,7 @@ MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15), namespace { class SIFormMemoryClausesImpl { - using RegUse = DenseMap<unsigned, std::pair<unsigned, LaneBitmask>>; + using RegUse = DenseMap<unsigned, std::pair<RegState, LaneBitmask>>; bool canBundle(const MachineInstr &MI, const RegUse &Defs, const RegUse &Uses) const; @@ -61,9 +61,7 @@ class SIFormMemoryClausesLegacy : public MachineFunctionPass { public: static char ID; - SIFormMemoryClausesLegacy() : MachineFunctionPass(ID) { - initializeSIFormMemoryClausesLegacyPass(*PassRegistry::getPassRegistry()); - } + SIFormMemoryClausesLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -132,8 +130,8 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { return true; } -static unsigned getMopState(const MachineOperand &MO) { - unsigned S = 0; +static RegState getMopState(const MachineOperand &MO) { + RegState S = {}; if (MO.isImplicit()) S |= RegState::Implicit; if (MO.isDead()) @@ -234,7 +232,7 @@ void SIFormMemoryClausesImpl::collectRegUses(const MachineInstr &MI, : LaneBitmask::getAll(); RegUse &Map = MO.isDef() ? Defs : Uses; - unsigned State = getMopState(MO); + RegState State = getMopState(MO); auto [Loc, Inserted] = Map.try_emplace(Reg, State, Mask); if (!Inserted) { Loc->second.first |= State; @@ -349,7 +347,7 @@ bool SIFormMemoryClausesImpl::run(MachineFunction &MF) { continue; // Collect the register operands we should extend the live ranges of. 
- SmallVector<std::tuple<unsigned, unsigned>> KillOps; + SmallVector<std::tuple<RegState, unsigned>> KillOps; const LiveInterval &LI = LIS->getInterval(R.first); if (!LI.hasSubRanges()) { diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 0189e7b..a0952b3 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -139,8 +139,8 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, int64_t DwordOff = 0) { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR - : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); @@ -163,8 +163,8 @@ static void buildEpilogRestore(const GCNSubtarget &ST, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, int64_t DwordOff = 0) { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR - : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); @@ -591,7 +591,7 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( } static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { - return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); + return ST.hasFlatScratchEnabled() ? 
1 : ST.getWavefrontSize(); } void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, @@ -629,7 +629,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // This will return `Register()` in cases where there are no actual // uses of the SRSRC. Register ScratchRsrcReg; - if (!ST.enableFlatScratch()) + if (!ST.hasFlatScratchEnabled()) ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); // Make the selected register live throughout the function. @@ -724,14 +724,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, FrameInfo.getMaxAlign()); MFI->setScratchReservedForDynamicVGPRs(VGPRSize); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg) - .addImm(AMDGPU::Hwreg::HwregEncoding::encode( - AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2)); - // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute - // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set - // SCC, so we need to check for 0 manually. 
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize); + BuildMI(MBB, I, DL, TII->get(AMDGPU::GET_STACK_BASE), FPReg); if (requiresStackPointerReference(MF)) { Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); @@ -755,10 +748,10 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, bool NeedsFlatScratchInit = MFI->getUserSGPRInfo().hasFlatScratchInit() && (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || - (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + (!allStackObjectsAreDead(FrameInfo) && ST.hasFlatScratchEnabled())); if ((NeedsFlatScratchInit || ScratchRsrcReg) && - PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { + PreloadedScratchWaveOffsetReg && !ST.hasArchitectedFlatScratch()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } @@ -772,6 +765,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, PreloadedScratchRsrcReg, ScratchRsrcReg, ScratchWaveOffsetReg); } + + if (ST.hasWaitXcnt()) { + // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK + // replay. This aligns hardware behavior with the compiler's s_wait_xcnt + // insertion logic, which assumes multi-group mode by default. + unsigned RegEncoding = + AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(1) + .addImm(RegEncoding); + } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` @@ -1034,16 +1038,13 @@ void SIFrameLowering::emitCSRSpillStores( StoreWWMRegisters(WWMCalleeSavedRegs); if (FuncInfo->isWholeWaveFunction()) { - // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove - // it now. 
If we have already saved some WWM CSR registers, then the EXEC is - // already -1 and we don't need to do anything else. Otherwise, set EXEC to - // -1 here. + // If we have already saved some WWM CSR registers, then the EXEC is already + // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here. if (!ScratchExecCopy) buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, /*EnableInactiveLanes*/ true); else if (WWMCalleeSavedRegs.empty()) EnableAllLanes(); - TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); } else if (ScratchExecCopy) { // FIXME: Split block and make terminator. BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg) @@ -1340,6 +1341,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, "Needed to save BP but didn't save it anywhere"); assert((HasBP || !BPSaved) && "Saved BP but didn't need it"); + + if (FuncInfo->isWholeWaveFunction()) { + // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose. + TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); + } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -1831,9 +1837,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, const GCNSubtarget &ST, - std::vector<CalleeSavedInfo> &CSI, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) { + std::vector<CalleeSavedInfo> &CSI) { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &MFI = MF.getFrameInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1902,10 +1906,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, int FrameIdx = MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass), /*isSpillSlot=*/true); - if ((unsigned)FrameIdx < MinCSFrameIndex) - MinCSFrameIndex = FrameIdx; - if ((unsigned)FrameIdx > MaxCSFrameIndex) - MaxCSFrameIndex = FrameIdx; + MFI.setIsCalleeSavedObjectIndex(FrameIdx, true); CSIt->setFrameIdx(FrameIdx); 
CSIt->setReg(RegBlock); @@ -1915,8 +1916,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, bool SIFrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) const { + std::vector<CalleeSavedInfo> &CSI) const { if (CSI.empty()) return true; // Early exit if no callee saved registers are modified! @@ -1924,12 +1924,12 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots( bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR(); if (UseVGPRBlocks) - assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex); + assignSlotsUsingVGPRBlocks(MF, ST, CSI); - return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks; + return assignCalleeSavedSpillSlotsImpl(MF, TRI, CSI) || UseVGPRBlocks; } -bool SIFrameLowering::assignCalleeSavedSpillSlots( +bool SIFrameLowering::assignCalleeSavedSpillSlotsImpl( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const { if (CSI.empty()) @@ -1986,7 +1986,7 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( // TODO: We could try sorting the objects to find a hole in the first bytes // rather than allocating as close to possible. This could save a lot of space // on frames with alignment requirements. 
- if (ST.enableFlatScratch()) { + if (ST.hasFlatScratchEnabled()) { if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) return false; @@ -2168,7 +2168,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index a727729..4c1cf3c 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -49,11 +49,9 @@ public: const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const override; - bool assignCalleeSavedSpillSlots(MachineFunction &MF, - const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) const override; + bool assignCalleeSavedSpillSlotsImpl(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a2841c11..fe1d24f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPULaneMaskUtils.h" +#include "AMDGPUSelectionDAGInfo.h" #include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -34,6 +35,8 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include 
"llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" @@ -86,69 +89,78 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) { SITargetLowering::SITargetLowering(const TargetMachine &TM, const GCNSubtarget &STI) - : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) { + : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V32RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(32); + addRegisterClass(MVT::f32, V32RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - const SIRegisterInfo *TRI = STI.getRegisterInfo(); - const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + const TargetRegisterClass *V64RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(64); addRegisterClass(MVT::f64, V64RegClass); addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160)); addRegisterClass(MVT::v6i32, 
&AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); + addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288)); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); + addRegisterClass(MVT::v10f32, + TRI->getDefaultVectorSuperClassForBitWidth(320)); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); + addRegisterClass(MVT::v11f32, + TRI->getDefaultVectorSuperClassForBitWidth(352)); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); + addRegisterClass(MVT::v12f32, + TRI->getDefaultVectorSuperClassForBitWidth(384)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, + TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v8i64, 
&AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +192,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -217,9 +230,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE, ISD::SETCC}) { - // FIXME: The promoted to type shouldn't need to be explicit setOperationAction(Opc, MVT::bf16, Promote); - AddPromotedToType(Opc, MVT::bf16, MVT::f32); } setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand); @@ -263,6 +274,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom); + setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); @@ -298,7 +310,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand); - setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); + setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal); @@ -492,6 +504,9 @@ 
SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); + setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal); } else { setOperationAction(ISD::FSQRT, MVT::f16, Custom); } @@ -499,21 +514,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMadMacF32Insts()) setOperationAction(ISD::FMAD, MVT::f32, Legal); - if (!Subtarget->hasBFI()) - // fcopysign can be done in a single instruction with BFI. - setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand); - - if (!Subtarget->hasBCNT(32)) - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - - if (!Subtarget->hasBCNT(64)) - setOperationAction(ISD::CTPOP, MVT::i64, Expand); - - if (Subtarget->hasFFBH()) - setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); - - if (Subtarget->hasFFBL()) - setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); + setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); // We only really have 32-bit BFE instructions (and 16-bit on VI). // @@ -523,14 +525,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that // span the midpoint are probably relatively rare, so don't worry about them // for now. 
- if (Subtarget->hasBFE()) - setHasExtractBitsInsn(true); + setHasExtractBitsInsn(true); // Clamp modifier on add/sub if (Subtarget->hasIntClamp()) setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal); - if (Subtarget->hasAddNoCarry()) + if (Subtarget->hasAddNoCarryInsts()) setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32}, Legal); @@ -562,6 +563,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand); setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand); + setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i32, + Custom); + setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i16, + Custom); + setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i1, + Custom); + // Custom lower these because we can't specify a rule based on an illegal // source bf16. setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom); @@ -623,8 +631,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasBF16TransInsts()) setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom); - setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); - setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT}, + MVT::f16, Promote); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT}, + MVT::bf16, Promote); // F16 - VOP2 Actions. 
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16}, @@ -657,6 +669,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, break; case ISD::EXTRACT_SUBVECTOR: case ISD::CONCAT_VECTORS: + case ISD::FSIN: + case ISD::FCOS: setOperationAction(Op, VT, Custom); break; default: @@ -1016,6 +1030,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::SCALAR_TO_VECTOR, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND_INREG, + ISD::ANY_EXTEND, ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT, ISD::FCOPYSIGN}); @@ -1047,6 +1062,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::ATOMIC_LOAD_FMAX, ISD::ATOMIC_LOAD_UINC_WRAP, ISD::ATOMIC_LOAD_UDEC_WRAP, + ISD::ATOMIC_LOAD_USUB_COND, + ISD::ATOMIC_LOAD_USUB_SAT, ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN}); @@ -1109,12 +1126,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 16) { - if (Subtarget->has16BitInsts()) { - if (VT.isInteger()) - return MVT::v2i16; - return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16); - } - return VT.isInteger() ? MVT::i32 : MVT::f32; + return Subtarget->has16BitInsts() + ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2) + : MVT::i32; } if (Size < 16) @@ -1122,6 +1136,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; } + if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16) + return MVT::i32; + if (VT.getSizeInBits() > 32) return MVT::i32; @@ -1140,7 +1157,7 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, unsigned Size = ScalarVT.getSizeInBits(); // FIXME: Should probably promote 8-bit vectors to i16. 
- if (Size == 16 && Subtarget->has16BitInsts()) + if (Size == 16) return (NumElts + 1) / 2; if (Size <= 32) @@ -1164,16 +1181,13 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( // FIXME: We should fix the ABI to be the same on targets without 16-bit // support, but unless we can properly handle 3-vectors, it will be still be // inconsistent. - if (Size == 16 && Subtarget->has16BitInsts()) { - if (ScalarVT == MVT::bf16) { - RegisterVT = MVT::i32; - IntermediateVT = MVT::v2bf16; - } else { - RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; - IntermediateVT = RegisterVT; - } + if (Size == 16) { + MVT SimpleIntermediateVT = + MVT::getVectorVT(ScalarVT.getSimpleVT(), ElementCount::getFixed(2)); + IntermediateVT = SimpleIntermediateVT; + RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32; NumIntermediates = (NumElts + 1) / 2; - return NumIntermediates; + return (NumElts + 1) / 2; } if (Size == 32) { @@ -1279,57 +1293,61 @@ static unsigned getIntrMemWidth(unsigned IntrID) { case Intrinsic::amdgcn_global_store_async_from_lds_b32: case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b32: return 32; case Intrinsic::amdgcn_global_load_async_to_lds_b64: case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: case Intrinsic::amdgcn_global_store_async_from_lds_b64: case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b64: return 64; case Intrinsic::amdgcn_global_load_async_to_lds_b128: case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: case Intrinsic::amdgcn_global_store_async_from_lds_b128: case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: + case 
Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b128: return 128; default: llvm_unreachable("Unknown width"); } } -static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, - TargetLoweringBase::IntrinsicInfo &Info) { - Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2); +static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, + unsigned ArgIdx) { + Value *OrderingArg = CI.getArgOperand(ArgIdx); unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue(); switch (AtomicOrderingCABI(Ord)) { case AtomicOrderingCABI::acquire: - Info.order = AtomicOrdering::Acquire; + return AtomicOrdering::Acquire; break; case AtomicOrderingCABI::release: - Info.order = AtomicOrdering::Release; + return AtomicOrdering::Release; break; case AtomicOrderingCABI::seq_cst: - Info.order = AtomicOrdering::SequentiallyConsistent; + return AtomicOrdering::SequentiallyConsistent; break; default: - Info.order = AtomicOrdering::Monotonic; - break; + return AtomicOrdering::Monotonic; } +} - Info.flags = - (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore); - Info.flags |= MOCooperative; - +static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) { MDNode *ScopeMD = cast<MDNode>( - cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 
2 : 3))->getMetadata()); + cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata()); StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString(); - Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope); + return CI.getContext().getOrInsertSyncScopeID(Scope); } -bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &CI, +void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos, + const CallBase &CI, MachineFunction &MF, unsigned IntrID) const { + IntrinsicInfo Info; Info.flags = MachineMemOperand::MONone; if (CI.hasMetadata(LLVMContext::MD_invariant_load)) Info.flags |= MachineMemOperand::MOInvariant; @@ -1343,7 +1361,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Intrinsic::getFnAttributes(CI.getContext(), (Intrinsic::ID)IntrID); MemoryEffects ME = Attr.getMemoryEffects(); if (ME.doesNotAccessMemory()) - return false; + return; // TODO: Should images get their own address space? Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; @@ -1433,13 +1451,35 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } break; case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_buffer_load_async_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: - case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: { unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); + + // Entry 0: Load from buffer. + // Don't set an offset, since the pointer value always represents the + // base of the buffer. 
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); - Info.ptrVal = CI.getArgOperand(1); - return true; + Info.flags &= ~MachineMemOperand::MOStore; + Infos.push_back(Info); + + // Entry 1: Store to LDS. + // Instruction offset is applied, and an additional per-lane offset + // which we simulate using a larger memory type. + Info.memVT = EVT::getIntegerVT( + CI.getContext(), Width * 8 * Subtarget->getWavefrontSize()); + Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer + Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2)) + ->getZExtValue(); + Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS; + Info.flags &= ~MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOStore; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_raw_atomic_buffer_load: case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: @@ -1449,11 +1489,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(), std::numeric_limits<unsigned>::max()); Info.flags &= ~MachineMemOperand::MOStore; - return true; + Infos.push_back(Info); + return; } } } - return true; + Infos.push_back(Info); + return; } switch (IntrID) { @@ -1469,7 +1511,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (!Vol->isZero()) Info.flags |= MachineMemOperand::MOVolatile; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_add_gs_reg_rtn: case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { @@ -1478,7 +1521,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = nullptr; Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { @@ -1492,7 +1536,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (!Vol->isZero()) Info.flags |= 
MachineMemOperand::MOVolatile; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64: case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: { @@ -1505,16 +1550,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.size = 8; Info.align.reset(); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; - } - case Intrinsic::amdgcn_global_atomic_csub: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | - MachineMemOperand::MOVolatile; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh_intersect_ray: @@ -1530,14 +1567,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align.reset(); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_flat_atomic_fmin_num: - case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: { + case Intrinsic::amdgcn_flat_atomic_fmax_num: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1545,14 +1582,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable | MachineMemOperand::MOVolatile; - return true; + Infos.push_back(Info); + return; } - case Intrinsic::amdgcn_flat_load_monitor_b32: - case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_flat_load_monitor_b128: - case 
Intrinsic::amdgcn_global_load_monitor_b32: - case Intrinsic::amdgcn_global_load_monitor_b64: - case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_cluster_load_b32: case Intrinsic::amdgcn_cluster_load_b64: case Intrinsic::amdgcn_cluster_load_b128: @@ -1573,7 +1605,24 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = CI.getOperand(0); Info.align.reset(); Info.flags |= MachineMemOperand::MOLoad; - return true; + Infos.push_back(Info); + return; + } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad; + Info.order = parseAtomicOrderingCABIArg(CI, 1); + Info.ssid = parseSyncscopeMDArg(CI, 2); + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: @@ -1582,8 +1631,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info); - return true; + Info.flags = (MachineMemOperand::MOLoad | MOCooperative); + Info.order = parseAtomicOrderingCABIArg(CI, 1); + Info.ssid = parseSyncscopeMDArg(CI, 2); + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: @@ -1592,8 +1644,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); 
Info.ptrVal = CI.getArgOperand(0); Info.align.reset(); - getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info); - return true; + Info.flags = (MachineMemOperand::MOStore | MOCooperative); + Info.order = parseAtomicOrderingCABIArg(CI, 2); + Info.ssid = parseSyncscopeMDArg(CI, 3); + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: @@ -1618,7 +1673,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad; else Info.flags |= MachineMemOperand::MOStore; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_global_load_async_to_lds_b8: case Intrinsic::amdgcn_global_load_async_to_lds_b32: @@ -1628,30 +1684,68 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { + // Entry 0: Load from source (global/flat). Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); - Info.ptrVal = CI.getArgOperand(1); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Info.ptrVal = CI.getArgOperand(0); // Global pointer + Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue(); + Info.flags |= MachineMemOperand::MOLoad; + Infos.push_back(Info); + + // Entry 1: Store to LDS (same offset). + Info.flags &= ~MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOStore; + Info.ptrVal = CI.getArgOperand(1); // LDS pointer + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_global_store_async_from_lds_b8: case Intrinsic::amdgcn_global_store_async_from_lds_b32: case Intrinsic::amdgcn_global_store_async_from_lds_b64: case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + // Entry 0: Load from LDS. 
Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); - Info.ptrVal = CI.getArgOperand(0); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Info.ptrVal = CI.getArgOperand(1); // LDS pointer + Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue(); + Info.flags |= MachineMemOperand::MOLoad; + Infos.push_back(Info); + + // Entry 1: Store to global (same offset). + Info.flags &= ~MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOStore; + Info.ptrVal = CI.getArgOperand(0); // Global pointer + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_load_to_lds: - case Intrinsic::amdgcn_global_load_lds: { - Info.opc = ISD::INTRINSIC_VOID; + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: { unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); + auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1)); + bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE; + + // Entry 0: Load from source (global/flat). + Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); - Info.ptrVal = CI.getArgOperand(1); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Info.ptrVal = CI.getArgOperand(0); // Source pointer + Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue(); + Info.flags |= MachineMemOperand::MOLoad; + if (IsVolatile) + Info.flags |= MachineMemOperand::MOVolatile; + Infos.push_back(Info); + + // Entry 1: Store to LDS. + // Same offset from the instruction, but an additional per-lane offset is + // added. Represent that using a wider memory type. 
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), + Width * 8 * Subtarget->getWavefrontSize()); + Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer + Info.flags &= ~MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOStore; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_bvh_stack_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: @@ -1671,7 +1765,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = Align(4); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_s_prefetch_data: case Intrinsic::amdgcn_flat_prefetch: @@ -1680,10 +1775,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); Info.flags |= MachineMemOperand::MOLoad; - return true; + Infos.push_back(Info); + return; } default: - return false; + return; } } @@ -1709,7 +1805,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Type *&AccessTy) const { Value *Ptr = nullptr; switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_cluster_load_b128: case Intrinsic::amdgcn_cluster_load_b64: case Intrinsic::amdgcn_cluster_load_b32: @@ -1729,16 +1824,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: - case Intrinsic::amdgcn_flat_load_monitor_b128: - case Intrinsic::amdgcn_flat_load_monitor_b32: - case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: - case Intrinsic::amdgcn_global_load_monitor_b128: - case 
Intrinsic::amdgcn_global_load_monitor_b32: - case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: @@ -1750,7 +1838,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: + case Intrinsic::amdgcn_load_async_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: case Intrinsic::amdgcn_global_load_async_to_lds_b8: case Intrinsic::amdgcn_global_load_async_to_lds_b32: case Intrinsic::amdgcn_global_load_async_to_lds_b64: @@ -1917,7 +2007,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } if (AS == AMDGPUAS::PRIVATE_ADDRESS) - return Subtarget->enableFlatScratch() + return Subtarget->hasFlatScratchEnabled() ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS) : isLegalMUBUFAddressingMode(AM); @@ -1980,7 +2070,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( Align RequiredAlignment( PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment. - if (Subtarget->hasLDSMisalignedBug() && Size > 32 && + if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 && Alignment < RequiredAlignment) return false; @@ -2229,7 +2319,8 @@ bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const { // TODO: This should be more aggressive, particular for 16-bit element // vectors. However there are some mixed improvements and regressions. EVT EltTy = VT.getVectorElementType(); - return EltTy.getSizeInBits() % 32 == 0; + unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 
16 : 32; + return EltTy.getSizeInBits() % MinAlign == 0; } bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { @@ -2251,6 +2342,14 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { return TargetLowering::isTypeDesirableForOp(Op, VT); } +MachinePointerInfo +SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const { + // This isn't really a constant pool but close enough. + MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool()); + PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS; + return PtrInfo; +} + SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, @@ -2313,9 +2412,16 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); } - if (MemVT.isFloatingPoint()) - Val = getFPExtOrFPRound(DAG, Val, SL, VT); - else if (Signed) + if (MemVT.isFloatingPoint()) { + if (VT.isFloatingPoint()) { + Val = getFPExtOrFPRound(DAG, Val, SL, VT); + } else { + assert(!MemVT.isVector()); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); + SDValue Cast = DAG.getBitcast(IntVT, Val); + Val = DAG.getAnyExtOrTrunc(Cast, SL, VT); + } + } else if (Signed) Val = DAG.getSExtOrTrunc(Val, SL, VT); else Val = DAG.getZExtOrTrunc(Val, SL, VT); @@ -2327,7 +2433,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, uint64_t Offset, Align Alignment, bool Signed, const ISD::InputArg *Arg) const { - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + + MachinePointerInfo PtrInfo = + getKernargSegmentPtrInfo(DAG.getMachineFunction()); // Try to avoid using an extload by loading earlier than the argument address, // and extracting the relevant bits. 
The load should hopefully be merged with @@ -2342,7 +2450,8 @@ SDValue SITargetLowering::lowerKernargMemParameter( // TODO: If we passed in the base kernel offset we could have a better // alignment than 4, but we don't really need it. SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); - SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), + SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, + PtrInfo.getWithOffset(AlignDownOffset), Align(4), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -2357,9 +2466,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); - SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + SDValue Load = DAG.getLoad( + MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment, + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); return DAG.getMergeValues({Val, Load.getValue(1)}, SL); @@ -3023,7 +3132,7 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, CallingConv::ID CallConv, bool IsShader) const { bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs(); - if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { + if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) { // Note: user SGPRs are handled by the front-end for graphics shaders // Pad up the used user SGPRs with dead inputs. 
@@ -3092,7 +3201,7 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); } - assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader || + assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader || Info.getNumPreloadedSGPRs() >= 16); } @@ -3120,7 +3229,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // the scratch registers to pass in. bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); - if (!ST.enableFlatScratch()) { + if (!ST.hasFlatScratchEnabled()) { if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user @@ -3263,7 +3372,7 @@ SDValue SITargetLowering::LowerFormalArguments( !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); (void)UserSGPRInfo; - if (!Subtarget->enableFlatScratch()) + if (!Subtarget->hasFlatScratchEnabled()) assert(!UserSGPRInfo.hasFlatScratchInit()); if ((CallConv != CallingConv::AMDGPU_CS && CallConv != CallingConv::AMDGPU_Gfx && @@ -3334,7 +3443,7 @@ SDValue SITargetLowering::LowerFormalArguments( allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); // FIXME: Sink this into allocateSpecialInputSGPRs - if (!Subtarget->enableFlatScratch()) + if (!Subtarget->hasFlatScratchEnabled()) CCInfo.AllocateReg(Info->getScratchRSrcReg()); allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); @@ -3559,11 +3668,17 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); - // DAG.getPass() returns nullptr when using new pass manager. - // TODO: Use DAG.getMFAM() to access analysis result. 
if (DAG.getPass()) { - auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>(); + ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo()); + } else if (auto *MFAM = DAG.getMFAM()) { + Module &M = *MF.getFunction().getParent(); + auto *ArgUsageInfo = + MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF) + .getCachedResult<AMDGPUArgumentUsageAnalysis>(M); + if (ArgUsageInfo) + ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo()); } unsigned StackArgSize = CCInfo.getStackSize(); @@ -3778,12 +3893,19 @@ void SITargetLowering::passSpecialInputs( const AMDGPUFunctionArgInfo *CalleeArgInfo = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { - // DAG.getPass() returns nullptr when using new pass manager. - // TODO: Use DAG.getMFAM() to access analysis result. 
if (DAG.getPass()) { auto &ArgUsageInfo = - DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>(); + CalleeArgInfo = + &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc); + } else if (auto *MFAM = DAG.getMFAM()) { + Module &M = *DAG.getMachineFunction().getFunction().getParent(); + auto *ArgUsageInfo = + MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>( + DAG.getMachineFunction()) + .getCachedResult<AMDGPUArgumentUsageAnalysis>(M); + if (ArgUsageInfo) + CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc); } } @@ -4049,7 +4171,7 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!CI->isTailCall()) return false; - const Function *ParentFn = CI->getParent()->getParent(); + const Function *ParentFn = CI->getFunction(); if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) return false; return true; @@ -4233,7 +4355,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); if (!IsSibCall || IsChainCallConv) { - if (!Subtarget->enableFlatScratch()) { + if (!Subtarget->hasFlatScratchEnabled()) { SmallVector<SDValue, 4> CopyFromChains; // In the HSA case, this should be an identity copy. @@ -5058,7 +5180,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, // Compare the just read M0 value to all possible Idx values. BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) .addReg(CurrentIdxReg) - .addReg(Idx.getReg(), 0, Idx.getSubReg()); + .addReg(Idx.getReg(), {}, Idx.getSubReg()); // Update EXEC, save the original EXEC value to VCC. 
BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec) @@ -5259,7 +5381,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, setM0ToIndexFromSGPR(TII, MRI, MI, Offset); BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, 0, SubReg) + .addReg(SrcReg, {}, SubReg) .addReg(SrcReg, RegState::Implicit); } @@ -5293,7 +5415,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, .addImm(SubReg); } else { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, 0, SubReg) + .addReg(SrcReg, {}, SubReg) .addReg(SrcReg, RegState::Implicit); } @@ -5466,6 +5588,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { return std::numeric_limits<uint32_t>::min(); case AMDGPU::S_MAX_I32: return std::numeric_limits<int32_t>::min(); + case AMDGPU::V_ADD_F32_e64: // -0.0 + return 0x80000000; + case AMDGPU::V_SUB_F32_e64: // +0.0 + return 0x0; case AMDGPU::S_ADD_I32: case AMDGPU::S_SUB_I32: case AMDGPU::S_OR_B32: @@ -5473,6 +5599,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { return std::numeric_limits<uint32_t>::min(); case AMDGPU::S_AND_B32: return std::numeric_limits<uint32_t>::max(); + case AMDGPU::V_MIN_F32_e64: + case AMDGPU::V_MAX_F32_e64: + return 0x7fc00000; // qNAN default: llvm_unreachable( "Unexpected opcode in getIdentityValueFor32BitWaveReduction"); @@ -5489,6 +5618,11 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) { return std::numeric_limits<uint64_t>::min(); case AMDGPU::V_CMP_GT_I64_e64: // max.i64 return std::numeric_limits<int64_t>::min(); + case AMDGPU::V_MIN_F64_e64: + case AMDGPU::V_MAX_F64_e64: + case AMDGPU::V_MIN_NUM_F64_e64: + case AMDGPU::V_MAX_NUM_F64_e64: + return 0x7FF8000000000000; // qNAN case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: case AMDGPU::S_OR_B64: @@ -5496,6 +5630,9 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) { return 
std::numeric_limits<uint64_t>::min(); case AMDGPU::S_AND_B64: return std::numeric_limits<uint64_t>::max(); + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: + return 0x8000000000000000; // -0.0 default: llvm_unreachable( "Unexpected opcode in getIdentityValueFor64BitWaveReduction"); @@ -5507,7 +5644,17 @@ static bool is32bitWaveReduceOperation(unsigned Opc) { Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 || Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 || - Opc == AMDGPU::S_XOR_B32; + Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 || + Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 || + Opc == AMDGPU::V_SUB_F32_e64; +} + +static bool isFloatingPointWaveReduceOperation(unsigned Opc) { + return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 || + Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 || + Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 || + Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 || + Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64; } static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, @@ -5528,8 +5675,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, switch (Opc) { case AMDGPU::S_MIN_U32: case AMDGPU::S_MIN_I32: + case AMDGPU::V_MIN_F32_e64: case AMDGPU::S_MAX_U32: case AMDGPU::S_MAX_I32: + case AMDGPU::V_MAX_F32_e64: case AMDGPU::S_AND_B32: case AMDGPU::S_OR_B32: { // Idempotent operations. @@ -5541,6 +5690,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, case AMDGPU::V_CMP_LT_I64_e64: // min case AMDGPU::V_CMP_GT_U64_e64: // umax case AMDGPU::V_CMP_GT_I64_e64: // max + case AMDGPU::V_MIN_F64_e64: + case AMDGPU::V_MIN_NUM_F64_e64: + case AMDGPU::V_MAX_F64_e64: + case AMDGPU::V_MAX_NUM_F64_e64: case AMDGPU::S_AND_B64: case AMDGPU::S_OR_B64: { // Idempotent operations. 
@@ -5552,8 +5705,12 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, case AMDGPU::S_XOR_B64: case AMDGPU::S_ADD_I32: case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U64_PSEUDO: { + case AMDGPU::S_SUB_U64_PSEUDO: + case AMDGPU::V_SUB_F32_e64: { const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); @@ -5708,6 +5865,72 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addImm(AMDGPU::sub1); break; } + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: + case AMDGPU::V_SUB_F32_e64: { + bool is32BitOpc = is32bitWaveReduceOperation(Opc); + const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0); + Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC); + Register DstVreg = MRI.createVirtualRegister(VregRC); + // Get number of active lanes as a float val. + BuildMI(BB, MI, DL, + TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64 + : AMDGPU::V_CVT_F64_I32_e64), + ActiveLanesVreg) + .addReg(NewAccumulator->getOperand(0).getReg()) + .addImm(0) // clamp + .addImm(0); // output-modifier + + // Take negation of input for SUB reduction + unsigned srcMod = + (Opc == AMDGPU::V_SUB_F32_e64 || + MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) + ? SISrcMods::NEG + : SISrcMods::NONE; + unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64 + : ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::V_MUL_F64_pseudo_e64 + : AMDGPU::V_MUL_F64_e64; + auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc), + DstVreg) + .addImm(srcMod) // src0 modifier + .addReg(SrcReg) + .addImm(SISrcMods::NONE) // src1 modifier + .addReg(ActiveLanesVreg) + .addImm(SISrcMods::NONE) // clamp + .addImm(SISrcMods::NONE); // output-mod + if (is32BitOpc) { + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } else { + Register LaneValueLoReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValueHiReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + const TargetRegisterClass *VregSubRC = + TRI->getSubRegisterClass(VregRC, AMDGPU::sub0); + MachineOperand Op1L = + TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0), + VregRC, AMDGPU::sub0, VregSubRC); + MachineOperand Op1H = + TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0), + VregRC, AMDGPU::sub1, VregSubRC); + // lane value input should be in an sgpr + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + LaneValueLoReg) + .add(Op1L); + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + LaneValueHiReg) + .add(Op1H); + NewAccumulator = + BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg) + .addReg(LaneValueLoReg) + .addImm(AMDGPU::sub0) + .addReg(LaneValueHiReg) + .addImm(AMDGPU::sub1); + } + } } RetBB = &BB; } @@ -5725,6 +5948,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, MachineBasicBlock::iterator I = BB.end(); Register SrcReg = MI.getOperand(1).getReg(); bool is32BitOpc = is32bitWaveReduceOperation(Opc); + bool isFPOp = isFloatingPointWaveReduceOperation(Opc); // Create Control flow for loop // Split MI's Machine Basic block into For loop @@ -5753,7 +5977,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg) .addImm(IdentityValue); } else { - uint64_t IdentityValue = 
getIdentityValueFor64BitWaveReduction(Opc); + uint64_t IdentityValue = + MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64 + ? 0x0 // +0.0 for double sub reduction + : getIdentityValueFor64BitWaveReduction(Opc); BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg) .addImm(IdentityValue); } @@ -5784,9 +6011,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, LaneValueReg) .addReg(SrcReg) .addReg(FF1Reg); - NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) - .addReg(Accumulator->getOperand(0).getReg()) - .addReg(LaneValueReg); + if (isFPOp) { + Register LaneValVreg = + MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); + Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); + // Get the Lane Value in VGPR to avoid the Constant Bus Restriction + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), + LaneValVreg) + .addReg(LaneValueReg); + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) + .addImm(0) // src0 modifier + .addReg(Accumulator->getOperand(0).getReg()) + .addImm(0) // src1 modifier + .addReg(LaneValVreg) + .addImm(0) // clamp + .addImm(0); // omod + NewAccumulator = BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } else { + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValueReg); + } } else { Register LaneValueLoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -5832,7 +6079,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass); Register ComparisonResultReg = MRI.createVirtualRegister(WaveMaskRegClass); - const TargetRegisterClass *VregClass = TRI->getVGPR64Class(); + int SrcIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src); + const TargetRegisterClass *VregClass = + TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), 
SrcIdx)); const TargetRegisterClass *VSubRegClass = TRI->getSubRegisterClass(VregClass, AMDGPU::sub0); Register AccumulatorVReg = MRI.createVirtualRegister(VregClass); @@ -5863,6 +6113,60 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addReg(Accumulator->getOperand(0).getReg()); break; } + case AMDGPU::V_MIN_F64_e64: + case AMDGPU::V_MIN_NUM_F64_e64: + case AMDGPU::V_MAX_F64_e64: + case AMDGPU::V_MAX_NUM_F64_e64: + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: { + int SrcIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src); + const TargetRegisterClass *VregRC = + TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx)); + const TargetRegisterClass *VregSubRC = + TRI->getSubRegisterClass(VregRC, AMDGPU::sub0); + Register AccumulatorVReg = MRI.createVirtualRegister(VregRC); + Register DstVreg = MRI.createVirtualRegister(VregRC); + Register LaneValLo = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValHi = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg) + .addReg(Accumulator->getOperand(0).getReg()); + unsigned Modifier = + MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64 + ? 
SISrcMods::NEG + : SISrcMods::NONE; + auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) + .addImm(Modifier) // src0 modifiers + .addReg(LaneValue->getOperand(0).getReg()) + .addImm(SISrcMods::NONE) // src1 modifiers + .addReg(AccumulatorVReg) + .addImm(SISrcMods::NONE) // clamp + .addImm(SISrcMods::NONE); // omod + auto ReadLaneLo = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + LaneValLo); + auto ReadLaneHi = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + LaneValHi); + MachineBasicBlock::iterator Iters = *ReadLaneLo; + MachineOperand Op1L = + TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0), + VregRC, AMDGPU::sub0, VregSubRC); + MachineOperand Op1H = + TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0), + VregRC, AMDGPU::sub1, VregSubRC); + ReadLaneLo.add(Op1L); + ReadLaneHi.add(Op1H); + NewAccumulator = BuildMI(*ComputeLoop, I, DL, + TII->get(TargetOpcode::REG_SEQUENCE), DstReg) + .addReg(LaneValLo) + .addImm(AMDGPU::sub0) + .addReg(LaneValHi) + .addImm(AMDGPU::sub1); + break; + } case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: { NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) @@ -5918,6 +6222,13 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32); case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64); + case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64); + case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::V_MIN_NUM_F64_e64 + : AMDGPU::V_MIN_F64_e64); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64: @@ -5926,14 +6237,37 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32); case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64); + case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64); + case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? AMDGPU::V_MAX_NUM_F64_e64 + : AMDGPU::V_MAX_F64_e64); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64); + case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? AMDGPU::V_ADD_F64_pseudo_e64 + : AMDGPU::V_ADD_F64_e64); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64); + case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64: + // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as + // fadd + neg, by setting the NEG bit in the instruction. + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::V_ADD_F64_pseudo_e64 + : AMDGPU::V_ADD_F64_e64); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64: @@ -6203,7 +6537,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case AMDGPU::SI_INDIRECT_SRC_V1: case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V3: case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V5: + case AMDGPU::SI_INDIRECT_SRC_V6: + case AMDGPU::SI_INDIRECT_SRC_V7: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V9: case AMDGPU::SI_INDIRECT_SRC_V10: @@ -6214,7 +6552,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitIndirectSrc(MI, *BB, *getSubtarget()); case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V3: case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V5: + case AMDGPU::SI_INDIRECT_DST_V6: + case AMDGPU::SI_INDIRECT_DST_V7: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V9: case AMDGPU::SI_INDIRECT_DST_V10: @@ -6344,8 +6686,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::DS_GWS_INIT: case AMDGPU::DS_GWS_SEMA_BR: case AMDGPU::DS_GWS_BARRIER: - TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); - [[fallthrough]]; case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: @@ -6711,6 +7051,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerBRCOND(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::SPONENTRY: + return LowerSPONENTRY(Op, DAG); case ISD::LOAD: { SDValue Result = LowerLOAD(Op, DAG); assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) && @@ -6743,6 +7085,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); return 
LowerGlobalAddress(MFI, Op, DAG); } + case ISD::ExternalSymbol: + return LowerExternalSymbol(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: @@ -6792,6 +7136,12 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return splitTernaryVectorOp(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 && + Op.getValueType() == MVT::i16 && + Op.getOperand(0).getValueType() == MVT::f32) { + // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32. + return Op; + } return LowerFP_TO_INT(Op, DAG); case ISD::SHL: case ISD::SRA: @@ -7032,9 +7382,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SDLoc SL(N); if (Src.getOpcode() == ISD::SETCC) { + SDValue Op0 = Src.getOperand(0); + SDValue Op1 = Src.getOperand(1); + // Need to expand bfloat to float for comparison (setcc). + if (Op0.getValueType() == MVT::bf16) { + Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0); + Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1); + } // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), - Src.getOperand(1), Src.getOperand(2)); + return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2)); } if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { // (ballot 0) -> 0 @@ -7260,6 +7616,84 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, return DAG.getBitcast(VT, UnrolledLaneOp); } +static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + if (VT.getSizeInBits() != 32) + return SDValue(); + + SDLoc SL(N); + + SDValue Value = N->getOperand(1); + SDValue Index = N->getOperand(2); + + // ds_bpermute requires index to be multiplied by 4 + SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL); + SDValue ShiftedIndex = + DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount); + + // Intrinsics will require i32 to operate on + SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value); + + auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT, + SmallVector<SDValue> IntrinArgs) -> SDValue { + SmallVector<SDValue> Operands(1); + Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32); + Operands.append(IntrinArgs); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands); + }; + + // If we can bpermute across the whole wave, then just do that + if (TLI.getSubtarget()->supportsWaveWideBPermute()) { + SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32, + {ShiftedIndex, ValueI32}); + return DAG.getBitcast(VT, BPermute); + } + + assert(TLI.getSubtarget()->isWave64()); + + // Otherwise, we need to make use of whole wave mode + SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0)); + + // Set inactive lanes to poison + SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32, + {ValueI32, PoisonVal}); + SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32, + {ShiftedIndex, PoisonVal}); + + 
SDValue Swapped = + MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue}); + + // Get permutation of each half, then we'll select which one to use + SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32, + {WWMIndex, WWMValue}); + SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, + MVT::i32, {WWMIndex, Swapped}); + SDValue BPermOtherHalfWWM = + MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf}); + + // Select which side to take the permute from + SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32); + // We can get away with only using mbcnt_lo here since we're only + // trying to detect which side of 32 each lane is on, and mbcnt_lo + // returns 32 for lanes 32-63. + SDValue ThreadID = + MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32, + {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)}); + + SDValue SameOrOtherHalf = + DAG.getNode(ISD::AND, SL, MVT::i32, + DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index), + DAG.getTargetConstant(32, SL, MVT::i32)); + SDValue UseSameHalf = + DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf, + DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ); + SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf, + BPermOtherHalfWWM); + return DAG.getBitcast(VT, Result); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -7632,6 +8066,20 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } +SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // For functions that set up their own stack, select the GET_STACK_BASE + // pseudo. + if (MFI->isBottomOfStack()) + return Op; + + // For everything else, create a dummy stack object. 
+ int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false); + return DAG.getFrameIndex(FI, Op.getValueType()); +} + SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, EVT VT) const { return Op.getValueType().bitsLE(VT) @@ -7701,8 +8149,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { // Round-inexact-to-odd f64 to f32, then do the final rounding using the // hardware f32 -> bf16 instruction. - EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) : - MVT::f32; + EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32); SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG); return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod, DAG.getTargetConstant(0, DL, MVT::i32)); @@ -7849,14 +8296,13 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType() : Op->getOperand(0).getValueType(); - auto ExtTy = OpTy.changeElementType(MVT::i32); + auto &DAG = DCI.DAG; + auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32); if (DCI.isBeforeLegalizeOps() || isNarrowingProfitable(Op.getNode(), ExtTy, OpTy)) return SDValue(); - auto &DAG = DCI.DAG; - SDLoc DL(Op); SDValue LHS; SDValue RHS; @@ -8033,7 +8479,7 @@ SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { - if (!Subtarget->isTrapHandlerEnabled() || + if (!Subtarget->hasTrapHandler() || Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return lowerTrapEndpgm(Op, DAG); @@ -8054,10 +8500,11 @@ SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, MachineFunction &MF = DAG.getMachineFunction(); uint64_t Offset = getImplicitParameterOffset(MF, Param); SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); - return DAG.getLoad(VT, 
DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment, - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachinePointerInfo PtrInfo = + getKernargSegmentPtrInfo(DAG.getMachineFunction()); + return DAG.getLoad( + VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment, + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op, @@ -8115,7 +8562,7 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); - if (!Subtarget->isTrapHandlerEnabled() || + if (!Subtarget->hasTrapHandler() || Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { LLVMContext &Ctx = MF.getFunction().getContext(); Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(), @@ -8319,6 +8766,9 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, Op.getValueType() == MVT::i64) { const SIMachineFunctionInfo *Info = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); + if (Info->get32BitAddressHighBits() == 0) + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src); + SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi); return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); @@ -8847,17 +9297,17 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { - Type *Ty = GV->getValueType(); + const GlobalVariable &GVar = *cast<GlobalVariable>(GV); // HIP uses an unsized array `extern __shared__ T s[]` or similar // zero-sized type in other languages to declare the dynamic shared // memory which size is not known at the compile time. 
They will be // allocated by the runtime and placed directly after the static // allocated ones. They all share the same offset. - if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { + if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) { assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); // Adjust alignment for that dynamic shared memory array. Function &F = DAG.getMachineFunction().getFunction(); - MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); + MFI->setDynLDSAlign(F, GVar); MFI->setUsesDynamicLDS(true); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); @@ -8912,6 +9362,15 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, MachineMemOperand::MOInvariant); } +SDValue SITargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Handle this. It should be mostly the same as LowerGlobalAddress. + const Function &Fn = DAG.getMachineFunction().getFunction(); + DAG.getContext()->diagnose(DiagnosticInfoUnsupported( + Fn, "unsupported external symbol", Op.getDebugLoc())); + return DAG.getPOISON(Op.getValueType()); +} + SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const { // We can't use S_MOV_B32 directly, because there is no way to specify m0 as @@ -9131,16 +9590,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); + unsigned IntrOpcode = Intr->BaseOpcode; + // For image atomic: use no-return opcode if result is unused. 
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode && + !Op.getNode()->hasAnyUseOfValue(0)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); + if (BaseOpcode->NoReturn && BaseOpcode->Atomic) + ResultTypes.erase(&ResultTypes[0]); + bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -9159,8 +9625,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VData = Op.getOperand(2); IsAtomicPacked16Bit = - (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || - Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN); bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { @@ -9170,7 +9638,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Is64Bit) VData = DAG.getBitcast(MVT::v4i32, VData); - ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + if (!BaseOpcode->NoReturn) + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; } else { @@ -9396,8 +9866,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. 
+ if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return Op; @@ -9509,13 +9980,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); } + if (BaseOpcode->NoReturn) { + if (BaseOpcode->Atomic) + return DAG.getMergeValues( + {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL); + + return SDValue(NewNode, 0); + } + if (BaseOpcode->AtomicX2) { SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); } - if (BaseOpcode->NoReturn) - return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, NumVDataDwords, IsAtomicPacked16Bit, DL); @@ -9709,7 +10187,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } case Intrinsic::amdgcn_kernarg_segment_ptr: { - if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { + if (!AMDGPU::isKernel(MF.getFunction())) { // This only makes sense to call in a kernel, so just lower to null. 
return DAG.getConstant(0, DL, VT); } @@ -10110,11 +10588,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc SL(Op); auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), - {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), - Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), - IndexKey, Op.getOperand(7), - Op.getOperand(8)}); // No clamp operand + SmallVector<SDValue> Args{ + Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKey, Op.getOperand(7), Op.getOperand(8)}; + if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8) + Args.push_back(Op.getOperand(9)); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args); } case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: @@ -10148,6 +10628,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Poisons.push_back(DAG.getPOISON(ValTy)); return DAG.getMergeValues(Poisons, SDLoc(Op)); } + case Intrinsic::amdgcn_wave_shuffle: + return lowerWaveShuffle(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -10455,9 +10937,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); - case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: - return lowerRawBufferAtomicIntrin(Op, DAG, - AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_swap: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: return lowerStructBufferAtomicIntrin(Op, DAG, @@ -10499,10 +10978,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_dec: case 
Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB); + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_CSUB); + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); - case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); @@ -10752,6 +11242,19 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), Ops, M->getMemOperand()); } + case Intrinsic::amdgcn_s_alloc_vgpr: { + SDValue NumVGPRs = Op.getOperand(2); + if (!NumVGPRs->isDivergent()) + return Op; + + SDValue ReadFirstLaneID = + DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32); + NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + ReadFirstLaneID, NumVGPRs); + + return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(), + Op.getOperand(0), Op.getOperand(1), NumVGPRs); + } case Intrinsic::amdgcn_s_get_barrier_state: case Intrinsic::amdgcn_s_get_named_barrier_state: { SDValue Chain = Op->getOperand(0); @@ -10794,6 +11297,26 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return 
DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT, Chain, Ptr, MII->getMemOperand()); } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: { + MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op); + SDValue Chain = Op->getOperand(0); + SDValue Ptr = Op->getOperand(2); + return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL, + Op->getVTList(), {Chain, Ptr}, + MII->getMemoryVT(), MII->getMemOperand()); + } + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: { + MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op); + SDValue Chain = Op->getOperand(0); + SDValue Ptr = Op->getOperand(2); + return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL, + Op->getVTList(), {Chain, Ptr}, + MII->getMemoryVT(), MII->getMemOperand()); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = @@ -10932,12 +11455,24 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG, return VData; } +static bool isAsyncLDSDMA(Intrinsic::ID Intr) { + switch (Intr) { + case Intrinsic::amdgcn_raw_buffer_load_async_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_async_lds: + return true; + } + return false; +} + SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = Op.getConstantOperandVal(1); - MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { case Intrinsic::amdgcn_exp_compr: { @@ -11128,15 +11663,21 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } case 
Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_buffer_load_async_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: - case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: { if (!Subtarget->hasVMemToLDSLoad()) return SDValue(); unsigned Opc; bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || - IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; + IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds || + IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds || + IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds; unsigned OpOffset = HasVIndex ? 1 : 0; SDValue VOffset = Op.getOperand(5 + OpOffset); bool HasVOffset = !isNullConstant(VOffset); @@ -11208,33 +11749,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, ? 1 : 0, DL, MVT::i8)); // swz + Ops.push_back( + DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8)); Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue auto *M = cast<MemSDNode>(Op); - MachineMemOperand *LoadMMO = M->getMemOperand(); - // Don't set the offset value here because the pointer points to the base of - // the buffer. 
- MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); - - MachinePointerInfo StorePtrI = LoadPtrI; - LoadPtrI.V = PoisonValue::get( - PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); - LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; - StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; - - auto F = LoadMMO->getFlags() & - ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); - LoadMMO = - MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, - LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); - - MachineMemOperand *StoreMMO = MF.getMachineMemOperand( - StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), - LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); - auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); - DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); + DAG.setNodeMemRefs(Load, M->memoperands()); return SDValue(Load, 0); } @@ -11242,7 +11764,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, // for "trust me" that the remaining cases are global pointers until // such time as we can put two mem operands on an intrinsic. 
case Intrinsic::amdgcn_load_to_lds: - case Intrinsic::amdgcn_global_load_lds: { + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: { if (!Subtarget->hasVMemToLDSLoad()) return SDValue(); @@ -11307,30 +11831,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } Ops.push_back(Op.getOperand(5)); // Offset - Ops.push_back(Op.getOperand(6)); // CPol + + unsigned Aux = Op.getConstantOperandVal(6); + Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL, + MVT::i32)); // CPol + Ops.push_back( + DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8)); + Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue auto *M = cast<MemSDNode>(Op); - MachineMemOperand *LoadMMO = M->getMemOperand(); - MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); - LoadPtrI.Offset = Op->getConstantOperandVal(5); - MachinePointerInfo StorePtrI = LoadPtrI; - LoadPtrI.V = PoisonValue::get( - PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); - LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; - StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; - auto F = LoadMMO->getFlags() & - ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); - LoadMMO = - MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, - LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); - MachineMemOperand *StoreMMO = MF.getMachineMemOperand( - StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), - LoadMMO->getAAInfo()); - auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); - DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); + DAG.setNodeMemRefs(Load, M->memoperands()); return SDValue(Load, 0); } @@ -11375,6 +11888,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } + case Intrinsic::amdgcn_s_wakeup_barrier: { + if 
(!Subtarget->hasSWakeupBarrier()) + return SDValue(); + [[fallthrough]]; + } case Intrinsic::amdgcn_s_barrier_join: { // these three intrinsics have one operand: barrier pointer SDValue Chain = Op->getOperand(0); @@ -11384,16 +11902,32 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, if (isa<ConstantSDNode>(BarOp)) { uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue(); - Opc = AMDGPU::S_BARRIER_JOIN_IMM; - + switch (IntrinsicID) { + default: + return SDValue(); + case Intrinsic::amdgcn_s_barrier_join: + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + break; + case Intrinsic::amdgcn_s_wakeup_barrier: + Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; + break; + } // extract the BarrierID from bits 4-9 of the immediate unsigned BarID = (BarVal >> 4) & 0x3F; SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); Ops.push_back(K); Ops.push_back(Chain); } else { - Opc = AMDGPU::S_BARRIER_JOIN_M0; - + switch (IntrinsicID) { + default: + return SDValue(); + case Intrinsic::amdgcn_s_barrier_join: + Opc = AMDGPU::S_BARRIER_JOIN_M0; + break; + case Intrinsic::amdgcn_s_wakeup_barrier: + Opc = AMDGPU::S_WAKEUP_BARRIER_M0; + break; + } // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0] SDValue M0Val; M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, @@ -11482,7 +12016,7 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const { // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before // being added, so we can only safely match a 32-bit addition with no // unsigned overflow. 
- bool CheckNUW = AMDGPU::isGFX1250(*Subtarget); + bool CheckNUW = Subtarget->hasGFX1250Insts(); if (!CheckNUW || isNoUnsignedWrap(N0)) { C1 = cast<ConstantSDNode>(N0.getOperand(1)); N0 = N0.getOperand(0); @@ -11542,11 +12076,15 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, } } if (DAG.isBaseWithConstantOffset(CombinedOffset)) { + // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before + // being added, so we can only safely match a 32-bit addition with no + // unsigned overflow. + bool CheckNUW = Subtarget->hasGFX1250Insts(); SDValue N0 = CombinedOffset.getOperand(0); SDValue N1 = CombinedOffset.getOperand(1); uint32_t SOffset, ImmOffset; int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); - if (Offset >= 0 && + if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) && TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); @@ -11845,7 +12383,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Align Alignment = Load->getAlign(); unsigned AS = Load->getAddressSpace(); - if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && + if (Subtarget->hasLDSMisalignedBugInWGPMode() && + AS == AMDGPUAS::FLAT_ADDRESS && Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { return SplitVectorLoad(Op, DAG); } @@ -11866,7 +12405,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() && - isMemOpHasNoClobberedMemOperand(Load))) { + (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) { if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) && Alignment >= Align(4) && NumElements < 32) { if (MemVT.isPow2VectorType() || @@ -12161,7 +12700,10 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) 
const { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); + // TODO: The combiner should probably handle elimination of redundant fabs. + SDValue r1 = DAG.SignBitIsZeroFP(RHS) + ? RHS + : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); const APFloat K0Val(0x1p+96f); const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); @@ -12466,7 +13008,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { Store->getValue().getValueType().getScalarType() == MVT::i32); unsigned AS = Store->getAddressSpace(); - if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && + if (Subtarget->hasLDSMisalignedBugInWGPMode() && + AS == AMDGPUAS::FLAT_ADDRESS && Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) { return SplitVectorStore(Op, DAG); @@ -12506,7 +13049,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); case 16: if (NumElements > 4 || - (NumElements == 3 && !Subtarget->enableFlatScratch())) + (NumElements == 3 && !Subtarget->hasFlatScratchEnabled())) return SplitVectorStore(Op, DAG); return SDValue(); default: @@ -12728,23 +13271,36 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // if Arg is already the result of a multiply by constant. auto Flags = Op->getFlags(); + // AMDGPUISD nodes of vector type must be unrolled here since + // they will not be expanded elsewhere. 
+ auto UnrollIfVec = [&DAG](SDValue V) -> SDValue { + if (!V.getValueType().isVector()) + return V; + + return DAG.UnrollVectorOp(cast<SDNode>(V)); + }; + SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT); if (Subtarget->hasTrigReducedRange()) { SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); - TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags); + TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags)); } else { TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); } switch (Op.getOpcode()) { case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); + TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); + break; case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); + TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); + break; default: llvm_unreachable("Wrong trig opcode"); } + + return UnrollIfVec(TrigVal); } SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, @@ -13398,6 +13954,7 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); } + case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND_INREG: { @@ -13904,6 +14461,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { assert(OtherOp.getValueSizeInBits() == 32); } + // Check that we haven't just recreated the same FSHR node. 
+ if (N->getOpcode() == ISD::FSHR && + (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) && + (N->getOperand(1) == Op || N->getOperand(1) == OtherOp)) + return SDValue(); + if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { assert(Op.getValueType().isByteSized() && @@ -14181,10 +14744,11 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, return SDValue(); } -SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, - DAGCombinerInfo &DCI) const { +SDValue +SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N, + DAGCombinerInfo &DCI) const { if (!Subtarget->has16BitInsts() || - DCI.getDAGCombineLevel() < AfterLegalizeDAG) + DCI.getDAGCombineLevel() < AfterLegalizeTypes) return SDValue(); EVT VT = N->getValueType(0); @@ -14195,7 +14759,44 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, if (Src.getValueType() != MVT::i16) return SDValue(); - return SDValue(); + if (!Src->hasOneUse()) + return SDValue(); + + // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's + // possible we're missing out on some combine opportunities, but we'd need to + // weigh the cost of extracting the byte from the upper dwords. 
+ + std::optional<ByteProvider<SDValue>> BP0 = + calculateByteProvider(SDValue(N, 0), 0, 0, 0); + if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src) + return SDValue(); + SDValue V0 = *BP0->Src; + + std::optional<ByteProvider<SDValue>> BP1 = + calculateByteProvider(SDValue(N, 0), 1, 0, 1); + if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src) + return SDValue(); + + SDValue V1 = *BP1->Src; + + if (V0 == V1) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + uint32_t PermMask = 0x0c0c0c0c; + if (V0) { + V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32); + PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4); + } + + if (V1) { + V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32); + PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8); + } + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1, + DAG.getConstant(PermMask, DL, MVT::i32)); } SDValue @@ -14299,6 +14900,7 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, } bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, + SDNodeFlags UserFlags, unsigned MaxDepth) const { unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::FCANONICALIZE) @@ -14498,7 +15100,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, // FIXME: denormalsEnabledForType is broken for dynamic return denormalsEnabledForType(DAG, Op.getValueType()) && - DAG.isKnownNeverSNaN(Op); + (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op)); } bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, @@ -14993,8 +15595,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // for some types, but at a higher cost since it's implemented with a 3 // operand form. 
const SDNodeFlags Flags = N->getFlags(); - if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && - !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) { + if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() && + !Subtarget->hasIEEEMinimumMaximumInsts() && + isOperationLegal(ISD::FMINNUM_IEEE, VT.getScalarType())) { unsigned NewOpc = Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags); @@ -16335,7 +16938,9 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); EVT VT = N->getValueType(0); - if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts()) + + // fsqrt legality correlates to rsq availability. + if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT)) return SDValue(); SDValue LHS = N->getOperand(0); @@ -16370,7 +16975,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); - EVT IntVT = VT.changeElementType(MVT::i32); + EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32); if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() && (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) { @@ -16548,7 +17153,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, LHS.getOpcode() == ISD::SELECT && isa<ConstantSDNode>(LHS.getOperand(1)) && isa<ConstantSDNode>(LHS.getOperand(2)) && - LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) && isBoolSGPR(LHS.getOperand(0))) { // Given CT != FT: // setcc (select cc, CT, CF), CF, eq => xor cc, -1 @@ -16558,13 +17162,34 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, const APInt &CT = LHS.getConstantOperandAPInt(1); const APInt &CF = LHS.getConstantOperandAPInt(2); - if ((CF == CRHSVal && CC == ISD::SETEQ) || - (CT == CRHSVal && CC == ISD::SETNE)) - return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - 
DAG.getAllOnesConstant(SL, MVT::i1)); - if ((CF == CRHSVal && CC == ISD::SETNE) || - (CT == CRHSVal && CC == ISD::SETEQ)) - return LHS.getOperand(0); + if (CT != CF) { + if ((CF == CRHSVal && CC == ISD::SETEQ) || + (CT == CRHSVal && CC == ISD::SETNE)) + return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1); + if ((CF == CRHSVal && CC == ISD::SETNE) || + (CT == CRHSVal && CC == ISD::SETEQ)) + return LHS.getOperand(0); + } + } + + // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge + // => setcc v.hi32, 0xXXXX'XXXX, lt/ge + // + // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt + // => setcc v.hi32, 0xXXXX'XXXX, le/gt + if (VT == MVT::i64) { + const uint64_t Mask32 = maskTrailingOnes<uint64_t>(32); + const uint64_t CRHSInt = CRHSVal.getZExtValue(); + + if ( // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge + ((CRHSInt & Mask32) == 0 && (CC == ISD::SETULT || CC == ISD::SETUGE || + CC == ISD::SETLT || CC == ISD::SETGE)) || + // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt + ((CRHSInt & Mask32) == Mask32 && + (CC == ISD::SETULE || CC == ISD::SETUGT || CC == ISD::SETLE || + CC == ISD::SETGT))) + return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG), + DAG.getConstant(CRHSInt >> 32, SL, MVT::i32), CC); } } @@ -16877,8 +17502,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } case ISD::XOR: return performXorCombine(N, DCI); + case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: - return performZeroExtendCombine(N, DCI); + return performZeroOrAnyExtendCombine(N, DCI); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI); case AMDGPUISD::FP_CLASS: @@ -17335,12 +17961,14 @@ void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { // Abandon attempt if the dst size isn't large enough // - this is in fact an error but this is picked up elsewhere and // reported correctly. 
- uint32_t DstSize = - TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + + uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32; if (DstSize < InitIdx) return; } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { - InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + InitIdx = TRI.getRegSizeInBits(*DstRC) / 32; } else { return; } @@ -17388,7 +18016,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); if (TII->isVOP3(MI.getOpcode())) { @@ -17524,6 +18152,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, break; case 'v': switch (BitWidth) { + case 1: + return std::pair(0U, nullptr); case 16: RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32_Lo256RegClass; @@ -17541,6 +18171,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, if (!Subtarget->hasMAIInsts()) break; switch (BitWidth) { + case 1: + return std::pair(0U, nullptr); case 16: RC = &AMDGPU::AGPR_32RegClass; break; @@ -18050,6 +18682,11 @@ void SITargetLowering::computeKnownBitsForTargetInstr( case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: Known.Zero.setHighBits(16); break; + case AMDGPU::G_AMDGPU_COPY_SCC_VCC: + // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32, + // producing exactly 0 or 1. 
+ Known.Zero.setHighBits(Known.getBitWidth() - 1); + break; case AMDGPU::G_AMDGPU_SMED3: case AMDGPU::G_AMDGPU_UMED3: { auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); @@ -18226,7 +18863,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case AMDGPUISD::BUFFER_ATOMIC_INC: case AMDGPUISD::BUFFER_ATOMIC_DEC: case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: - case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: @@ -18378,12 +19014,12 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local // allocations work. if (HasSystemScope) { - if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() && + if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() && RMW->hasMetadata("amdgpu.no.remote.memory")) return true; if (Subtarget.hasEmulatedSystemScopeAtomics()) return true; - } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics()) + } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics()) return true; return RMW->hasMetadata("amdgpu.no.fine.grained.memory"); @@ -18413,7 +19049,7 @@ getPrivateAtomicExpansionKind(const GCNSubtarget &STI) { } TargetLowering::AtomicExpansionKind -SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { +SITargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); if (AS == AMDGPUAS::PRIVATE_ADDRESS) return getPrivateAtomicExpansionKind(*getSubtarget()); @@ -18461,7 +19097,19 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: case AtomicRMWInst::UIncWrap: - case AtomicRMWInst::UDecWrap: { + case AtomicRMWInst::UDecWrap: + case AtomicRMWInst::USubCond: + case AtomicRMWInst::USubSat: { + if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts()) + 
return AtomicExpansionKind::CmpXChg; + if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts()) + return AtomicExpansionKind::CmpXChg; + if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) { + auto *IT = dyn_cast<IntegerType>(RMW->getType()); + if (!IT || IT->getBitWidth() != 32) + return AtomicExpansionKind::CmpXChg; + } + if (AMDGPU::isFlatGlobalAddrSpace(AS) || AS == AMDGPUAS::BUFFER_FAT_POINTER) { if (Subtarget->hasEmulatedSystemScopeAtomics()) @@ -18481,7 +19129,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // If fine-grained remote memory works at device scope, we don't need to // do anything. if (!HasSystemScope && - Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics()) + Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics()) return atomicSupportedIfLegalIntType(RMW); // If we are targeting a remote allocated address, it depends what kind of @@ -18500,7 +19148,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { Op == AtomicRMWInst::Xor) { // Atomic sub/or/xor do not work over PCI express, but atomic add // does. InstCombine transforms these with 0 to or, so undo that. 
- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); + if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); ConstVal && ConstVal->isNullValue()) return AtomicExpansionKind::CustomExpand; } @@ -18699,7 +19347,8 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { } TargetLowering::AtomicExpansionKind -SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { +SITargetLowering::shouldExpandAtomicCmpXchgInIR( + const AtomicCmpXchgInst *CmpX) const { unsigned AddrSpace = CmpX->getPointerAddressSpace(); if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) return getPrivateAtomicExpansionKind(*getSubtarget()); @@ -18726,8 +19375,11 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { : &AMDGPU::SReg_32RegClass; if (!TRI->isSGPRClass(RC) && !isDivergent) return TRI->getEquivalentSGPRClass(RC); - if (TRI->isSGPRClass(RC) && isDivergent) + if (TRI->isSGPRClass(RC) && isDivergent) { + if (Subtarget->hasGFX90AInsts()) + return TRI->getEquivalentAVClass(RC); return TRI->getEquivalentVGPRClass(RC); + } return RC; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 74e58f4..59b8f43 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -45,6 +45,8 @@ public: LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override; + MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const; + private: SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; @@ -77,6 +79,8 @@ private: SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, unsigned Offset) const; SDValue lowerImage(SDValue Op, 
const AMDGPU::ImageDimIntrinsicInfo *Intr, @@ -128,6 +132,7 @@ private: SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const; SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG, ArrayRef<SDValue> Ops, bool IsIntrinsic = false) const; @@ -205,7 +210,7 @@ private: SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performZeroOrAnyExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT, @@ -332,7 +337,7 @@ public: MVT getPointerTy(const DataLayout &DL, unsigned AS) const override; MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override; - bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, + void getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override; @@ -555,7 +560,7 @@ public: Register N1) const override; bool isCanonicalized(SelectionDAG &DAG, SDValue Op, - unsigned MaxDepth = 5) const; + SDNodeFlags UserFlags = {}, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, const MachineFunction &MF, unsigned MaxDepth = 5) const; bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const; @@ -564,11 +569,12 @@ public: bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN = false, unsigned Depth = 0) const override; - AtomicExpansionKind 
shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + AtomicExpansionKind + shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override; AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override; AtomicExpansionKind - shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override; void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const; void emitExpandAtomicRMW(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6dcbced..1118675 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -42,6 +42,7 @@ #include "llvm/TargetParser/TargetParser.h" using namespace llvm; +using namespace llvm::AMDGPU; #define DEBUG_TYPE "si-insert-waitcnts" @@ -63,58 +64,96 @@ static cl::opt<bool> ForceEmitZeroLoadFlag( cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden); +static cl::opt<bool> ExpertSchedulingModeFlag( + "amdgpu-expert-scheduling-mode", + cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), + cl::init(false), cl::Hidden); + namespace { -// Class of object that encapsulates latest instruction counter score -// associated with the operand. Used for determining whether -// s_waitcnt instruction needs to be emitted. - -enum InstCounterType { - LOAD_CNT = 0, // VMcnt prior to gfx12. - DS_CNT, // LKGMcnt prior to gfx12. - EXP_CNT, // - STORE_CNT, // VScnt in gfx10/gfx11. - NUM_NORMAL_INST_CNTS, - SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. - BVH_CNT, // gfx12+ only. - KM_CNT, // gfx12+ only. - X_CNT, // gfx1250. - NUM_EXTENDED_INST_CNTS, - NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS -}; -} // namespace +// Get the maximum wait count value for a given counter type. 
+static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits, + InstCounterType T) { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + case VA_VDST: + return Limits.VaVdstMax; + case VM_VSRC: + return Limits.VmVsrcMax; + default: + return 0; + } +} -namespace llvm { -template <> struct enum_iteration_traits<InstCounterType> { - static constexpr bool is_iterable = true; -}; -} // namespace llvm +static bool isSoftXcnt(MachineInstr &MI) { + return MI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft; +} -namespace { -// Return an iterator over all counters between LOAD_CNT (the first counter) -// and \c MaxCounter (exclusive, default value yields an enumeration over -// all counters). -auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { - return enum_seq(LOAD_CNT, MaxCounter); +static bool isAtomicRMW(MachineInstr &MI) { + return (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) && MI.mayLoad() && + MI.mayStore(); } -using RegInterval = std::pair<int, int>; - -struct HardwareLimits { - unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. - unsigned ExpcntMax; - unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. - unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. - unsigned SamplecntMax; // gfx12+ only. - unsigned BvhcntMax; // gfx12+ only. - unsigned KmcntMax; // gfx12+ only. - unsigned XcntMax; // gfx1250. +enum class AtomicRMWState { + NewBlock, // Start of a new atomic RMW block + InsideBlock, // Middle of an existing block + NotInBlock // Not in an atomic RMW block }; +/// Integer IDs used to track vector memory locations we may have to wait on. 
+/// Encoded as u16 chunks: +/// +/// [0, REGUNITS_END ): MCRegUnit +/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs +/// +/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary. +/// It gives (2 << 16) - 1 entries per category which is more than enough +/// for all register units. MCPhysReg is u16 so we don't even support >u16 +/// physical register numbers at this time, let alone >u16 register units. +/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END +/// is enough for all register units. +using VMEMID = uint32_t; + +enum : VMEMID { + TRACKINGID_RANGE_LEN = (1 << 16), + + // Important: MCRegUnits must always be tracked starting from 0, as we + // need to be able to convert between a MCRegUnit and a VMEMID freely. + REGUNITS_BEGIN = 0, + REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN, + + // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common" + // entry, which is updated for all LDS DMA operations encountered. + // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1. + NUM_LDSDMA = TRACKINGID_RANGE_LEN, + LDSDMA_BEGIN = REGUNITS_END, + LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA, +}; + +/// Convert a MCRegUnit to a VMEMID. 
+static constexpr VMEMID toVMEMID(MCRegUnit RU) { + return static_cast<unsigned>(RU); +} + #define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \ - DECL(VMEM_ACCESS) /* vmem read & write */ \ - DECL(VMEM_READ_ACCESS) /* vmem read */ \ + DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \ DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \ DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \ + DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \ DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \ DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \ DECL(VMEM_GROUP) /* vmem group */ \ @@ -129,7 +168,14 @@ struct HardwareLimits { DECL(EXP_POS_ACCESS) /* write to export position */ \ DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \ DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \ - DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ + DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \ + DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \ + DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \ + DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \ + DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \ + DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \ + DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \ + DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ // clang-format off #define AMDGPU_EVENT_ENUM(Name) Name, @@ -138,38 +184,33 @@ enum WaitEventType { NUM_WAIT_EVENTS }; #undef AMDGPU_EVENT_ENUM +} // namespace + +namespace llvm { +template <> struct enum_iteration_traits<WaitEventType> { + static constexpr bool is_iterable = true; +}; +} // namespace llvm + +namespace { + +/// Return an iterator over all events between VMEM_ACCESS (the first event) +/// and \c MaxEvent (exclusive, default value yields an enumeration over +/// all counters). 
+auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) { + return enum_seq(VMEM_ACCESS, MaxEvent); +} #define AMDGPU_EVENT_NAME(Name) #Name, static constexpr StringLiteral WaitEventTypeName[] = { AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME) }; #undef AMDGPU_EVENT_NAME +static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) { + return WaitEventTypeName[Event]; +} // clang-format on -// The mapping is: -// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs -// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots -// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs -// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC -// We reserve a fixed number of VGPR slots in the scoring tables for -// special tokens like SCMEM_LDS (needed for buffer load to LDS). -enum RegisterMapping { - SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets. - AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets. - SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets. - // Artificial register slots to track LDS writes into specific LDS locations - // if a location is known. When slots are exhausted or location is - // unknown use the first slot. The first slot is also always updated in - // addition to known location's slot to properly generate waits if dependent - // instruction's location is unknown. - FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores. - NUM_LDS_VGPRS = 9, // One more than the stores we track. - NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start. - NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS, - // Remaining non-allocatable registers - SCC = NUM_ALL_ALLOCATABLE -}; - // Enumerate different types of result-returning VMEM operations. 
Although // s_waitcnt orders them all with a single vmcnt counter, in the absence of // s_waitcnt only instructions of the same VmemType are guaranteed to write @@ -187,7 +228,7 @@ enum VmemType { // Maps values of InstCounterType to the instruction that waits on that // counter. Only used if GCNSubtarget::hasExtendedWaitCounts() -// returns true. +// returns true, and does not cover VA_VDST or VM_VSRC. static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, @@ -224,49 +265,80 @@ VmemType getVmemType(const MachineInstr &Inst) { return VMEM_NOSAMPLER; } -unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { - switch (T) { - case LOAD_CNT: - return Wait.LoadCnt; - case EXP_CNT: - return Wait.ExpCnt; - case DS_CNT: - return Wait.DsCnt; - case STORE_CNT: - return Wait.StoreCnt; - case SAMPLE_CNT: - return Wait.SampleCnt; - case BVH_CNT: - return Wait.BvhCnt; - case KM_CNT: - return Wait.KmCnt; - case X_CNT: - return Wait.XCnt; - default: - llvm_unreachable("bad InstCounterType"); - } -} - void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { - unsigned &WC = getCounterRef(Wait, T); - WC = std::min(WC, Count); + Wait.set(T, std::min(Wait.get(T), Count)); } -void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { - getCounterRef(Wait, T) = ~0u; -} +void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { Wait.set(T, ~0u); } -unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { - return getCounterRef(Wait, T); -} +/// A small set of events. +class WaitEventSet { + unsigned Mask = 0; -// Mapping from event to counter according to the table masks. 
-InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { - for (auto T : inst_counter_types()) { - if (masks[T] & (1 << E)) - return T; +public: + WaitEventSet() = default; + explicit constexpr WaitEventSet(WaitEventType Event) { + static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8, + "Not enough bits in Mask for all the events"); + Mask |= 1 << Event; + } + constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) { + for (auto &E : Events) { + Mask |= 1 << E; + } + } + void insert(const WaitEventType &Event) { Mask |= 1 << Event; } + void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); } + void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; } + bool contains(const WaitEventType &Event) const { + return Mask & (1 << Event); + } + /// \Returns true if this set contains all elements of \p Other. + bool contains(const WaitEventSet &Other) const { + return (~Mask & Other.Mask) == 0; + } + /// \Returns the intersection of this and \p Other. + WaitEventSet operator&(const WaitEventSet &Other) const { + auto Copy = *this; + Copy.Mask &= Other.Mask; + return Copy; + } + /// \Returns the union of this and \p Other. + WaitEventSet operator|(const WaitEventSet &Other) const { + auto Copy = *this; + Copy.Mask |= Other.Mask; + return Copy; + } + /// This set becomes the union of this and \p Other. + WaitEventSet &operator|=(const WaitEventSet &Other) { + Mask |= Other.Mask; + return *this; + } + /// This set becomes the intersection of this and \p Other. + WaitEventSet &operator&=(const WaitEventSet &Other) { + Mask &= Other.Mask; + return *this; + } + bool operator==(const WaitEventSet &Other) const { + return Mask == Other.Mask; + } + bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); } + bool empty() const { return Mask == 0; } + /// \Returns true if the set contains more than one element. 
+ bool twoOrMore() const { return Mask & (Mask - 1); } + operator bool() const { return !empty(); } + void print(raw_ostream &OS) const { + ListSeparator LS(", "); + for (WaitEventType Event : wait_events()) { + OS << LS << getWaitEventTypeName(Event); + } } - llvm_unreachable("event type has no associated counter"); + LLVM_DUMP_METHOD void dump() const; +}; + +void WaitEventSet::dump() const { + print(dbgs()); + dbgs() << "\n"; } class WaitcntBrackets; @@ -279,24 +351,33 @@ class WaitcntBrackets; // otherwise have had to become. class WaitcntGenerator { protected: - const GCNSubtarget *ST = nullptr; - const SIInstrInfo *TII = nullptr; + const GCNSubtarget &ST; + const SIInstrInfo &TII; AMDGPU::IsaVersion IV; InstCounterType MaxCounter; bool OptNone; + bool ExpandWaitcntProfiling = false; + const AMDGPU::HardwareLimits *Limits = nullptr; public: - WaitcntGenerator() = default; - WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter) - : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), - IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), + WaitcntGenerator() = delete; + WaitcntGenerator(const WaitcntGenerator &) = delete; + WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter, + const AMDGPU::HardwareLimits *Limits) + : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), + IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter), OptNone(MF.getFunction().hasOptNone() || - MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {} + MF.getTarget().getOptLevel() == CodeGenOptLevel::None), + ExpandWaitcntProfiling( + MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")), + Limits(Limits) {} // Return true if the current function should be compiled with no // optimization. 
bool isOptNone() const { return OptNone; } + const AMDGPU::HardwareLimits &getLimits() const { return *Limits; } + // Edits an existing sequence of wait count instructions according // to an incoming Waitcnt value, which is itself updated to reflect // any new wait count instructions which may need to be generated by @@ -316,39 +397,51 @@ public: // Transform a soft waitcnt into a normal one. bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; - // Generates new wait count instructions according to the value of + // Generates new wait count instructions according to the value of // Wait, returning true if any new instructions were created. + // ScoreBrackets is used for profiling expansion. virtual bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) = 0; + AMDGPU::Waitcnt Wait, + const WaitcntBrackets &ScoreBrackets) = 0; - // Returns an array of bit masks which can be used to map values in - // WaitEventType to corresponding counter values in InstCounterType. - virtual const unsigned *getWaitEventMask() const = 0; + // Returns the WaitEventSet that corresponds to counter \p T. + virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = 0; + + /// \returns the counter that corresponds to event \p E. + InstCounterType getCounterFromEvent(WaitEventType E) const { + for (auto T : inst_counter_types()) { + if (getWaitEvents(T).contains(E)) + return T; + } + llvm_unreachable("event type has no associated counter"); + } // Returns a new waitcnt with all counters except VScnt set to 0. If // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; virtual ~WaitcntGenerator() = default; - - // Create a mask value from the initializer list of wait event types. 
- static constexpr unsigned - eventMask(std::initializer_list<WaitEventType> Events) { - unsigned Mask = 0; - for (auto &E : Events) - Mask |= 1 << E; - - return Mask; - } }; -class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { -public: - WaitcntGeneratorPreGFX12() = default; - WaitcntGeneratorPreGFX12(const MachineFunction &MF) - : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {} +class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator { + static constexpr const WaitEventSet + WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { + WaitEventSet( + {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}), + WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}), + WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, + EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}), + WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), + WaitEventSet(), + WaitEventSet(), + WaitEventSet(), + WaitEventSet(), + WaitEventSet(), + WaitEventSet()}; +public: + using WaitcntGenerator::WaitcntGenerator; bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, @@ -356,35 +449,41 @@ public: bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; - - const unsigned *getWaitEventMask() const override { - assert(ST); + AMDGPU::Waitcnt Wait, + const WaitcntBrackets &ScoreBrackets) override; - static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { - eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, - VMEM_BVH_READ_ACCESS}), - eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}), - eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, - EXP_POS_ACCESS, EXP_LDS_ACCESS}), - eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), - 0, - 0, - 0, - 0}; - - return WaitEventMaskForInstPreGFX12; + const WaitEventSet &getWaitEvents(InstCounterType T) const override { + return 
WaitEventMaskForInstPreGFX12[T]; } AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; -class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { +class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator { +protected: + bool IsExpertMode; + static constexpr const WaitEventSet + WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { + WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}), + WaitEventSet({LDS_ACCESS, GDS_ACCESS}), + WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, + EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}), + WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), + WaitEventSet({VMEM_SAMPLER_READ_ACCESS}), + WaitEventSet({VMEM_BVH_READ_ACCESS}), + WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}), + WaitEventSet({VMEM_GROUP, SMEM_GROUP}), + WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE, + VGPR_XDL_WRITE}), + WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})}; + public: - WaitcntGeneratorGFX12Plus() = default; + WaitcntGeneratorGFX12Plus() = delete; WaitcntGeneratorGFX12Plus(const MachineFunction &MF, - InstCounterType MaxCounter) - : WaitcntGenerator(MF, MaxCounter) {} + InstCounterType MaxCounter, + const AMDGPU::HardwareLimits *Limits, + bool IsExpertMode) + : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {} bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -393,28 +492,22 @@ public: bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; - - const unsigned *getWaitEventMask() const override { - assert(ST); + AMDGPU::Waitcnt Wait, + const WaitcntBrackets &ScoreBrackets) override; - static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { - eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}), - eventMask({LDS_ACCESS, GDS_ACCESS}), - eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, - EXP_POS_ACCESS, EXP_LDS_ACCESS}), - eventMask({VMEM_WRITE_ACCESS, 
SCRATCH_WRITE_ACCESS}), - eventMask({VMEM_SAMPLER_READ_ACCESS}), - eventMask({VMEM_BVH_READ_ACCESS}), - eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}), - eventMask({VMEM_GROUP, SMEM_GROUP})}; - - return WaitEventMaskForInstGFX12Plus; + const WaitEventSet &getWaitEvents(InstCounterType T) const override { + return WaitEventMaskForInstGFX12Plus[T]; } AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; +// Flags indicating which counters should be flushed in a loop preheader. +struct PreheaderFlushFlags { + bool FlushVmCnt = false; + bool FlushDsCnt = false; +}; + class SIInsertWaitcnts { public: const GCNSubtarget *ST; @@ -423,11 +516,11 @@ public: const MachineRegisterInfo *MRI = nullptr; InstCounterType SmemAccessCounter; InstCounterType MaxCounter; - const unsigned *WaitEventMaskForInst; + bool IsExpertMode = false; private: DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; - DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; + DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush; MachineLoopInfo *MLI; MachinePostDominatorTree *PDT; AliasAnalysis *AA = nullptr; @@ -441,19 +534,18 @@ private: bool ForceEmitWaitcnt[NUM_INST_CNTS]; - // In any given run of this pass, WCG will point to one of these two - // generator objects, which must have been re-initialised before use - // from a value made using a subtarget constructor. - WaitcntGeneratorPreGFX12 WCGPreGFX12; - WaitcntGeneratorGFX12Plus WCGGFX12Plus; + std::unique_ptr<WaitcntGenerator> WCG; - WaitcntGenerator *WCG = nullptr; + // Remember call and return instructions in the function. + DenseSet<MachineInstr *> CallInsts; + DenseSet<MachineInstr *> ReturnInsts; - // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS - // message. - DenseSet<MachineInstr *> ReleaseVGPRInsts; + // Remember all S_ENDPGM instructions. 
The boolean flag is true if there might + // be outstanding stores but definitely no outstanding scratch stores, to help + // with insertion of DEALLOC_VGPRS messages. + DenseMap<MachineInstr *, bool> EndPgmInsts; - HardwareLimits Limits; + AMDGPU::HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -464,34 +556,15 @@ public: (void)ForceVMCounter; } - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } + const AMDGPU::HardwareLimits &getLimits() const { return Limits; } - bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); - bool isPreheaderToFlush(MachineBasicBlock &MBB, - const WaitcntBrackets &ScoreBrackets); + PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML, + const WaitcntBrackets &Brackets); + PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB, + const WaitcntBrackets &ScoreBrackets); bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; + bool isDSRead(const MachineInstr &MI) const; + bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const; bool run(MachineFunction &MF); void setForceEmitWaitcnt() { @@ -524,6 +597,9 @@ public: ForceEmitWaitcnt[SAMPLE_CNT] = false; ForceEmitWaitcnt[BVH_CNT] = false; } + + ForceEmitWaitcnt[VA_VDST] = false; + ForceEmitWaitcnt[VM_VSRC] = false; #endif // NDEBUG } @@ -531,8 +607,10 @@ public: // instruction. WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { switch (Inst.getOpcode()) { + // FIXME: GLOBAL_INV needs to be tracked with xcnt too. 
case AMDGPU::GLOBAL_INV: - return VMEM_READ_ACCESS; // tracked using loadcnt + return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write + // VGPRs case AMDGPU::GLOBAL_WB: case AMDGPU::GLOBAL_WBINV: return VMEM_WRITE_ACCESS; // tracked using storecnt @@ -542,7 +620,7 @@ public: // Maps VMEM access types to their corresponding WaitEventType. static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { - VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; + VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; assert(SIInstrInfo::isVMEM(Inst)); // LDS DMA loads are also stores, but on the LDS side. On the VMEM side @@ -551,22 +629,41 @@ public: return VMEM_ACCESS; if (Inst.mayStore() && (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { - // FLAT and SCRATCH instructions may access scratch. Other VMEM - // instructions do not. - if (TII->mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratch(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) - return VMEM_READ_ACCESS; + return VMEM_ACCESS; return VmemReadMapping[getVmemType(Inst)]; } + std::optional<WaitEventType> + getExpertSchedulingEventType(const MachineInstr &Inst) const; + + bool isAsync(const MachineInstr &MI) const { + if (!SIInstrInfo::isLDSDMA(MI)) + return false; + if (SIInstrInfo::usesASYNC_CNT(MI)) + return true; + const MachineOperand *Async = + TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync); + return Async && (Async->getImm()); + } + + bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const { + return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI); + } + + bool isAsyncLdsDmaWrite(const MachineInstr &MI) const { + return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI); + } + bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr, - bool FlushVmCnt); + 
PreheaderFlushFlags FlushFlags); bool generateWaitcnt(AMDGPU::Waitcnt Wait, MachineBasicBlock::instr_iterator It, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, @@ -579,6 +676,16 @@ public: WaitcntBrackets &ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); + void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + bool ExpertMode) const; + AtomicRMWState getAtomicRMWState(MachineInstr &MI, + AtomicRMWState PrevState) const; + const WaitEventSet &getWaitEvents(InstCounterType T) const { + return WCG->getWaitEvents(T); + } + InstCounterType getCounterFromEvent(WaitEventType E) const { + return WCG->getCounterFromEvent(E); + } }; // This objects maintains the current score brackets of each wait counter, and @@ -591,7 +698,30 @@ public: // "s_waitcnt 0" before use. class WaitcntBrackets { public: - WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) { + assert(Context->TRI->getNumRegUnits() < REGUNITS_END); + } + +#ifndef NDEBUG + ~WaitcntBrackets() { + unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0; + for (auto &[ID, Val] : VMem) { + if (Val.empty()) + ++NumUnusedVmem; + } + for (auto &[ID, Val] : SGPRs) { + if (Val.empty()) + ++NumUnusedSGPRs; + } + + if (NumUnusedVmem || NumUnusedSGPRs) { + errs() << "WaitcntBracket had unused entries at destruction time: " + << NumUnusedVmem << " VMem and " << NumUnusedSGPRs + << " SGPR unused entries\n"; + std::abort(); + } + } +#endif bool isSmemCounter(InstCounterType T) const { return T == Context->SmemAccessCounter || T == X_CNT; @@ -602,6 +732,18 @@ public: return T == X_CNT ? 
1 : 0; } + unsigned getOutstanding(InstCounterType T) const { + return ScoreUBs[T] - ScoreLBs[T]; + } + + bool hasPendingVMEM(VMEMID ID, InstCounterType T) const { + return getVMemScore(ID, T) > getScoreLB(T); + } + + /// \Return true if we have no score entries for counter \p T. + bool empty(InstCounterType T) const { return getScoreRange(T) == 0; } + +private: unsigned getScoreLB(InstCounterType T) const { assert(T < NUM_INST_CNTS); return ScoreLBs[T]; @@ -616,53 +758,58 @@ public: return getScoreUB(T) - getScoreLB(T); } - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - - if (GprNo < NUM_ALL_ALLOCATABLE) - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const { + auto It = SGPRs.find(RU); + return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0; + } - assert(GprNo == SCC); - return SCCScore; + unsigned getVMemScore(VMEMID TID, InstCounterType T) const { + auto It = VMem.find(TID); + return It != VMem.end() ? 
It->second.Scores[T] : 0; } +public: bool merge(const WaitcntBrackets &Other); - RegInterval getRegInterval(const MachineInstr *MI, - const MachineOperand &Op) const; - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { + simplifyWaitcnt(Wait, Wait); } + void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const; + void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const; + + void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, + AMDGPU::Waitcnt &Wait) const; + void determineWaitForLDSDMA(InstCounterType T, VMEMID TID, + AMDGPU::Waitcnt &Wait) const; + AMDGPU::Waitcnt determineAsyncWait(unsigned N); void tryClearSCCWriteEvent(MachineInstr *Inst); void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); void updateByEvent(WaitEventType E, MachineInstr &MI); + void recordAsyncMark(MachineInstr &MI); - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); + bool hasPendingEvent() const { return !PendingEvents.empty(); } + bool hasPendingEvent(WaitEventType E) const { + return PendingEvents.contains(E); } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; - assert((HasPending != 0) == 
(getScoreRange(T) != 0)); + bool hasPendingEvent(InstCounterType T) const { + bool HasPending = PendingEvents & Context->getWaitEvents(T); + assert(HasPending == !empty(T) && + "Expected pending events iff scoreboard is not empty"); return HasPending; } bool hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); + WaitEventSet Events = PendingEvents & Context->getWaitEvents(T); // Return true if more than one bit is set in Events. - return Events & (Events - 1); + return Events.twoOrMore(); } bool hasPendingFlat() const { @@ -683,33 +830,36 @@ public: unsigned getPendingGDSWait() const { return std::min(getScoreUB(DS_CNT) - LastGDS, - Context->getWaitCountMax(DS_CNT) - 1); + getWaitCountMax(Context->getLimits(), DS_CNT) - 1); } void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } // Return true if there might be pending writes to the vgpr-interval by VMEM // instructions with types different from V. - bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) + bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const { + for (MCRegUnit RU : regunits(Reg)) { + auto It = VMem.find(toVMEMID(RU)); + if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V))) return true; } return false; } - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; + void clearVgprVmemTypes(MCPhysReg Reg) { + for (MCRegUnit RU : regunits(Reg)) { + if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) { + It->second.VMEMTypes = 0; + if (It->second.empty()) + VMem.erase(It); + } } } void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, - getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); - PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; + 
setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + + getWaitCountMax(Context->getLimits(), STORE_CNT)); + PendingEvents |= Context->getWaitEvents(STORE_CNT); } ArrayRef<const MachineInstr *> getLDSDMAStores() const { @@ -718,11 +868,15 @@ public: bool hasPointSampleAccel(const MachineInstr &MI) const; bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; + MCPhysReg RU) const; void print(raw_ostream &) const; void dump() const { print(dbgs()); } + // Free up memory by removing empty entries from the DenseMap that track event + // scores. + void purgeEmptyTrackingData(); + private: struct MergeInfo { unsigned OldLB; @@ -730,8 +884,27 @@ private: unsigned MyShift; unsigned OtherShift; }; + + using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>; + + void determineWaitForScore(InstCounterType T, unsigned Score, + AMDGPU::Waitcnt &Wait) const; + static bool mergeScore(const MergeInfo &M, unsigned &Score, unsigned OtherScore); + bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos, + ArrayRef<CounterValueArray> OtherMarks); + + iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const { + assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC"); + if (!Context->TRI->isInAllocatableClass(Reg)) + return {{}, {}}; + const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg); + unsigned Size = Context->TRI->getRegSizeInBits(*RC); + if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) + Reg = Context->TRI->get32BitRegister(Reg); + return Context->TRI->regunits(Reg); + } void setScoreLB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); @@ -745,49 +918,95 @@ private: if (T != EXP_CNT) return; - if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT)) + ScoreLBs[EXP_CNT] = + ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT); + } + 
+ void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) { + const SIRegisterInfo *TRI = Context->TRI; + if (Reg == AMDGPU::SCC) { + SCCScore = Val; + } else if (TRI->isVectorRegister(*Context->MRI, Reg)) { + for (MCRegUnit RU : regunits(Reg)) + VMem[toVMEMID(RU)].Scores[T] = Val; + } else if (TRI->isSGPRReg(*Context->MRI, Reg)) { + auto STy = getSgprScoresIdx(T); + for (MCRegUnit RU : regunits(Reg)) + SGPRs[RU].Scores[STy] = Val; + } else { + llvm_unreachable("Register cannot be tracked/unknown register!"); + } } - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); + void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) { + VMem[TID].Scores[T] = Val; } - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op, - InstCounterType CntTy, unsigned Val); + void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); const SIInsertWaitcnts *Context; unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; + WaitEventSet PendingEvents; // Remember the last flat memory operation. unsigned LastFlat[NUM_INST_CNTS] = {0}; // Remember the last GDS operation. unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + + // The score tracking logic is fragmented as follows: + // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding. 
+ // - SGPRs: SGPR RegUnits + // - SCC: Non-allocatable and not general purpose: not a SGPR. + // + // For the VMem case, if the key is within the range of LDS DMA IDs, + // then the corresponding index into the `LDSDMAStores` vector below is: + // Key - LDSDMA_BEGIN - 1 + // This is because LDSDMA_BEGIN is a generic entry and does not have an + // associated MachineInstr. + // + // TODO: Could we track SCC alongside SGPRs so it's not longer a special case? + + struct VMEMInfo { + // Scores for all instruction counters. Zero-initialized. + CounterValueArray Scores{}; + // Bitmask of the VmemTypes of VMEM instructions for this VGPR. + unsigned VMEMTypes = 0; + + bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; } + }; + + struct SGPRInfo { + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps + // the X_CNT score. + std::array<unsigned, 2> Scores = {0}; + + bool empty() const { return !Scores[0] && !Scores[1]; } + }; + + DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA + DenseMap<MCRegUnit, SGPRInfo> SGPRs; + // Reg score for SCC. unsigned SCCScore = 0; // The unique instruction that has an SCC write pending, if there is one. const MachineInstr *PendingSCCWrite = nullptr; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is // alias info. One store is kept per unique AAInfo. - SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; + SmallVector<const MachineInstr *> LDSDMAStores; + + // State of all counters at each async mark encountered so far. 
+ SmallVector<CounterValueArray> AsyncMarks; + static constexpr unsigned MaxAsyncMarks = 16; + + // Track the upper bound score for async operations that are not part of a + // mark yet. Initialized to all zeros. + CounterValueArray AsyncScore{}; }; class SIInsertWaitcntsLegacy : public MachineFunctionPass { @@ -813,82 +1032,9 @@ public: } // end anonymous namespace -RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, - const MachineOperand &Op) const { - if (Op.getReg() == AMDGPU::SCC) - return {SCC, SCC + 1}; - - const SIRegisterInfo *TRI = Context->TRI; - const MachineRegisterInfo *MRI = Context->MRI; - - if (!TRI->isInAllocatableClass(Op.getReg())) - return {-1, -1}; - - // A use via a PW operand does not need a waitcnt. - // A partial write is not a WAW. - assert(!Op.getSubReg() || !Op.isUndef()); - - RegInterval Result; - - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); - unsigned RegIdx = TRI->getHWRegIndex(MCReg); - - const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); - unsigned Size = TRI->getRegSizeInBits(*RC); - - // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits - if (TRI->isVectorRegister(*MRI, Op.getReg())) { - unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); - assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET); - Result.first = Reg; - if (TRI->isAGPR(*MRI, Op.getReg())) - Result.first += AGPR_OFFSET; - assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); - assert(Size % 16 == 0); - Result.second = Result.first + (Size / 16); - - if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) { - // Regardless of which lo16/hi16 is used, consider the full 32-bit - // register used. - if (AMDGPU::isHi16Reg(MCReg, *TRI)) - Result.first -= 1; - else - Result.second += 1; - } - } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) { - // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar - // sources like SRC_PRIVATE_BASE. 
- Result.first = RegIdx + NUM_ALL_VGPRS; - Result.second = Result.first + divideCeil(Size, 32); - } else { - return {-1, -1}; - } - - return Result; -} - -void WaitcntBrackets::setScoreByInterval(RegInterval Interval, - InstCounterType CntTy, - unsigned Score) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (RegNo < NUM_ALL_VGPRS) { - VgprUB = std::max(VgprUB, RegNo); - VgprScores[CntTy][RegNo] = Score; - } else if (RegNo < NUM_ALL_ALLOCATABLE) { - SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS); - SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score; - } else { - assert(RegNo == SCC); - SCCScore = Score; - } - } -} - -void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, - const MachineOperand &Op, +void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy, unsigned Score) { - RegInterval Interval = getRegInterval(MI, Op); - setScoreByInterval(Interval, CntTy, Score); + setRegScore(Op.getReg().asMCReg(), CntTy, Score); } // Return true if the subtarget is one that enables Point Sample Acceleration @@ -911,16 +1057,17 @@ bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { // one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER // (this is the type that a point sample accelerated instruction effectively // becomes) -bool WaitcntBrackets::hasPointSamplePendingVmemTypes( - const MachineInstr &MI, RegInterval Interval) const { +bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI, + MCPhysReg Reg) const { if (!hasPointSampleAccel(MI)) return false; - return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER); + return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER); } void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); + InstCounterType T = Context->getCounterFromEvent(E); + assert(T < Context->MaxCounter); unsigned UB = 
getScoreUB(T); unsigned CurrScore = UB + 1; @@ -929,7 +1076,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // PendingEvents and ScoreUB need to be update regardless if this event // changes the score of a register or not. // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. - PendingEvents |= 1 << E; + PendingEvents.insert(E); setScoreUB(T, CurrScore); const SIRegisterInfo *TRI = Context->TRI; @@ -943,57 +1090,52 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // All GDS operations must protect their address register (same as // export.) if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr)) - setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore); + setScoreByOperand(*AddrOp, EXP_CNT, CurrScore); if (Inst.mayStore()) { if (const auto *Data0 = TII->getNamedOperand(Inst, AMDGPU::OpName::data0)) - setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore); + setScoreByOperand(*Data0, EXP_CNT, CurrScore); if (const auto *Data1 = TII->getNamedOperand(Inst, AMDGPU::OpName::data1)) - setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore); + setScoreByOperand(*Data1, EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && Inst.getOpcode() != AMDGPU::DS_APPEND && Inst.getOpcode() != AMDGPU::DS_CONSUME && Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); + setScoreByOperand(Op, EXP_CNT, CurrScore); } } } else if (TII->isFLAT(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, 
AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMIMG(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMTBUF(Inst)) { if (Inst.mayStore()) - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (TII->isMUBUF(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isLDSDIR(Inst)) { // LDSDIR instructions attach the score to the destination. - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), EXP_CNT, CurrScore); } else { if (TII->isEXP(Inst)) { @@ -1003,27 +1145,37 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // score. for (MachineOperand &DefMO : Inst.all_defs()) { if (TRI->isVGPR(*MRI, DefMO.getReg())) { - setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore); + setScoreByOperand(DefMO, EXP_CNT, CurrScore); } } } for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); + setScoreByOperand(Op, EXP_CNT, CurrScore); } } } else if (T == X_CNT) { WaitEventType OtherEvent = E == SMEM_GROUP ? 
VMEM_GROUP : SMEM_GROUP; - if (PendingEvents & (1 << OtherEvent)) { + if (PendingEvents.contains(OtherEvent)) { // Hardware inserts an implicit xcnt between interleaved // SMEM and VMEM operations. So there will never be // outstanding address translations for both SMEM and // VMEM at the same time. setScoreLB(T, getScoreUB(T) - 1); - PendingEvents &= ~(1 << OtherEvent); + PendingEvents.remove(OtherEvent); } for (const MachineOperand &Op : Inst.all_uses()) - setScoreByOperand(&Inst, Op, T, CurrScore); + setScoreByOperand(Op, T, CurrScore); + } else if (T == VA_VDST || T == VM_VSRC) { + // Match the score to the VGPR destination or source registers as + // appropriate + for (const MachineOperand &Op : Inst.operands()) { + if (!Op.isReg() || (T == VA_VDST && Op.isUse()) || + (T == VM_VSRC && Op.isDef())) + continue; + if (TRI->isVectorRegister(*Context->MRI, Op.getReg())) + setScoreByOperand(Op, T, CurrScore); + } } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the destination registers. // @@ -1035,9 +1187,8 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // Special cases where implicit register defs exists, such as M0 or VCC, // but none with memory instructions. 
for (const MachineOperand &Op : Inst.defs()) { - RegInterval Interval = getRegInterval(&Inst, Op); if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { - if (Interval.first >= NUM_ALL_VGPRS) + if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper continue; if (updateVMCntOnly(Inst)) { // updateVMCntOnly should only leave us with VGPRs @@ -1050,16 +1201,20 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // this with another potential dependency if (hasPointSampleAccel(Inst)) TypesMask |= 1 << VMEM_NOSAMPLER; - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) - VgprVmemTypes[RegNo] |= TypesMask; + for (MCRegUnit RU : regunits(Op.getReg().asMCReg())) + VMem[toVMEMID(RU)].VMEMTypes |= TypesMask; } } - setScoreByInterval(Interval, T, CurrScore); + setScoreByOperand(Op, T, CurrScore); } if (Inst.mayStore() && - (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { + (TII->isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) { // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS // written can be accessed. A load from LDS to VMEM does not need a wait. + // + // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then + // there is a MachineInstr in LDSDMAStores used to track this LDSDMA + // store. The "Slot" is the index into LDSDMAStores + 1. unsigned Slot = 0; for (const auto *MemOp : Inst.memoperands()) { if (!MemOp->isStore() || @@ -1072,9 +1227,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // original memory object and practically produced in the module LDS // lowering pass. If there is no scope available we will not be able // to disambiguate LDS aliasing as after the module lowering all LDS - // is squashed into a single big object. Do not attempt to use one of - // the limited LDSDMAStores for something we will not be able to use - // anyway. + // is squashed into a single big object. 
if (!AAI || !AAI.Scope) break; for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { @@ -1085,61 +1238,93 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } } } - if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1) + if (Slot) break; + // The slot may not be valid because it can be >= NUM_LDSDMA which + // means the scoreboard cannot track it. We still want to preserve the + // MI in order to check alias information, though. LDSDMAStores.push_back(&Inst); Slot = LDSDMAStores.size(); break; } - setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); - if (Slot) - setRegScore(FIRST_LDS_VGPR, T, CurrScore); + setVMemScore(LDSDMA_BEGIN, T, CurrScore); + if (Slot && Slot < NUM_LDSDMA) + setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore); + } + + // FIXME: Not supported on GFX12 yet. Newer async operations use other + // counters too, so will need a map from instruction or event types to + // counter types. + if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) { + assert(!SIInstrInfo::usesASYNC_CNT(Inst)); + AsyncScore[T] = CurrScore; } if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { - setRegScore(SCC, T, CurrScore); + setRegScore(AMDGPU::SCC, T, CurrScore); PendingSCCWrite = &Inst; } } } +void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) { + // In the absence of loops, AsyncMarks can grow linearly with the program + // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a + // limit every time we push a new mark, but that seems like unnecessary work + // in practical cases. We do separately truncate the array when processing a + // loop, which should be sufficient. 
+ AsyncMarks.push_back(AsyncScore); + AsyncScore = {}; + LLVM_DEBUG({ + dbgs() << "recordAsyncMark:\n" << Inst; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); +} + void WaitcntBrackets::print(raw_ostream &OS) const { const GCNSubtarget *ST = Context->ST; - OS << '\n'; for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); - switch (T) { case LOAD_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case DS_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case EXP_CNT: - OS << " EXP_CNT(" << SR << "): "; + OS << " EXP_CNT(" << SR << "):"; break; case STORE_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case SAMPLE_CNT: - OS << " SAMPLE_CNT(" << SR << "): "; + OS << " SAMPLE_CNT(" << SR << "):"; break; case BVH_CNT: - OS << " BVH_CNT(" << SR << "): "; + OS << " BVH_CNT(" << SR << "):"; break; case KM_CNT: - OS << " KM_CNT(" << SR << "): "; + OS << " KM_CNT(" << SR << "):"; break; case X_CNT: - OS << " X_CNT(" << SR << "): "; + OS << " X_CNT(" << SR << "):"; + break; + case VA_VDST: + OS << " VA_VDST(" << SR << "): "; + break; + case VM_VSRC: + OS << " VM_VSRC(" << SR << "): "; break; default: - OS << " UNKNOWN(" << SR << "): "; + OS << " UNKNOWN(" << SR << "):"; break; } @@ -1147,29 +1332,38 @@ void WaitcntBrackets::print(raw_ostream &OS) const { // Print vgpr scores. 
unsigned LB = getScoreLB(T); - for (int J = 0; J <= VgprUB; J++) { - unsigned RegScore = getRegScore(J, T); + SmallVector<VMEMID> SortedVMEMIDs(VMem.keys()); + sort(SortedVMEMIDs); + + for (auto ID : SortedVMEMIDs) { + unsigned RegScore = VMem.at(ID).Scores[T]; if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - if (J < FIRST_LDS_VGPR) { - OS << RelScore << ":v" << J << " "; + if (ID < REGUNITS_END) { + OS << ' ' << RelScore << ":vRU" << ID; } else { - OS << RelScore << ":ds "; + assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END && + "Unhandled/unexpected ID value!"); + OS << ' ' << RelScore << ":LDSDMA" << ID; } } + // Also need to print sgpr scores for lgkm_cnt or xcnt. if (isSmemCounter(T)) { - for (int J = 0; J <= SgprUB; J++) { - unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); + SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys()); + sort(SortedSMEMIDs); + for (auto ID : SortedSMEMIDs) { + unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)]; if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - OS << RelScore << ":s" << J << " "; + OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID); } } + if (T == KM_CNT && SCCScore > 0) - OS << SCCScore << ":scc "; + OS << ' ' << SCCScore << ":scc"; } OS << '\n'; } @@ -1187,20 +1381,70 @@ void WaitcntBrackets::print(raw_ostream &OS) const { } OS << '\n'; + OS << "Async score: "; + if (AsyncScore.empty()) + OS << "none"; + else + llvm::interleaveComma(AsyncScore, OS); + OS << '\n'; + + OS << "Async marks: " << AsyncMarks.size() << '\n'; + + for (const auto &Mark : AsyncMarks) { + for (auto T : inst_counter_types()) { + unsigned MarkedScore = Mark[T]; + switch (T) { + case LOAD_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") + << "_CNT: " << MarkedScore; + break; + case DS_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? 
"DS" : "LGKM") + << "_CNT: " << MarkedScore; + break; + case EXP_CNT: + OS << " EXP_CNT: " << MarkedScore; + break; + case STORE_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") + << "_CNT: " << MarkedScore; + break; + case SAMPLE_CNT: + OS << " SAMPLE_CNT: " << MarkedScore; + break; + case BVH_CNT: + OS << " BVH_CNT: " << MarkedScore; + break; + case KM_CNT: + OS << " KM_CNT: " << MarkedScore; + break; + case X_CNT: + OS << " X_CNT: " << MarkedScore; + break; + default: + OS << " UNKNOWN: " << MarkedScore; + break; + } + } + OS << '\n'; + } OS << '\n'; } -/// Simplify the waitcnt, in the sense of removing redundant counts, and return -/// whether a waitcnt instruction is needed at all. -void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { - simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); - simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); - simplifyWaitcnt(DS_CNT, Wait.DsCnt); - simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); - simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); - simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); - simplifyWaitcnt(KM_CNT, Wait.KmCnt); - simplifyWaitcnt(X_CNT, Wait.XCnt); +/// Simplify \p UpdateWait by removing waits that are redundant based on the +/// current WaitcntBrackets and any other waits specified in \p CheckWait. 
+void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const { + simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt); + simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt); + simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt); + simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt); + simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt); + simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt); + simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt); + simplifyXcnt(CheckWait, UpdateWait); + simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst); + simplifyVmVsrc(CheckWait, UpdateWait); } void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -1212,52 +1456,155 @@ void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, Count = ~0u; } -void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const { +void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const { + // Try to simplify xcnt further by checking for joint kmcnt and loadcnt + // optimizations. On entry to a block with multiple predescessors, there may + // be pending SMEM and VMEM events active at the same time. + // In such cases, only clear one active event at a time. + // TODO: Revisit xcnt optimizations for gfx1250. + // Wait on XCNT is redundant if we are already waiting for a load to complete. + // SMEM can return out of order, so only omit XCNT wait if we are waiting till + // zero. + if (CheckWait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) + UpdateWait.XCnt = ~0u; + // If we have pending store we cannot optimize XCnt because we do not wait for + // stores. VMEM loads retun in order, so if we only have loads XCnt is + // decremented to the same number as LOADCnt. 
+ if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && + !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt) + UpdateWait.XCnt = ~0u; + simplifyWaitcnt(X_CNT, UpdateWait.XCnt); +} + +void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const { + // Waiting for some counters implies waiting for VM_VSRC, since an + // instruction that decrements a counter on completion would have + // decremented VM_VSRC once its VGPR operands had been read. + if (CheckWait.VmVsrc >= + std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt, + CheckWait.BvhCnt, CheckWait.DsCnt})) + UpdateWait.VmVsrc = ~0u; + simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc); +} + +void WaitcntBrackets::purgeEmptyTrackingData() { + for (auto &[K, V] : make_early_inc_range(VMem)) { + if (V.empty()) + VMem.erase(K); + } + for (auto &[K, V] : make_early_inc_range(SGPRs)) { + if (V.empty()) + SGPRs.erase(K); + } +} + +void WaitcntBrackets::determineWaitForScore(InstCounterType T, + unsigned ScoreToWait, + AMDGPU::Waitcnt &Wait) const { const unsigned LB = getScoreLB(T); const unsigned UB = getScoreUB(T); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - unsigned ScoreToWait = getRegScore(RegNo, T); - - // If the score of src_operand falls within the bracket, we need an - // s_waitcnt instruction. - if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !Context->ST->hasFlatLgkmVMemCountInOrder()) { - // If there is a pending FLAT operation, and this is a VMem or LGKM - // waitcnt and the target can report early completion, then we need - // to force a waitcnt 0. - addWait(Wait, T, 0); - } else if (counterOutOfOrder(T)) { - // Counter can get decremented out-of-order when there - // are multiple types event in the bracket. Also emit an s_wait counter - // with a conservative value of 0 for the counter. 
- addWait(Wait, T, 0); - } else { - // If a counter has been maxed out avoid overflow by waiting for - // MAX(CounterType) - 1 instead. - unsigned NeededWait = - std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); - addWait(Wait, T, NeededWait); - } + + // If the score falls within the bracket, we need a waitcnt. + if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { + if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && + !Context->ST->hasFlatLgkmVMemCountInOrder()) { + // If there is a pending FLAT operation, and this is a VMem or LGKM + // waitcnt and the target can report early completion, then we need + // to force a waitcnt 0. + addWait(Wait, T, 0); + } else if (counterOutOfOrder(T)) { + // Counter can get decremented out-of-order when there + // are multiple types event in the bracket. Also emit an s_wait counter + // with a conservative value of 0 for the counter. + addWait(Wait, T, 0); + } else { + // If a counter has been maxed out avoid overflow by waiting for + // MAX(CounterType) - 1 instead. + unsigned NeededWait = std::min( + UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1); + addWait(Wait, T, NeededWait); } } } +AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) { + LLVM_DEBUG({ + dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size() + << ":\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + AMDGPU::Waitcnt Wait; + if (AsyncMarks.size() == MaxAsyncMarks) { + // Enforcing MaxAsyncMarks here is unnecessary work because the size of + // MaxAsyncMarks is linear when traversing straightline code. But we do + // need to check if truncation may have occured at a merge, and adjust N + // to ensure that a wait is generated. + LLVM_DEBUG(dbgs() << "Possible truncation. 
Ensuring a non-trivial wait.\n"); + N = std::min(N, (unsigned)MaxAsyncMarks - 1); + } + + if (AsyncMarks.size() <= N) { + LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n"); + return Wait; + } + + size_t MarkIndex = AsyncMarks.size() - N - 1; + const auto &RequiredMark = AsyncMarks[MarkIndex]; + for (InstCounterType T : inst_counter_types()) + determineWaitForScore(T, RequiredMark[T], Wait); + + // Immediately remove the waited mark and all older ones + // This happens BEFORE the wait is actually inserted, which is fine + // because we've already extracted the wait requirements + LLVM_DEBUG({ + dbgs() << "Removing " << (MarkIndex + 1) + << " async marks after determining wait\n"; + }); + AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1); + + LLVM_DEBUG(dbgs() << "Waits to add: " << Wait); + return Wait; +} + +void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, + AMDGPU::Waitcnt &Wait) const { + if (Reg == AMDGPU::SCC) { + determineWaitForScore(T, SCCScore, Wait); + } else { + bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg); + for (MCRegUnit RU : regunits(Reg)) + determineWaitForScore( + T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T), + Wait); + } +} + +void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID, + AMDGPU::Waitcnt &Wait) const { + assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END); + determineWaitForScore(T, getVMemScore(TID, T), Wait); +} + void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) { // S_BARRIER_WAIT on the same barrier guarantees that the pending write to // SCC has landed if (PendingSCCWrite && PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM && PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) { - unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE; + WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE); // If this SCC_WRITE is the only pending KM_CNT event, clear counter. 
- if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) == + if ((PendingEvents & Context->getWaitEvents(KM_CNT)) == SCC_WRITE_PendingEvent) { setScoreLB(KM_CNT, getScoreUB(KM_CNT)); } - PendingEvents &= ~SCC_WRITE_PendingEvent; + PendingEvents.remove(SCC_WRITE_PendingEvent); PendingSCCWrite = nullptr; } } @@ -1270,7 +1617,9 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); applyWaitcnt(BVH_CNT, Wait.BvhCnt); applyWaitcnt(KM_CNT, Wait.KmCnt); - applyXcnt(Wait); + applyWaitcnt(X_CNT, Wait.XCnt); + applyWaitcnt(VA_VDST, Wait.VaVdst); + applyWaitcnt(VM_VSRC, Wait.VmVsrc); } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -1283,25 +1632,22 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~Context->WaitEventMaskForInst[T]; + PendingEvents.remove(Context->getWaitEvents(T)); } -} - -void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { - // Wait on XCNT is redundant if we are already waiting for a load to complete. - // SMEM can return out of order, so only omit XCNT wait if we are waiting till - // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) - return applyWaitcnt(X_CNT, 0); - - // If we have pending store we cannot optimize XCnt because we do not wait for - // stores. VMEM loads retun in order, so if we only have loads XCnt is - // decremented to the same number as LOADCnt. 
- if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) - return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); - applyWaitcnt(X_CNT, Wait.XCnt); + if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) { + if (!hasMixedPendingEvents(X_CNT)) + applyWaitcnt(X_CNT, 0); + else + PendingEvents.remove(SMEM_GROUP); + } + if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) && + !hasPendingEvent(STORE_CNT)) { + if (!hasMixedPendingEvents(X_CNT)) + applyWaitcnt(X_CNT, Count); + else if (Count == 0) + PendingEvents.remove(VMEM_GROUP); + } } // Where there are multiple types of event in the bracket of a counter, @@ -1311,6 +1657,20 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; + + // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS), + // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause + // out-of-order completion. 
+ if (T == LOAD_CNT) { + unsigned Events = hasPendingEvent(T); + // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed + // events + Events &= ~(1 << GLOBAL_INV_ACCESS); + // Return true only if there are still multiple event types after removing + // GLOBAL_INV + return Events & (Events - 1); + } + return hasMixedPendingEvents(T); } @@ -1373,7 +1733,7 @@ bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { if (Opcode == Waitcnt->getOpcode()) return false; - Waitcnt->setDesc(TII->get(Opcode)); + Waitcnt->setDesc(TII.get(Opcode)); return true; } @@ -1385,7 +1745,6 @@ bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { - assert(ST); assert(isNormalMode(MaxCounter)); bool Modified = false; @@ -1394,7 +1753,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( LLVM_DEBUG({ dbgs() << "PreGFX12::applyPreexistingWaitcnt at: "; - if (It == OldWaitcntInstr.getParent()->instr_end()) + if (It.isEnd()) dbgs() << "end of block\n"; else dbgs() << *It; @@ -1427,11 +1786,11 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( } else WaitcntInstr = &II; } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { - assert(ST->hasVMemToLDSLoad()); + assert(ST.hasVMemToLDSLoad()); LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II - << "Before: " << Wait.LoadCnt << '\n';); - ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); - LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + << "Before: " << Wait << '\n';); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';); // It is possible (but unlikely) that this is the only wait instruction, // in which case, we exit this loop without a WaitcntInstr to consume @@ -1440,12 +1799,17 @@ bool 
WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( // possibility in an articial MIR test since such a situation cannot be // recreated by running the memory legalizer. II.eraseFromParent(); + } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) { + unsigned N = II.getOperand(0).getImm(); + LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';); + AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N); + Wait = Wait.combined(OldWait); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); unsigned OldVSCnt = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); if (TrySimplify) ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt); Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); @@ -1470,13 +1834,12 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Wait.ExpCnt = ~0u; Wait.DsCnt = ~0u; - LLVM_DEBUG(It == WaitcntInstr->getParent()->end() - ? dbgs() - << "applied pre-existing waitcnt\n" - << "New Instr at block end: " << *WaitcntInstr << '\n' - : dbgs() << "applied pre-existing waitcnt\n" - << "Old Instr: " << *It - << "New Instr: " << *WaitcntInstr << '\n'); + LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n" + << "New Instr at block end: " + << *WaitcntInstr << '\n' + : dbgs() << "applied pre-existing waitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitcntInstr << '\n'); } if (WaitcntVsCntInstr) { @@ -1487,7 +1850,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); Wait.StoreCnt = ~0u; - LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() + LLVM_DEBUG(It.isEnd() ? 
dbgs() << "applied pre-existing waitcnt\n" << "New Instr at block end: " << *WaitcntVsCntInstr << '\n' @@ -1503,38 +1866,100 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( /// required counters in \p Wait bool WaitcntGeneratorPreGFX12::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { - assert(ST); + AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) { assert(isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); + // Helper to emit expanded waitcnt sequence for profiling. + // Emits waitcnts from (Outstanding-1) down to Target. + // The EmitWaitcnt callback emits a single waitcnt. + auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, + auto EmitWaitcnt) { + do { + EmitWaitcnt(--Outstanding); + } while (Outstanding > Target); + Modified = true; + }; + // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a // single instruction while VScnt has its own instruction. if (Wait.hasWaitExceptStoreCnt()) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; + // If profiling expansion is enabled, emit an expanded sequence + if (ExpandWaitcntProfiling) { + // Check if any of the counters to be waited on are out-of-order. + // If so, fall back to normal (non-expanded) behavior since expansion + // would provide misleading profiling information. 
+ bool AnyOutOfOrder = false; + for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { + unsigned WaitCnt = Wait.get(CT); + if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) { + AnyOutOfOrder = true; + break; + } + } - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + if (AnyOutOfOrder) { + // Fall back to non-expanded wait + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + } else { + // All counters are in-order, safe to expand + for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { + unsigned WaitCnt = Wait.get(CT); + if (WaitCnt == ~0u) + continue; + + unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT), + getWaitCountMax(getLimits(), CT) - 1); + EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) { + AMDGPU::Waitcnt W; + W.set(CT, Count); + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)) + .addImm(AMDGPU::encodeWaitcnt(IV, W)); + }); + } + } + } else { + // Normal behavior: emit single combined waitcnt + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } if (Wait.hasWaitStoreCnt()) { - assert(ST->hasVscnt()); - - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + assert(ST.hasVscnt()); + + if (ExpandWaitcntProfiling && Wait.StoreCnt != ~0u && + !ScoreBrackets.counterOutOfOrder(STORE_CNT)) { + // Only expand if counter is not out-of-order + unsigned Outstanding = + std::min(ScoreBrackets.getOutstanding(STORE_CNT), + getWaitCountMax(getLimits(), STORE_CNT) - 1); + EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned 
Count) { + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.StoreCnt); - Modified = true; + .addImm(Count); + }); + } else { + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.StoreCnt); + Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } return Modified; @@ -1542,13 +1967,14 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( AMDGPU::Waitcnt WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { - return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u); } AMDGPU::Waitcnt WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { + unsigned ExpertVal = IsExpertMode ? 0 : ~0u; return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 
0 : ~0u, 0, 0, 0, - ~0u /* XCNT */); + ~0u /* XCNT */, ExpertVal, ExpertVal); } /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and @@ -1558,22 +1984,25 @@ WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { - assert(ST); assert(!isNormalMode(MaxCounter)); bool Modified = false; MachineInstr *CombinedLoadDsCntInstr = nullptr; MachineInstr *CombinedStoreDsCntInstr = nullptr; + MachineInstr *WaitcntDepctrInstr = nullptr; MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {}; LLVM_DEBUG({ dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: "; - if (It == OldWaitcntInstr.getParent()->instr_end()) + if (It.isEnd()) dbgs() << "end of block\n"; else dbgs() << *It; }); + // Accumulate waits that should not be simplified. + AMDGPU::Waitcnt RequiredWait; + for (auto &II : make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { LLVM_DEBUG(dbgs() << "pre-existing iter: " << II); @@ -1597,45 +2026,81 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) { unsigned OldEnc = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc); if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(OldWait); - Wait = Wait.combined(OldWait); + Wait = Wait.combined(OldWait); + else + RequiredWait = RequiredWait.combined(OldWait); UpdatableInstr = &CombinedLoadDsCntInstr; } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { unsigned OldEnc = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc); if (TrySimplify) + Wait = Wait.combined(OldWait); + else + 
RequiredWait = RequiredWait.combined(OldWait); + UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) { + unsigned OldEnc = + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + AMDGPU::Waitcnt OldWait; + OldWait.VaVdst = AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc); + OldWait.VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc); + if (TrySimplify) ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); - UpdatableInstr = &CombinedStoreDsCntInstr; + UpdatableInstr = &WaitcntDepctrInstr; } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { // Architectures higher than GFX10 do not have direct loads to // LDS, so no work required here yet. II.eraseFromParent(); continue; + } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) { + reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet"); } else { std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); assert(CT.has_value()); unsigned OldCnt = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt); - addWait(Wait, CT.value(), OldCnt); + addWait(Wait, CT.value(), OldCnt); + else + addWait(RequiredWait, CT.value(), OldCnt); UpdatableInstr = &WaitInstrs[CT.value()]; } // Merge consecutive waitcnt of the same type by erasing multiples. if (!*UpdatableInstr) { *UpdatableInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) { + // S_WAITCNT_DEPCTR requires special care. Don't remove a + // duplicate if it is waiting on things other than VA_VDST or + // VM_VSRC. If that is the case, just make sure the VA_VDST and + // VM_VSRC subfields of the operand are set to the "no wait" + // values. 
+ + unsigned Enc = TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u); + Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u); + + if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) { + Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc); + Modified |= promoteSoftWaitCnt(&II); + } else { + II.eraseFromParent(); + Modified = true; + } } else { II.eraseFromParent(); Modified = true; } } + ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait); + Wait = Wait.combined(RequiredWait); + if (CombinedLoadDsCntInstr) { // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need // to be waited for. Otherwise, let the instruction be deleted so @@ -1644,6 +2109,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( // createNewWaitcnt(). As a side effect, resetting the wait counts will // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by // the loop below that deals with single counter instructions. + // + // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since + // instructions that have decremented LOAD_CNT or DS_CNT on completion + // will have needed to wait for their register sources to be available + // first. if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) { unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait); Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr, @@ -1654,13 +2124,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( Wait.LoadCnt = ~0u; Wait.DsCnt = ~0u; - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() - ? dbgs() << "applied pre-existing waitcnt\n" - << "New Instr at block end: " - << *CombinedLoadDsCntInstr << '\n' - : dbgs() << "applied pre-existing waitcnt\n" - << "Old Instr: " << *It << "New Instr: " - << *CombinedLoadDsCntInstr << '\n'); + LLVM_DEBUG(It.isEnd() ? 
dbgs() << "applied pre-existing waitcnt\n" + << "New Instr at block end: " + << *CombinedLoadDsCntInstr << '\n' + : dbgs() << "applied pre-existing waitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *CombinedLoadDsCntInstr << '\n'); } else { CombinedLoadDsCntInstr->eraseFromParent(); Modified = true; @@ -1679,13 +2148,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( Wait.StoreCnt = ~0u; Wait.DsCnt = ~0u; - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() - ? dbgs() << "applied pre-existing waitcnt\n" - << "New Instr at block end: " - << *CombinedStoreDsCntInstr << '\n' - : dbgs() << "applied pre-existing waitcnt\n" - << "Old Instr: " << *It << "New Instr: " - << *CombinedStoreDsCntInstr << '\n'); + LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n" + << "New Instr at block end: " + << *CombinedStoreDsCntInstr << '\n' + : dbgs() << "applied pre-existing waitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *CombinedStoreDsCntInstr << '\n'); } else { CombinedStoreDsCntInstr->eraseFromParent(); Modified = true; @@ -1729,7 +2197,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( if (!WaitInstrs[CT]) continue; - unsigned NewCnt = getWait(Wait, CT); + unsigned NewCnt = Wait.get(CT); if (NewCnt != ~0u) { Modified |= updateOperandIfDifferent(*WaitInstrs[CT], AMDGPU::OpName::simm16, NewCnt); @@ -1738,7 +2206,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.applyWaitcnt(CT, NewCnt); setNoWait(Wait, CT); - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n" << "New Instr at block end: " << *WaitInstrs[CT] << '\n' @@ -1751,19 +2219,86 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( } } + if (WaitcntDepctrInstr) { + // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC + // subfields with the new required values. 
+ unsigned Enc = + TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16) + ->getImm(); + Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc); + Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst); + + ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst); + ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc); + Wait.VaVdst = ~0u; + Wait.VmVsrc = ~0u; + + // If that new encoded Depctr immediate would actually still wait + // for anything, update the instruction's operand. Otherwise it can + // just be deleted. + if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) { + Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr, + AMDGPU::OpName::simm16, Enc); + LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " + << *WaitcntDepctrInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *WaitcntDepctrInstr << '\n'); + } else { + WaitcntDepctrInstr->eraseFromParent(); + Modified = true; + } + } + return Modified; } /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { - assert(ST); + AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) { assert(!isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); + // Helper to emit expanded waitcnt sequence for profiling. 
+ auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, + auto EmitWaitcnt) { + for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I) + EmitWaitcnt(I); + EmitWaitcnt(Target); + Modified = true; + }; + + // For GFX12+, we use separate wait instructions, which makes expansion + // simpler + if (ExpandWaitcntProfiling) { + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + unsigned Count = Wait.get(CT); + if (Count == ~0u) + continue; + + // Skip expansion for out-of-order counters - emit normal wait instead + if (ScoreBrackets.counterOutOfOrder(CT)) { + BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT])) + .addImm(Count); + Modified = true; + continue; + } + + unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT), + getWaitCountMax(getLimits(), CT) - 1); + EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) { + BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT])) + .addImm(Val); + }); + } + return Modified; + } + + // Normal behavior (no expansion) // Check for opportunities to use combined wait instructions. 
if (Wait.DsCnt != ~0u) { MachineInstr *SWaitInst = nullptr; @@ -1771,7 +2306,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( if (Wait.LoadCnt != ~0u) { unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); - SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) + SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) .addImm(Enc); Wait.LoadCnt = ~0u; @@ -1779,9 +2314,8 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( } else if (Wait.StoreCnt != ~0u) { unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); - SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT)) - .addImm(Enc); + SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT)) + .addImm(Enc); Wait.StoreCnt = ~0u; Wait.DsCnt = ~0u; @@ -1790,7 +2324,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( if (SWaitInst) { Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n"; if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; dbgs() << "New Instr: " << *SWaitInst << '\n'); } @@ -1800,16 +2334,31 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( // waiting for. 
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { - unsigned Count = getWait(Wait, CT); + unsigned Count = Wait.get(CT); if (Count == ~0u) continue; [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT])) .addImm(Count); Modified = true; + LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + + if (Wait.hasWaitDepctr()) { + assert(IsExpertMode); + unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, ST); + Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst); + + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc); + + Modified = true; + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; dbgs() << "New Instr: " << *SWaitInst << '\n'); @@ -1818,19 +2367,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( return Modified; } -/// \returns true if the callee inserts an s_waitcnt 0 on function entry. -static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { - // Currently all conventions wait, but this may not always be the case. - // - // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make - // senses to omit the wait and do it in the caller. - return true; -} - -/// \returns true if the callee is expected to wait for any outstanding waits -/// before returning. -static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; } - /// Generate s_waitcnt instruction to be placed before cur_Inst. /// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. @@ -1841,12 +2377,13 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; } /// and if so what the value of each counter is. 
/// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to -/// flush the vmcnt counter here. -bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, - WaitcntBrackets &ScoreBrackets, - MachineInstr *OldWaitcntInstr, - bool FlushVmCnt) { +/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here. +/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here +/// (GFX12+ only, where DS_CNT is a separate counter). +bool SIInsertWaitcnts::generateWaitcntInstBefore( + MachineInstr &MI, WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) { + LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs());); setForceEmitWaitcnt(); assert(!MI.isMetaInstruction()); @@ -1854,54 +2391,70 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, AMDGPU::Waitcnt Wait; const unsigned Opc = MI.getOpcode(); - // FIXME: This should have already been handled by the memory legalizer. - // Removing this currently doesn't affect any lit tests, but we need to - // verify that nothing was relying on this. The number of buffer invalidates - // being handled here should not be expanded. - if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC || - Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV || - Opc == AMDGPU::BUFFER_GL1_INV) { - Wait.LoadCnt = 0; - } - - // All waits must be resolved at call return. - // NOTE: this could be improved with knowledge of all call sites or - // with knowledge of the called routines. 
- if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN || - Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || - Opc == AMDGPU::S_SETPC_B64_return || - (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); - } - // In dynamic VGPR mode, we want to release the VGPRs before the wave exits. - // Technically the hardware will do this on its own if we don't, but that - // might cost extra cycles compared to doing it explicitly. - // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may - // have to wait for outstanding VMEM stores. In this case it can be useful to - // send a message to explicitly release all VGPRs before the stores have - // completed, but it is only safe to do this if there are no outstanding - // scratch stores. - else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) { - if (!WCG->isOptNone() && - (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() || - (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && - ScoreBrackets.getScoreRange(STORE_CNT) != 0 && - !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)))) - ReleaseVGPRInsts.insert(&MI); - } - // Resolve vm waits before gs-done. - else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) && - ST->hasLegacyGeometry() && - ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == - AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { + switch (Opc) { + case AMDGPU::BUFFER_WBINVL1: + case AMDGPU::BUFFER_WBINVL1_SC: + case AMDGPU::BUFFER_WBINVL1_VOL: + case AMDGPU::BUFFER_GL0_INV: + case AMDGPU::BUFFER_GL1_INV: { + // FIXME: This should have already been handled by the memory legalizer. + // Removing this currently doesn't affect any lit tests, but we need to + // verify that nothing was relying on this. The number of buffer invalidates + // being handled here should not be expanded. 
Wait.LoadCnt = 0; + break; + } + case AMDGPU::SI_RETURN_TO_EPILOG: + case AMDGPU::SI_RETURN: + case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: + case AMDGPU::S_SETPC_B64_return: { + // All waits must be resolved at call return. + // NOTE: this could be improved with knowledge of all call sites or + // with knowledge of the called routines. + ReturnInsts.insert(&MI); + AMDGPU::Waitcnt AllZeroWait = + WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); + // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads + // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt. + // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's + // no need to wait for it at function boundaries. + if (ST->hasExtendedWaitCounts() && + !ScoreBrackets.hasPendingEvent(VMEM_ACCESS)) + AllZeroWait.LoadCnt = ~0u; + Wait = AllZeroWait; + break; + } + case AMDGPU::S_ENDPGM: + case AMDGPU::S_ENDPGM_SAVED: { + // In dynamic VGPR mode, we want to release the VGPRs before the wave exits. + // Technically the hardware will do this on its own if we don't, but that + // might cost extra cycles compared to doing it explicitly. + // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may + // have to wait for outstanding VMEM stores. In this case it can be useful + // to send a message to explicitly release all VGPRs before the stores have + // completed, but it is only safe to do this if there are no outstanding + // scratch stores. + EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) && + !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS); + break; + } + case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSGHALT: { + if (ST->hasLegacyGeometry() && + ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == + AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { + // Resolve vm waits before gs-done. 
+ Wait.LoadCnt = 0; + break; + } + [[fallthrough]]; } + default: { - // Export & GDS instructions do not read the EXEC mask until after the export - // is granted (which can occur well after the instruction is issued). - // The shader program must flush all EXP operations on the export-count - // before overwriting the EXEC mask. - else { + // Export & GDS instructions do not read the EXEC mask until after the + // export is granted (which can occur well after the instruction is issued). + // The shader program must flush all EXP operations on the export-count + // before overwriting the EXEC mask. if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. @@ -1918,27 +2471,22 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS()) addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait()); - if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { + if (MI.isCall()) { // The function is going to insert a wait on everything in its prolog. // This still needs to be careful if the call target is a load (e.g. a GOT // load). We also need to check WAW dependency with saved PC. 
+ CallInsts.insert(&MI); Wait = AMDGPU::Waitcnt(); - const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI); if (CallAddrOp.isReg()) { - RegInterval CallAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, CallAddrOp); - - ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval, - Wait); + ScoreBrackets.determineWaitForPhysReg( + SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait); if (const auto *RtnAddrOp = TII->getNamedOperand(MI, AMDGPU::OpName::dst)) { - RegInterval RtnAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, *RtnAddrOp); - - ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval, - Wait); + ScoreBrackets.determineWaitForPhysReg( + SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait); } } } else if (Opc == AMDGPU::S_BARRIER_WAIT) { @@ -1975,18 +2523,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, continue; // LOAD_CNT is only relevant to vgpr or LDS. - unsigned RegNo = FIRST_LDS_VGPR; + unsigned TID = LDSDMA_BEGIN; if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { - if (MI.mayAlias(AA, *LDSDMAStores[I], true)) - ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); + if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { + if ((I + 1) >= NUM_LDSDMA) { + // We didn't have enough slot to track this LDS DMA store, it + // has been tracked using the common RegNo (FIRST_LDS_VGPR). 
+ ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait); + break; + } + + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait); + } } } else { - ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait); } if (Memop->isStore()) { - ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); + ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait); } } @@ -1999,7 +2555,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) continue; - RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op); + MCPhysReg Reg = Op.getReg().asMCReg(); const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); if (IsVGPR) { @@ -2011,6 +2567,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isImplicit() && MI.mayLoadOrStore()) continue; + ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait); + if (Op.isDef()) + ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait); // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the // previous write and this write are the same type of VMEM // instruction, in which case they are (in some architectures) @@ -2018,31 +2577,31 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Additionally check instructions where Point Sample Acceleration // might be applied. 
if (Op.isUse() || !updateVMCntOnly(MI) || - ScoreBrackets.hasOtherPendingVmemTypes(Interval, - getVmemType(MI)) || - ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) || + ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) || + ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) || !ST->hasVmemWriteVgprInOrder()) { - ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait); - ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait); - ScoreBrackets.determineWait(BVH_CNT, Interval, Wait); - ScoreBrackets.clearVgprVmemTypes(Interval); + ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait); + ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait); + ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait); + ScoreBrackets.clearVgprVmemTypes(Reg); } if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { - ScoreBrackets.determineWait(EXP_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait); } - ScoreBrackets.determineWait(DS_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait); } else if (Op.getReg() == AMDGPU::SCC) { - ScoreBrackets.determineWait(KM_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait); } else { - ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait); } - if (ST->hasWaitXCnt() && Op.isDef()) - ScoreBrackets.determineWait(X_CNT, Interval, Wait); + if (ST->hasWaitXcnt() && Op.isDef()) + ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait); } } } + } // Ensure safety against exceptions from outstanding memory operations while // waiting for a barrier: @@ -2057,7 +2616,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // In all other cases, ensure safety by ensuring that there are no outstanding // memory operations. 
if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && - !ST->supportsBackOffBarrier()) { + !ST->hasBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } @@ -2072,35 +2631,41 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Verify that the wait is actually needed. ScoreBrackets.simplifyWaitcnt(Wait); + // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that + // waits on VA_VDST if the instruction it would precede is not a VALU + // instruction, since hardware handles VALU->VGPR->VALU hazards in + // expert scheduling mode. + if (TII->isVALU(MI)) + Wait.VaVdst = ~0u; + + // Since the translation for VMEM addresses occur in-order, we can apply the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (Wait.XCnt != ~0u && isVmemAccess(MI)) { + ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt); + Wait.XCnt = ~0u; + } + // When forcing emit, we need to skip terminators because that would break the // terminators of the MBB if we emit a waitcnt between terminators. if (ForceEmitZeroFlag && !MI.isTerminator()) Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); - if (ForceEmitWaitcnt[LOAD_CNT]) - Wait.LoadCnt = 0; - if (ForceEmitWaitcnt[EXP_CNT]) - Wait.ExpCnt = 0; - if (ForceEmitWaitcnt[DS_CNT]) - Wait.DsCnt = 0; - if (ForceEmitWaitcnt[SAMPLE_CNT]) - Wait.SampleCnt = 0; - if (ForceEmitWaitcnt[BVH_CNT]) - Wait.BvhCnt = 0; - if (ForceEmitWaitcnt[KM_CNT]) - Wait.KmCnt = 0; - if (ForceEmitWaitcnt[X_CNT]) - Wait.XCnt = 0; - - if (FlushVmCnt) { - if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) - Wait.LoadCnt = 0; - if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) - Wait.SampleCnt = 0; - if (ScoreBrackets.hasPendingEvent(BVH_CNT)) - Wait.BvhCnt = 0; + // If we force waitcnt then update Wait accordingly. 
+ for (InstCounterType T : inst_counter_types()) { + if (!ForceEmitWaitcnt[T]) + continue; + Wait.set(T, 0); + } + + if (FlushFlags.FlushVmCnt) { + for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT}) + Wait.set(T, 0); } + if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT)) + Wait.DsCnt = 0; + if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u) Wait.LoadCnt = 0; @@ -2121,10 +2686,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, Modified = WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); - // Any counts that could have been applied to any existing waitcnt - // instructions will have been done so, now deal with any remaining. - ScoreBrackets.applyWaitcnt(Wait); - // ExpCnt can be merged into VINTERP. if (Wait.ExpCnt != ~0u && It != Block.instr_end() && SIInstrInfo::isVINTERP(*It)) { @@ -2134,31 +2695,59 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, WaitExp->setImm(Wait.ExpCnt); Modified = true; } + // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts. + ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt); Wait.ExpCnt = ~0u; LLVM_DEBUG(dbgs() << "generateWaitcnt\n" << "Update Instr: " << *It); } - // XCnt may be already consumed by a load wait. - if (Wait.XCnt != ~0u) { - if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; + if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets)) + Modified = true; - if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; + // Any counts that could have been applied to any existing waitcnt + // instructions will have been done so, now deal with any remaining. + ScoreBrackets.applyWaitcnt(Wait); - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory - // dependency with another VMEM instruction in flight. 
- if (isVmemAccess(*It)) - Wait.XCnt = ~0u; + return Modified; +} + +std::optional<WaitEventType> +SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const { + if (TII->isVALU(Inst)) { + // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete + // out-of-order with respect to each other, so each of these classes + // has its own event. + + if (TII->isXDL(Inst)) + return VGPR_XDL_WRITE; + + if (TII->isTRANS(Inst)) + return VGPR_TRANS_WRITE; + + if (AMDGPU::isDPMACCInstruction(Inst.getOpcode())) + return VGPR_DPMACC_WRITE; + + return VGPR_CSMACC_WRITE; } - if (WCG->createNewWaitcnt(Block, It, Wait)) - Modified = true; + // FLAT and LDS instructions may read their VGPR sources out-of-order + // with respect to each other and all other VMEM instructions, so + // each of these also has a separate event. - return Modified; + if (TII->isFLAT(Inst)) + return VGPR_FLAT_READ; + + if (TII->isDS(Inst)) + return VGPR_LDS_READ; + + if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst)) + return VGPR_VMEM_READ; + + // Otherwise, no hazard. + + return {}; } bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { @@ -2235,6 +2824,12 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, bool IsVMEMAccess = false; bool IsSMEMAccess = false; + + if (IsExpertMode) { + if (const auto ET = getExpertSchedulingEventType(Inst)) + ScoreBrackets->updateByEvent(*ET, Inst); + } + if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->isAlwaysGDS(Inst.getOpcode()) || TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { @@ -2265,13 +2860,19 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } - // This is a flat memory operation that access both VMEM and LDS, so note it - // - it will require that both the VM and LGKM be flushed to zero if it is - // pending when a VM or LGKM dependency occurs. 
- if (FlatASCount > 1) + // Async/LDSDMA operations have FLAT encoding but do not actually use flat + // pointers. They do have two operands that each access global and LDS, thus + // making it appear at this point that they are using a flat pointer. Filter + // them out, and for the rest, generate a dependency on flat pointers so + // that both VM and LGKM counters are flushed. + if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1) ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && - !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { + (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) || + Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) { + // BUFFER_WBL2 is included here because unlike invalidates, has to be + // followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has + // completed. IsVMEMAccess = true; ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); @@ -2283,15 +2884,9 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, IsSMEMAccess = true; ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst); } else if (Inst.isCall()) { - if (callWaitsOnFunctionReturn(Inst)) { - // Act as a wait on everything - ScoreBrackets->applyWaitcnt( - WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); - ScoreBrackets->setStateOnFunctionEntryOrReturn(); - } else { - // May need to way wait for anything. 
- ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); - } + // Act as a wait on everything + ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); + ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else if (SIInstrInfo::isLDSDIR(Inst)) { ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst); } else if (TII->isVINTERP(Inst)) { @@ -2324,7 +2919,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } - if (!ST->hasWaitXCnt()) + if (!ST->hasWaitXcnt()) return; if (IsVMEMAccess) @@ -2343,6 +2938,84 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, return OtherShifted > MyShifted; } +bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos, + ArrayRef<CounterValueArray> OtherMarks) { + bool StrictDom = false; + + LLVM_DEBUG(dbgs() << "Merging async marks ..."); + // Early exit: both empty + if (AsyncMarks.empty() && OtherMarks.empty()) { + LLVM_DEBUG(dbgs() << " nothing to merge\n"); + return false; + } + LLVM_DEBUG(dbgs() << '\n'); + + // Determine maximum length needed after merging + auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size()); + + // For each backedge in isolation, the algorithm reachs a fixed point after + // the first call to merge(). This is unchanged even with the AsyncMarks + // array because we call mergeScore just like the other cases. + // + // But in the rare pathological case, a nest of loops that pushes marks + // without waiting on any mark can cause AsyncMarks to grow very large. We cap + // it to a reasonable limit. We can tune this later or potentially introduce a + // user option to control the value. + MaxSize = std::min(MaxSize, MaxAsyncMarks); + + // Keep only the most recent marks within our limit. + if (AsyncMarks.size() > MaxSize) + AsyncMarks.erase(AsyncMarks.begin(), + AsyncMarks.begin() + (AsyncMarks.size() - MaxSize)); + + // Pad with zero-filled marks if our list is shorter. 
Zero represents "no + // pending async operations at this checkpoint" and acts as the identity + // element for max() during merging. We pad at the beginning since the marks + // need to be aligned in most-recent order. + CounterValueArray ZeroMark{}; + AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark); + + LLVM_DEBUG({ + dbgs() << "Before merge:\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + LLVM_DEBUG({ + dbgs() << "Other marks:\n"; + for (const auto &Mark : OtherMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + // Merge element-wise using the existing mergeScore function and the + // appropriate MergeInfo for each counter type. Iterate only while we have + // elements in both vectors. + unsigned OtherSize = OtherMarks.size(); + unsigned OurSize = AsyncMarks.size(); + unsigned MergeCount = std::min(OtherSize, OurSize); + assert(OurSize == MaxSize); + for (unsigned Idx = 1; Idx <= MergeCount; ++Idx) { + for (auto T : inst_counter_types(Context->MaxCounter)) { + StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T], + OtherMarks[OtherSize - Idx][T]); + } + } + + LLVM_DEBUG({ + dbgs() << "After merge:\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + return StrictDom; +} + /// Merge the pending events and associater score brackets of \p Other into /// this brackets status. /// @@ -2351,15 +3024,22 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { bool StrictDom = false; - VgprUB = std::max(VgprUB, Other.VgprUB); - SgprUB = std::max(SgprUB, Other.SgprUB); + // Check if "other" has keys we don't have, and create default entries for + // those. If they remain empty after merging, we will clean it up after. 
+ for (auto K : Other.VMem.keys()) + VMem.try_emplace(K); + for (auto K : Other.SGPRs.keys()) + SGPRs.try_emplace(K); + + // Array to store MergeInfo for each counter type + MergeInfo MergeInfos[NUM_INST_CNTS]; for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter - const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; - const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; - const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; - if (OtherEvents & ~OldEvents) + const WaitEventSet &EventsForT = Context->getWaitEvents(T); + const WaitEventSet OldEvents = PendingEvents & EventsForT; + const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT; + if (!OldEvents.contains(OtherEvents)) StrictDom = true; PendingEvents |= OtherEvents; @@ -2370,7 +3050,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { if (NewUB < ScoreLBs[T]) report_fatal_error("waitcnt score overflow"); - MergeInfo M; + MergeInfo &M = MergeInfos[T]; M.OldLB = ScoreLBs[T]; M.OtherLB = Other.ScoreLBs[T]; M.MyShift = NewUB - ScoreUBs[T]; @@ -2386,8 +3066,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { if (T == KM_CNT) { StrictDom |= mergeScore(M, SCCScore, Other.SCCScore); if (Other.hasPendingEvent(SCC_WRITE)) { - unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE); - if (!OldEventsHasSCCWrite) { + if (!OldEvents.contains(SCC_WRITE)) { PendingSCCWrite = Other.PendingSCCWrite; } else if (PendingSCCWrite != Other.PendingSCCWrite) { PendingSCCWrite = nullptr; @@ -2395,23 +3074,33 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } } - for (int J = 0; J <= VgprUB; J++) - StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); + for (auto &[RegID, Info] : VMem) + StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T)); if (isSmemCounter(T)) { unsigned Idx = getSgprScoresIdx(T); - for (int J = 0; J <= SgprUB; J++) - StrictDom |= - 
mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]); + for (auto &[RegID, Info] : SGPRs) { + auto It = Other.SGPRs.find(RegID); + unsigned OtherScore = + (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0; + StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore); + } } } - for (int J = 0; J <= VgprUB; J++) { - unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; - StrictDom |= NewVmemTypes != VgprVmemTypes[J]; - VgprVmemTypes[J] = NewVmemTypes; + for (auto &[TID, Info] : VMem) { + if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) { + unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes; + StrictDom |= NewVmemTypes != Info.VMEMTypes; + Info.VMEMTypes = NewVmemTypes; + } } + StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks); + for (auto T : inst_counter_types(Context->MaxCounter)) + StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]); + + purgeEmptyTrackingData(); return StrictDom; } @@ -2423,9 +3112,53 @@ static bool isWaitInstr(MachineInstr &Inst) { Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || Opcode == AMDGPU::S_WAITCNT_lds_direct || + Opcode == AMDGPU::WAIT_ASYNCMARK || counterTypeForInstr(Opcode).has_value(); } +void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + bool ExpertMode) const { + const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode( + AMDGPU::Hwreg::ID_SCHED_MODE, AMDGPU::Hwreg::HwregOffset::Default, 2); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(ExpertMode ? 2 : 0) + .addImm(EncodedReg); +} + +// Track back-to-back atomic RMW instructions, referred to as a block. +// +// Determines whether \p MI starts a new atomic RMW block, is inside +// an existing block, or is outside of a block. A block is broken when a +// CU-scoped memory op or an atomic store is encountered. ALU ops +// and non-memory instructions don't break a block. 
The function returns +// the new state after processing the current instruction based on +// \p PrevState, the previously captured state. +AtomicRMWState +SIInsertWaitcnts::getAtomicRMWState(MachineInstr &MI, + AtomicRMWState PrevState) const { + if (isAtomicRMW(MI)) { + // Transition from NotInBlock -> NewBlock -> InsideBlock. + if (PrevState == AtomicRMWState::NotInBlock) + return AtomicRMWState::NewBlock; + if (PrevState == AtomicRMWState::NewBlock) + return AtomicRMWState::InsideBlock; + + return PrevState; + } + + // LDS memory operations don't break the block. + if (TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI))) + return PrevState; + + // Reset the atomic RMW block state when found other VMEM and SMEM operations. + if (MI.mayLoad() ^ MI.mayStore()) + return AtomicRMWState::NotInBlock; + + // Return the previous state otherwise. + return PrevState; +} + // Generate s_waitcnt instructions where needed. bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, @@ -2454,6 +3187,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Walk over the instructions. MachineInstr *OldWaitcntInstr = nullptr; + AtomicRMWState RMWState = AtomicRMWState::NotInBlock; for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), E = Block.instr_end(); @@ -2463,22 +3197,50 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ++Iter; continue; } + // Get the atomic RMW block state for current instruction. + RMWState = getAtomicRMWState(Inst, RMWState); // Track pre-existing waitcnts that were added in earlier iterations or by // the memory legalizer. - if (isWaitInstr(Inst)) { - if (!OldWaitcntInstr) - OldWaitcntInstr = &Inst; + if (isWaitInstr(Inst) || + (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) { ++Iter; + bool IsSoftXcnt = isSoftXcnt(Inst); + // The Memory Legalizer conservatively inserts a soft xcnt before each + // atomic RMW operation. 
However, for sequences of back-to-back atomic + // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away + // the redundant soft xcnts when we're inside an atomic RMW block. + if (Iter != E && IsSoftXcnt) { + // Check if the next instruction can potentially change the atomic RMW + // state. + RMWState = getAtomicRMWState(*Iter, RMWState); + } + + if (IsSoftXcnt && RMWState == AtomicRMWState::InsideBlock) { + // Delete this soft xcnt. + Inst.eraseFromParent(); + Modified = true; + } else if (!OldWaitcntInstr) { + OldWaitcntInstr = &Inst; + } continue; } - bool FlushVmCnt = Block.getFirstTerminator() == Inst && - isPreheaderToFlush(Block, ScoreBrackets); + PreheaderFlushFlags FlushFlags; + if (Block.getFirstTerminator() == Inst) + FlushFlags = isPreheaderToFlush(Block, ScoreBrackets); + + if (Inst.getOpcode() == AMDGPU::ASYNCMARK) { + // FIXME: Not supported on GFX12 yet. Will need a new feature when we do. + assert(ST->getGeneration() < AMDGPUSubtarget::GFX12); + ScoreBrackets.recordAsyncMark(Inst); + ++Iter; + continue; + } // Generate an s_waitcnt instruction to be placed before Inst, if needed. Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr, - FlushVmCnt); + FlushFlags); OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. @@ -2552,17 +3314,21 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ++Iter; } - // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if - // needed. + // Flush counters at the end of the block if needed (for preheaders with no + // terminator). 
AMDGPU::Waitcnt Wait; - if (Block.getFirstTerminator() == Block.end() && - isPreheaderToFlush(Block, ScoreBrackets)) { - if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) - Wait.LoadCnt = 0; - if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) - Wait.SampleCnt = 0; - if (ScoreBrackets.hasPendingEvent(BVH_CNT)) - Wait.BvhCnt = 0; + if (Block.getFirstTerminator() == Block.end()) { + PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets); + if (FlushFlags.FlushVmCnt) { + if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) + Wait.LoadCnt = 0; + if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) + Wait.SampleCnt = 0; + if (ScoreBrackets.hasPendingEvent(BVH_CNT)) + Wait.BvhCnt = 0; + } + if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT)) + Wait.DsCnt = 0; } // Combine or remove any redundant waitcnts at the end of the block. @@ -2578,29 +3344,29 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, return Modified; } -// Return true if the given machine basic block is a preheader of a loop in -// which we want to flush the vmcnt counter, and false otherwise. -bool SIInsertWaitcnts::isPreheaderToFlush( - MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) { - auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false); +// Return flags indicating which counters should be flushed in the preheader. 
+PreheaderFlushFlags +SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB, + const WaitcntBrackets &ScoreBrackets) { + auto [Iterator, IsInserted] = + PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags()); if (!IsInserted) return Iterator->second; MachineBasicBlock *Succ = MBB.getSingleSuccessor(); if (!Succ) - return false; + return PreheaderFlushFlags(); MachineLoop *Loop = MLI->getLoopFor(Succ); if (!Loop) - return false; + return PreheaderFlushFlags(); - if (Loop->getLoopPreheader() == &MBB && - shouldFlushVmCnt(Loop, ScoreBrackets)) { - Iterator->second = true; - return true; + if (Loop->getLoopPreheader() == &MBB) { + Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets); + return Iterator->second; } - return false; + return PreheaderFlushFlags(); } bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { @@ -2609,72 +3375,152 @@ bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { return SIInstrInfo::isVMEM(MI); } -// Return true if it is better to flush the vmcnt counter in the preheader of -// the given loop. We currently decide to flush in two situations: +bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const { + return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore(); +} + +// Check if instruction is a store to LDS that is counted via DSCNT +// (where that counter exists). +bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const { + if (!MI.mayStore()) + return false; + if (SIInstrInfo::isDS(MI)) + return true; + return false; +} + +// Return flags indicating which counters should be flushed in the preheader of +// the given loop. We currently decide to flush in a few situations: +// For VMEM (FlushVmCnt): // 1. The loop contains vmem store(s), no vmem load and at least one use of a // vgpr containing a value that is loaded outside of the loop. (Only on // targets with no vscnt counter). // 2. 
The loop contains vmem load(s), but the loaded values are not used in the // loop, and at least one use of a vgpr containing a value that is loaded // outside of the loop. -bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, - const WaitcntBrackets &Brackets) { +// For DS (FlushDsCnt, GFX12+ only): +// 3. The loop contains no DS reads, and at least one use of a vgpr containing +// a value that is DS loaded outside of the loop. +// 4. The loop contains DS read(s), loaded values are not used in the same +// iteration but in the next iteration (prefetch pattern), and at least one +// use of a vgpr containing a value that is DS loaded outside of the loop. +// Flushing in preheader reduces wait overhead if the wait requirement in +// iteration 1 would otherwise be more strict. +PreheaderFlushFlags +SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML, + const WaitcntBrackets &Brackets) { + PreheaderFlushFlags Flags; bool HasVMemLoad = false; bool HasVMemStore = false; - bool UsesVgprLoadedOutside = false; - DenseSet<Register> VgprUse; - DenseSet<Register> VgprDef; + bool SeenDSStoreInLoop = false; + bool UsesVgprLoadedOutsideVMEM = false; + bool UsesVgprLoadedOutsideDS = false; + bool VMemInvalidated = false; + // DS optimization only applies to GFX12+ where DS_CNT is separate. + bool DSInvalidated = !ST->hasExtendedWaitCounts(); + DenseSet<MCRegUnit> VgprUse; + DenseSet<MCRegUnit> VgprDefVMEM; + DenseSet<MCRegUnit> VgprDefDS; for (MachineBasicBlock *MBB : ML->blocks()) { + bool SeenDSStoreInCurrMBB = false; for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { HasVMemLoad |= MI.mayLoad(); HasVMemStore |= MI.mayStore(); } - + if (mayStoreIncrementingDSCNT(MI)) + SeenDSStoreInCurrMBB = true; + // Stores postdominated by a barrier will have a wait at the barrier + // and thus no need to be waited at the loop header. Barrier found + // later in the same MBB during in-order traversal is used here as a + // cheaper alternative to postdomination check. 
+ if (MI.getOpcode() == AMDGPU::S_BARRIER) + SeenDSStoreInCurrMBB = false; for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; - RegInterval Interval = Brackets.getRegInterval(&MI, Op); // Vgpr use - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { // If we find a register that is loaded inside the loop, 1. and 2. - // are invalidated and we can exit. - if (VgprDef.contains(RegNo)) - return false; - VgprUse.insert(RegNo); - // If at least one of Op's registers is in the score brackets, the - // value is likely loaded outside of the loop. - if (Brackets.getRegScore(RegNo, LOAD_CNT) > - Brackets.getScoreLB(LOAD_CNT) || - Brackets.getRegScore(RegNo, SAMPLE_CNT) > - Brackets.getScoreLB(SAMPLE_CNT) || - Brackets.getRegScore(RegNo, BVH_CNT) > - Brackets.getScoreLB(BVH_CNT)) { - UsesVgprLoadedOutside = true; - break; - } + // are invalidated. + if (VgprDefVMEM.contains(RU)) + VMemInvalidated = true; + + // Check for DS loads used inside the loop + if (VgprDefDS.contains(RU)) + DSInvalidated = true; + + // Early exit if both optimizations are invalidated + if (VMemInvalidated && DSInvalidated) + return Flags; + + VgprUse.insert(RU); + // Check if this register has a pending VMEM load from outside the + // loop (value loaded outside and used inside). + VMEMID ID = toVMEMID(RU); + if (Brackets.hasPendingVMEM(ID, LOAD_CNT) || + Brackets.hasPendingVMEM(ID, SAMPLE_CNT) || + Brackets.hasPendingVMEM(ID, BVH_CNT)) + UsesVgprLoadedOutsideVMEM = true; + // Check if loaded outside the loop via DS (not VMEM/FLAT). + // Only consider it a DS load if there's no pending VMEM load for + // this register, since FLAT can set both counters. 
+ else if (Brackets.hasPendingVMEM(ID, DS_CNT)) + UsesVgprLoadedOutsideDS = true; } } // VMem load vgpr def if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) { for (const MachineOperand &Op : MI.all_defs()) { - RegInterval Interval = Brackets.getRegInterval(&MI, Op); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { // If we find a register that is loaded inside the loop, 1. and 2. - // are invalidated and we can exit. - if (VgprUse.contains(RegNo)) - return false; - VgprDef.insert(RegNo); + // are invalidated. + if (VgprUse.contains(RU)) + VMemInvalidated = true; + VgprDefVMEM.insert(RU); + } + } + // Early exit if both optimizations are invalidated + if (VMemInvalidated && DSInvalidated) + return Flags; + } + + // DS read vgpr def + // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo). + // If USE comes before DEF, it's the prefetch pattern (use value from + // previous iteration, load for next iteration). We should still flush + // in preheader so iteration 1 doesn't need to wait inside the loop. + // Only invalidate when DEF comes before USE (same-iteration consumption, + // checked above when processing uses). 
+ if (isDSRead(MI)) { + for (const MachineOperand &Op : MI.all_defs()) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { + VgprDefDS.insert(RU); } } } } + // Accumulate unprotected DS stores from this MBB + SeenDSStoreInLoop |= SeenDSStoreInCurrMBB; } - if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) - return true; - return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); + + // VMEM flush decision + if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM && + ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) || + (HasVMemLoad && ST->hasVmemWriteVgprInOrder()))) + Flags.FlushVmCnt = true; + + // DS flush decision: flush if loop uses DS-loaded values from outside + // and either has no DS reads in the loop, or DS reads whose results + // are not used in the loop. + // DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT + // is LGKM_CNT which also tracks FLAT/SMEM. + if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS) + Flags.FlushDsCnt = true; + + return Flags; } bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) { @@ -2714,48 +3560,36 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); + // Initialize hardware limits first, as they're needed by the generators. + Limits = AMDGPU::HardwareLimits(IV); + if (ST->hasExtendedWaitCounts()) { - MaxCounter = NUM_EXTENDED_INST_CNTS; - WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter); - WCG = &WCGGFX12Plus; + IsExpertMode = ST->hasExpertSchedulingMode() && + (ExpertSchedulingModeFlag.getNumOccurrences() + ? ExpertSchedulingModeFlag + : MF.getFunction() + .getFnAttribute("amdgpu-expert-scheduling-mode") + .getValueAsBool()); + MaxCounter = IsExpertMode ? 
NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS; + if (!WCG) + WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits, + IsExpertMode); } else { MaxCounter = NUM_NORMAL_INST_CNTS; - WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF); - WCG = &WCGPreGFX12; + if (!WCG) + WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS, + &Limits); } for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - WaitEventMaskForInst = WCG->getWaitEventMask(); - - SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - - if (ST->hasExtendedWaitCounts()) { - Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); - Limits.DscntMax = AMDGPU::getDscntBitMask(IV); - } else { - Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV); - Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV); - } - Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV); - Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); - Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); - Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); - Limits.XcntMax = AMDGPU::getXcntBitMask(IV); - - [[maybe_unused]] unsigned NumVGPRsMax = - ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()); - [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); - assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); - assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); + SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS); BlockInfos.clear(); bool Modified = false; MachineBasicBlock &EntryBB = MF.front(); - MachineBasicBlock::iterator I = EntryBB.begin(); if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may @@ -2764,9 +3598,9 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. 
- for (MachineBasicBlock::iterator E = EntryBB.end(); - I != E && (I->isPHI() || I->isMetaInstruction()); ++I) - ; + MachineBasicBlock::iterator I = EntryBB.begin(); + while (I != EntryBB.end() && I->isMetaInstruction()) + ++I; if (ST->hasExtendedWaitCounts()) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) @@ -2783,6 +3617,12 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { TII->get(instrsForExtendedCounterTypes[CT])) .addImm(0); } + if (IsExpertMode) { + unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST); + Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0); + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(Enc); + } } else { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } @@ -2839,7 +3679,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { if (!SuccBI.Incoming) { SuccBI.Dirty = true; if (SuccBII <= BII) { - LLVM_DEBUG(dbgs() << "repeat on backedge\n"); + LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n"); Repeat = true; } if (!MoveBracketsToSucc) { @@ -2847,11 +3687,20 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { } else { SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets); } - } else if (SuccBI.Incoming->merge(*Brackets)) { - SuccBI.Dirty = true; - if (SuccBII <= BII) { - LLVM_DEBUG(dbgs() << "repeat on backedge\n"); - Repeat = true; + } else { + LLVM_DEBUG({ + dbgs() << "Try to merge "; + MBB->printName(dbgs()); + dbgs() << " into "; + Succ->printName(dbgs()); + dbgs() << '\n'; + }); + if (SuccBI.Incoming->merge(*Brackets)) { + SuccBI.Dirty = true; + if (SuccBII <= BII) { + LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n"); + Repeat = true; + } } } } @@ -2907,26 +3756,49 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { } } + if (IsExpertMode) { + // Enable expert scheduling on function entry. 
To satisfy ABI requirements + // and to allow calls between function with different expert scheduling + // settings, disable it around calls and before returns. + + MachineBasicBlock::iterator I = EntryBB.begin(); + while (I != EntryBB.end() && I->isMetaInstruction()) + ++I; + setSchedulingMode(EntryBB, I, true); + + for (MachineInstr *MI : CallInsts) { + MachineBasicBlock &MBB = *MI->getParent(); + setSchedulingMode(MBB, MI, false); + setSchedulingMode(MBB, std::next(MI->getIterator()), true); + } + + for (MachineInstr *MI : ReturnInsts) + setSchedulingMode(*MI->getParent(), MI, false); + + Modified = true; + } + // Deallocate the VGPRs before previously identified S_ENDPGM instructions. // This is done in different ways depending on how the VGPRs were allocated // (i.e. whether we're in dynamic VGPR mode or not). // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short // waveslot limited kernel runs slower with the deallocation. - if (MFI->isDynamicVGPREnabled()) { - for (MachineInstr *MI : ReleaseVGPRInsts) { + if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) { + for (auto [MI, _] : EndPgmInsts) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_ALLOC_VGPR)) .addImm(0); Modified = true; } - } else { - if (!ReleaseVGPRInsts.empty() && - (MF.getFrameInfo().hasCalls() || - ST->getOccupancyWithNumVGPRs( - TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass), - /*IsDynamicVGPR=*/false) < - AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { - for (MachineInstr *MI : ReleaseVGPRInsts) { + } else if (!WCG->isOptNone() && + ST->getGeneration() >= AMDGPUSubtarget::GFX11 && + (MF.getFrameInfo().hasCalls() || + ST->getOccupancyWithNumVGPRs( + TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass), + /*IsDynamicVGPR=*/false) < + AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { + for (auto [MI, Flag] : EndPgmInsts) { + if (Flag) { if (ST->requiresNopBeforeDeallocVGPRs()) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP)) 
@@ -2939,7 +3811,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { } } } - ReleaseVGPRInsts.clear(); + + CallInsts.clear(); + ReturnInsts.clear(); + EndPgmInsts.clear(); PreheadersToFlush.clear(); SLoadAddresses.clear(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d516330..24aa31a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP, + AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { SchedModel.init(&ST); } @@ -161,7 +162,7 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const { if (!DstReg.isVirtual()) return true; - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) { switch (Use.getOpcode()) { case AMDGPU::S_AND_SAVEEXEC_B32: @@ -179,6 +180,10 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const { return false; } + // If it is not convergent it does not depend on EXEC. 
+ if (!MI.isConvergent()) + return false; + switch (MI.getOpcode()) { default: break; @@ -1154,7 +1159,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } int SIInstrInfo::commuteOpcode(unsigned Opcode) const { - int NewOpc; + int64_t NewOpc; // Try to map original to commuted opcode NewOpc = AMDGPU::getCommuteRev(Opcode); @@ -1325,7 +1330,8 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, case AMDGPU::AV_MOV_B32_IMM_PSEUDO: case AMDGPU::AV_MOV_B64_IMM_PSEUDO: case AMDGPU::S_MOV_B64_IMM_PSEUDO: - case AMDGPU::V_MOV_B64_PSEUDO: { + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B16_t16_e32: { const MachineOperand &Src0 = MI.getOperand(1); if (Src0.isImm()) { ImmVal = Src0.getImm(); @@ -1334,6 +1340,15 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, return false; } + case AMDGPU::V_MOV_B16_t16_e64: { + const MachineOperand &Src0 = MI.getOperand(2); + if (Src0.isImm() && !MI.getOperand(1).getImm()) { + ImmVal = Src0.getImm(); + return MI.getOperand(0).getReg() == Reg; + } + + return false; + } case AMDGPU::S_BREV_B32: case AMDGPU::V_BFREV_B32_e32: case AMDGPU::V_BFREV_B32_e64: { @@ -1361,6 +1376,24 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, } } +std::optional<int64_t> +SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const { + if (Op.isImm()) + return Op.getImm(); + + if (!Op.isReg() || !Op.getReg().isVirtual()) + return std::nullopt; + MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo(); + const MachineInstr *Def = MRI.getVRegDef(Op.getReg()); + if (Def && Def->isMoveImmediate()) { + const MachineOperand &ImmSrc = Def->getOperand(1); + if (ImmSrc.isImm()) + return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg()); + } + + return std::nullopt; +} + unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.isAGPRClass(DstRC)) @@ -1393,6 +1426,10 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, return 
get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); if (VecSize <= 160) // 20 bytes return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); + if (VecSize <= 192) // 24 bytes + return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6); + if (VecSize <= 224) // 28 bytes + return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7); if (VecSize <= 256) // 32 bytes return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); if (VecSize <= 288) // 36 bytes @@ -1421,6 +1458,10 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); if (VecSize <= 160) // 20 bytes return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); + if (VecSize <= 192) // 24 bytes + return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6); + if (VecSize <= 224) // 28 bytes + return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7); if (VecSize <= 256) // 32 bytes return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); if (VecSize <= 288) // 36 bytes @@ -1450,6 +1491,10 @@ static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; if (VecSize <= 160) // 20 bytes return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; + if (VecSize <= 192) // 24 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6; + if (VecSize <= 224) // 28 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7; if (VecSize <= 256) // 32 bytes return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; if (VecSize <= 288) // 36 bytes @@ -1479,6 +1524,10 @@ static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; if (VecSize <= 160) // 20 bytes return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; + if (VecSize <= 192) // 24 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6; + if (VecSize <= 224) // 28 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7; if (VecSize <= 256) // 32 bytes return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; if (VecSize <= 288) // 
36 bytes @@ -1667,8 +1716,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); @@ -1680,7 +1728,7 @@ void SIInstrInfo::storeRegToStackSlot( MachineMemOperand *MMO = MF->getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), FrameInfo.getObjectAlign(FrameIndex)); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { @@ -1862,14 +1910,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, - Register VReg, + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -1955,17 +2002,15 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, constexpr unsigned ECQueueWaveAbort = 0x400; MachineBasicBlock *TrapBB = &MBB; - MachineBasicBlock *ContBB = &MBB; MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock(); if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) { - ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); + 
MBB.splitAt(MI, /*UpdateLiveIns=*/false); TrapBB = MF->CreateMachineBasicBlock(); BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB); MF->push_back(TrapBB); MBB.addSuccessor(TrapBB); } - // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this // will be a nop. BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP)) @@ -2001,7 +2046,7 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, MF->push_back(HaltLoopBB); HaltLoopBB->addSuccessor(HaltLoopBB); - return ContBB; + return MBB.getNextNode(); } unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { @@ -2132,11 +2177,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32); + const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0); + const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? 
assert(!SrcOp.isFPImm()); - if (ST.hasMovB64()) { - MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); + if (ST.hasMovB64() && Mov64RC->contains(Dst)) { + MI.setDesc(Mov64Desc); if (SrcOp.isReg() || isInlineConstant(MI, 1) || isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals()) break; @@ -2145,17 +2193,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { APInt Imm(64, SrcOp.getImm()); APInt Lo(32, Imm.getLoBits(32).getZExtValue()); APInt Hi(32, Imm.getHiBits(32).getZExtValue()); - if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) { - BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) - .addImm(SISrcMods::OP_SEL_1) - .addImm(Lo.getSExtValue()) - .addImm(SISrcMods::OP_SEL_1) - .addImm(Lo.getSExtValue()) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0); // clamp + const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32); + const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0); + + if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) && + PkMovRC->contains(Dst)) { + BuildMI(MBB, MI, DL, PkMovDesc, Dst) + .addImm(SISrcMods::OP_SEL_1) + .addImm(Lo.getSExtValue()) + .addImm(SISrcMods::OP_SEL_1) + .addImm(Lo.getSExtValue()) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp } else { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) .addImm(Lo.getSExtValue()) @@ -2241,6 +2293,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: + case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6: + case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10: @@ -2253,6 +2307,8 @@ bool 
SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: + case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6: + case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10: @@ -2282,11 +2338,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { assert(VecReg == MI.getOperand(1).getReg()); MachineInstrBuilder MIB = - BuildMI(MBB, MI, DL, OpDesc) - .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) - .add(MI.getOperand(2)) - .addReg(VecReg, RegState::ImplicitDefine) - .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); + BuildMI(MBB, MI, DL, OpDesc) + .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) + .add(MI.getOperand(2)) + .addReg(VecReg, RegState::ImplicitDefine) + .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef)); const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); @@ -2300,6 +2356,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: + case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6: + case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10: @@ -2324,8 +2382,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) .add(MI.getOperand(2)) .addReg(VecReg, RegState::ImplicitDefine) - .addReg(VecReg, - RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); + .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef)); const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); @@ -2344,6 +2401,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: + case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6: + case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10: @@ -2355,18 +2414,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Register Dst = MI.getOperand(0).getReg(); Register VecReg = MI.getOperand(1).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); - Register Idx = MI.getOperand(2).getReg(); Register SubReg = MI.getOperand(3).getImm(); MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) - .addReg(Idx) + .add(MI.getOperand(2)) .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); SetOn->getOperand(3).setIsUndef(); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read)) .addDef(Dst) .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) - .addReg(VecReg, RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); + .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef)); MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); @@ -2500,7 +2558,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } break; - case AMDGPU::V_MAX_BF16_PSEUDO_e64: + case AMDGPU::V_MAX_BF16_PSEUDO_e64: { assert(ST.hasBF16PackedInsts()); MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16)); MI.addOperand(MachineOperand::CreateImm(0)); // op_sel @@ -2513,13 +2571,46 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; } + case AMDGPU::GET_STACK_BASE: + // The stack starts at offset 0 unless we need to reserve some space at the + // bottom. + if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) { + // When CWSR is used in dynamic VGPR mode, the trap handler needs to save + // some of the VGPRs. The size of the required scratch space has already + // been computed by prolog epilog insertion. + const SIMachineFunctionInfo *MFI = + MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs(); + Register DestReg = MI.getOperand(0).getReg(); + BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg) + .addImm(AMDGPU::Hwreg::HwregEncoding::encode( + AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2)); + // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute + // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set + // SCC, so we need to check for 0 manually. + BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg); + // Change the implicif-def of SCC to an explicit use (but first remove + // the dead flag if present). 
+ MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false); + MI.getOperand(MI.getNumExplicitOperands()).setIsUse(); + MI.setDesc(get(AMDGPU::S_CMOVK_I32)); + MI.addOperand(MachineOperand::CreateImm(VGPRSize)); + } else { + MI.setDesc(get(AMDGPU::S_MOV_B32)); + MI.addOperand(MachineOperand::CreateImm(0)); + MI.removeOperand( + MI.getNumExplicitOperands()); // Drop implicit def of SCC. + } + break; + } + return true; } void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &RI) const { + unsigned SubIdx, + const MachineInstr &Orig) const { // Try shrinking the instruction to remat only the part needed for current // context. @@ -2569,7 +2660,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, const MCInstrDesc &TID = get(NewOpcode); const TargetRegisterClass *NewRC = - RI.getAllocatableClass(getRegClass(TID, 0, &RI)); + RI.getAllocatableClass(getRegClass(TID, 0)); MRI.setRegClass(DestReg, NewRC); UseMO->setReg(DestReg); @@ -2599,7 +2690,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, break; } - TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig); } std::pair<MachineInstr*, MachineInstr*> @@ -2644,7 +2735,7 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { if (Src.isPhysical()) MovDPP.addReg(RI.getSubReg(Src, Sub)); else - MovDPP.addReg(Src, SrcOp.isUndef() ? 
RegState::Undef : 0, Sub); + MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub); } } @@ -2907,7 +2998,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, auto I = MBB.end(); auto &MCCtx = MF->getContext(); - if (ST.hasAddPC64Inst()) { + if (ST.useAddPC64Inst()) { MCSymbol *Offset = MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true); auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64)) @@ -2935,7 +3026,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() { if (FlushSGPRWrites) BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST)); }; // We need to compute the offset relative to the instruction immediately after @@ -2953,11 +3044,11 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub0) - .addReg(PCReg, 0, AMDGPU::sub0) + .addReg(PCReg, {}, AMDGPU::sub0) .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) - .addReg(PCReg, 0, AMDGPU::sub1) + .addReg(PCReg, {}, AMDGPU::sub1) .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); ApplyHazardWorkarounds(); @@ -3377,15 +3468,13 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, MachineInstr *Select; if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { - Select = - BuildMI(MBB, I, DL, get(SelOp), DstElt) - .addReg(FalseReg, 0, SubIdx) - .addReg(TrueReg, 0, SubIdx); + Select = BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(FalseReg, {}, SubIdx) + .addReg(TrueReg, {}, SubIdx); } else { - Select = - BuildMI(MBB, I, DL, get(SelOp), DstElt) - .addReg(TrueReg, 0, SubIdx) - .addReg(FalseReg, 0, SubIdx); + Select = BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(TrueReg, {}, SubIdx) 
+ .addReg(FalseReg, {}, SubIdx); } preserveCondRegFlags(Select->getOperand(3), Cond[1]); @@ -3461,6 +3550,21 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const { } } +void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI, + const MCInstrDesc &NewDesc) const { + MI.setDesc(NewDesc); + + // Remove any leftover implicit operands from mutating the instruction. e.g. + // if we replace an s_and_b32 with a copy, we don't need the implicit scc def + // anymore. + const MCInstrDesc &Desc = MI.getDesc(); + unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + + Desc.implicit_defs().size(); + + for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) + MI.removeOperand(I); +} + std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm, unsigned SubRegIndex) { switch (SubRegIndex) { @@ -3503,6 +3607,8 @@ static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) { case AMDGPU::V_FMAC_F16_e64: case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMAC_F16_t16_e32: + case AMDGPU::V_FMAC_F16_fake16_e32: case AMDGPU::V_FMA_F16_e64: return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() ? AMDGPU::V_FMAAK_F16_t16 @@ -3535,6 +3641,8 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { case AMDGPU::V_FMAC_F16_e64: case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMAC_F16_t16_e32: + case AMDGPU::V_FMAC_F16_fake16_e32: case AMDGPU::V_FMA_F16_e64: return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() ? 
AMDGPU::V_FMAMK_F16_t16 @@ -3612,7 +3720,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) { const MCInstrDesc &MovDesc = get(MovOp); - const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI); + const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0); if (Is16Bit) { // We just need to find a correctly sized register class, so the // subregister index compatibility doesn't matter since we're statically @@ -3703,6 +3811,23 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); + auto CopyRegOperandToNarrowerRC = + [MRI, this](MachineInstr &MI, unsigned OpNo, + const TargetRegisterClass *NewRC) -> void { + if (!MI.getOperand(OpNo).isReg()) + return; + Register Reg = MI.getOperand(OpNo).getReg(); + const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg); + if (RI.getCommonSubClass(RC, NewRC) != NewRC) + return; + Register Tmp = MRI->createVirtualRegister(NewRC); + BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + get(AMDGPU::COPY), Tmp) + .addReg(Reg); + MI.getOperand(OpNo).setReg(Tmp); + MI.getOperand(OpNo).setIsKill(); + }; + // Multiplied part is the constant: Use v_madmk_{f16, f32}. if ((Src0->isReg() && Src0->getReg() == Reg) || (Src1->isReg() && Src1->getReg() == Reg)) { @@ -3734,13 +3859,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16 - // takes VGPR_32_Lo128 operands, so the rewrite would also require - // restricting their register classes. For now just bail out. 
- if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || - NewOpc == AMDGPU::V_FMAMK_F16_fake16) - return false; - const std::optional<int64_t> SubRegImm = extractSubregFromImm( Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg()); @@ -3765,6 +3883,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); + if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || + NewOpc == AMDGPU::V_FMAMK_F16_fake16) { + const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0); + Register Tmp = MRI->createVirtualRegister(NewRC); + BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()), + UseMI.getDebugLoc(), get(AMDGPU::COPY), + UseMI.getOperand(0).getReg()) + .addReg(Tmp, RegState::Kill); + UseMI.getOperand(0).setReg(Tmp); + CopyRegOperandToNarrowerRC(UseMI, 1, NewRC); + CopyRegOperandToNarrowerRC(UseMI, 3, NewRC); + } + bool DeleteDef = MRI->use_nodbg_empty(Reg); if (DeleteDef) DefMI.eraseFromParent(); @@ -3812,13 +3943,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16 - // takes VGPR_32_Lo128 operands, so the rewrite would also require - // restricting their register classes. For now just bail out. - if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || - NewOpc == AMDGPU::V_FMAAK_F16_fake16) - return false; - // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. @@ -3838,6 +3962,20 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // These come before src2. 
removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); + + if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || + NewOpc == AMDGPU::V_FMAAK_F16_fake16) { + const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0); + Register Tmp = MRI->createVirtualRegister(NewRC); + BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()), + UseMI.getDebugLoc(), get(AMDGPU::COPY), + UseMI.getOperand(0).getReg()) + .addReg(Tmp, RegState::Kill); + UseMI.getOperand(0).setReg(Tmp); + CopyRegOperandToNarrowerRC(UseMI, 1, NewRC); + CopyRegOperandToNarrowerRC(UseMI, 2, NewRC); + } + // It might happen that UseMI was commuted // and we now have SGPR as SRC1. If so 2 inlined // constant and SGPR are illegal. @@ -3917,6 +4055,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isLDSDMA(MIa) || isLDSDMA(MIb)) return false; + if (MIa.isBundle() || MIb.isBundle()) + return false; + // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -3982,7 +4123,7 @@ static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, MachineInstr **DefMI = nullptr) { if (!MO->isReg()) return false; - const MachineFunction *MF = MO->getParent()->getParent()->getParent(); + const MachineFunction *MF = MO->getParent()->getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); } @@ -4032,28 +4173,50 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) { } } +/// Helper struct for the implementation of 3-address conversion to communicate +/// updates made to instruction operands. +struct SIInstrInfo::ThreeAddressUpdates { + /// Other instruction whose def is no longer used by the converted + /// instruction. 
+ MachineInstr *RemoveMIUse = nullptr; +}; + MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { MachineBasicBlock &MBB = *MI.getParent(); - unsigned Opc = MI.getOpcode(); + MachineInstr *CandidateMI = &MI; - // Handle MFMA. - int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); - if (NewMFMAOpc != -1) { - MachineInstrBuilder MIB = - BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); - updateLiveVariables(LV, MI, *MIB); + if (MI.isBundle()) { + // This is a temporary placeholder for bundle handling that enables us to + // exercise the relevant code paths in the two-address instruction pass. + if (MI.getBundleSize() != 1) + return nullptr; + CandidateMI = MI.getNextNode(); + } + + ThreeAddressUpdates U; + MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U); + if (!NewMI) + return nullptr; + + if (MI.isBundle()) { + CandidateMI->eraseFromBundle(); + + for (MachineOperand &MO : MI.all_defs()) { + if (MO.isTied()) + MI.untieRegOperand(MO.getOperandNo()); + } + } else { + updateLiveVariables(LV, MI, *NewMI); if (LIS) { - LIS->ReplaceMachineInstrInMaps(MI, *MIB); + LIS->ReplaceMachineInstrInMaps(MI, *NewMI); // SlotIndex of defs needs to be updated when converting to early-clobber - MachineOperand &Def = MIB->getOperand(0); + MachineOperand &Def = NewMI->getOperand(0); if (Def.isEarlyClobber() && Def.isReg() && LIS->hasInterval(Def.getReg())) { - SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false); - SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true); + SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false); + SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true); auto &LI = LIS->getInterval(Def.getReg()); auto UpdateDefIndex = [&](LiveRange &LR) { auto *S = LR.find(OldIndex); @@ -4068,6 +4231,88 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI, UpdateDefIndex(SR); } } + } + + if (U.RemoveMIUse) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + // The only user is the instruction which will be killed. + Register DefReg = U.RemoveMIUse->getOperand(0).getReg(); + + if (MRI.hasOneNonDBGUse(DefReg)) { + // We cannot just remove the DefMI here, calling pass will crash. + U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF)); + U.RemoveMIUse->getOperand(0).setIsDead(true); + for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I) + U.RemoveMIUse->removeOperand(I); + if (LV) + LV->getVarInfo(DefReg).AliveBlocks.clear(); + } + + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MO : MI.all_uses()) { + if (MO.isReg() && MO.getReg() == DefReg) { + assert(MO.getSubReg() == 0 && + "tied sub-registers in bundles currently not supported"); + MI.removeOperand(MO.getOperandNo()); + break; + } + } + + if (LIS) + LIS->shrinkToUses(&LIS->getInterval(DefReg)); + } + } else if (LIS) { + LiveInterval &DefLI = LIS->getInterval(DefReg); + + // We cannot delete the original instruction here, so hack out the use + // in the original instruction with a dummy register so we can use + // shrinkToUses to deal with any multi-use edge cases. Other targets do + // not have the complexity of deleting a use to consider here. 
+ Register DummyReg = MRI.cloneVirtualRegister(DefReg); + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + } + + MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false, + false, /*isUndef=*/true)); + } + + LIS->shrinkToUses(&DefLI); + } + } + + return MI.isBundle() ? &MI : NewMI; +} + +MachineInstr * +SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI, + ThreeAddressUpdates &U) const { + MachineBasicBlock &MBB = *MI.getParent(); + unsigned Opc = MI.getOpcode(); + + // Handle MFMA. + int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc != -1) { + MachineInstrBuilder MIB = + BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); return MIB; } @@ -4075,13 +4320,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .setMIFlags(MI.getFlags()); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) MIB->addOperand(MI.getOperand(I)); - - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - return MIB; } @@ -4152,39 +4392,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { MachineInstr *DefMI; - const auto killDef = [&]() -> void { - MachineRegisterInfo &MRI = 
MBB.getParent()->getRegInfo(); - // The only user is the instruction which will be killed. - Register DefReg = DefMI->getOperand(0).getReg(); - - if (MRI.hasOneNonDBGUse(DefReg)) { - // We cannot just remove the DefMI here, calling pass will crash. - DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); - DefMI->getOperand(0).setIsDead(true); - for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) - DefMI->removeOperand(I); - if (LV) - LV->getVarInfo(DefReg).AliveBlocks.clear(); - } - - if (LIS) { - LiveInterval &DefLI = LIS->getInterval(DefReg); - - // We cannot delete the original instruction here, so hack out the use - // in the original instruction with a dummy register so we can use - // shrinkToUses to deal with any multi-use edge cases. Other targets do - // not have the complexity of deleting a use to consider here. - Register DummyReg = MRI.cloneVirtualRegister(DefReg); - for (MachineOperand &MIOp : MI.uses()) { - if (MIOp.isReg() && MIOp.getReg() == DefReg) { - MIOp.setIsUndef(true); - MIOp.setReg(DummyReg); - } - } - - LIS->shrinkToUses(&DefLI); - } - }; int64_t Imm; if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { @@ -4196,10 +4403,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Src1) .addImm(Imm) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4212,11 +4416,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Imm) .add(*Src2) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4235,12 +4435,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Imm) .add(*Src2) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - if (DefMI) - killDef(); + 
U.RemoveMIUse = DefMI; return MIB; } } @@ -4269,9 +4464,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .setMIFlags(MI.getFlags()); if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) MIB.addImm(OpSel ? OpSel->getImm() : 0); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); return MIB; } @@ -4321,24 +4513,26 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, changesVGPRIndexingMode(MI); } -bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { +bool SIInstrInfo::isAlwaysGDS(uint32_t Opcode) const { return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_ADD_GS_REG_RTN || Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } -bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { - if (!isFLAT(MI) || isFLATGlobal(MI)) - return false; - - // If scratch is not initialized, we can never access it. - if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) +bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const { + // Instructions that access scratch use FLAT encoding or BUF encodings. + if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI)) return false; // SCRATCH instructions always access scratch. if (isFLATScratch(MI)) return true; + // If FLAT_SCRATCH registers are not initialized, we can never access scratch + // via the aperture. + if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) + return false; + // If there are no memory operands then conservatively assume the flat // operation may access scratch. 
if (MI.memoperands_empty()) @@ -4569,6 +4763,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return AMDGPU::isInlinableLiteralV2F16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: + return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus()); case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::isInlinableLiteralV2BF16(Imm); @@ -4945,8 +5141,8 @@ bool SIInstrInfo::verifyCopy(const MachineInstr &MI, bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { - uint16_t Opcode = MI.getOpcode(); - const MachineFunction *MF = MI.getParent()->getParent(); + uint32_t Opcode = MI.getOpcode(); + const MachineFunction *MF = MI.getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); // FIXME: At this point the COPY verify is done only for non-ssa forms. @@ -5036,6 +5232,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2BF16: @@ -5104,7 +5301,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // aligned register constraint. 
// FIXME: We do not verify inline asm operands, but custom inline asm // verification is broken anyway - if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) { + if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO && + Opcode != AMDGPU::V_MOV_B64_PSEUDO) { const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { if (const TargetRegisterClass *SubRC = @@ -5200,7 +5398,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); + uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); if (isVOPC(BasicOpcode)) { if (!ST.hasSDWASdst() && DstIdx != -1) { // Only vcc allowed as dst on VI for VOPC @@ -5450,9 +5648,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, Desc.getNumOperands() + Desc.implicit_uses().size(); const unsigned NumImplicitOps = IsDst ? 2 : 1; - // Allow additional implicit operands. This allows a fixup done by the post - // RA scheduler where the main implicit operand is killed and implicit-defs - // are added for sub-registers that remain live after this instruction. + // Require additional implicit operands. This allows a fixup done by the + // post RA scheduler where the main implicit operand is killed and + // implicit-defs are added for sub-registers that remain live after this + // instruction. 
if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { ErrInfo = "missing implicit register operands"; return false; @@ -5734,6 +5933,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) && + MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) { + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst); + if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) == + &AMDGPU::SReg_64RegClass) || + Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) { + ErrInfo = "Instruction cannot read flat_scratch_base_hi"; + return false; + } + } + return true; } @@ -5752,17 +5962,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; case AMDGPU::S_MOV_B32: { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; } case AMDGPU::S_ADD_I32: - return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; + return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; case AMDGPU::S_SUB_I32: - return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; + return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; // FIXME: These are not consistently handled, and selected when the carry is // used. case AMDGPU::S_ADD_U32: @@ -6019,19 +6229,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); } -// FIXME: This should not be an overridable function. 
All subtarget dependent -// operand modifications should go through isLookupRegClassByHwMode in the -// generic handling. -const TargetRegisterClass * -SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - if (OpNum >= TID.getNumOperands()) - return nullptr; - const MCOperandInfo &OpInfo = TID.operands()[OpNum]; - int16_t RegClass = getOpRegClassID(OpInfo); - return RI.getRegClass(RegClass); -} - const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MCInstrDesc &Desc = get(MI.getOpcode()); @@ -6040,14 +6237,14 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, Register Reg = MI.getOperand(OpNo).getReg(); if (Reg.isVirtual()) { - const MachineRegisterInfo &MRI = - MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); return MRI.getRegClass(Reg); } return RI.getPhysRegBaseClass(Reg); } - return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo])); + int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]); + return RegClass < 0 ? 
nullptr : RI.getRegClass(RegClass); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6086,7 +6283,7 @@ unsigned SIInstrInfo::buildExtractSubReg( unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(SuperReg.getReg(), 0, NewSubIdx); + .addReg(SuperReg.getReg(), {}, NewSubIdx); return SubReg; } @@ -6131,7 +6328,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC = MRI.getRegClass(Reg); if (MO.getSubReg()) { - const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + const MachineFunction *MF = MO.getParent()->getMF(); const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); if (!SuperRC) return false; @@ -6143,7 +6340,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &MO) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx]; unsigned Opc = MI.getOpcode(); @@ -6151,7 +6348,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, // information. 
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { - constexpr const AMDGPU::OpName OpNames[] = { + constexpr AMDGPU::OpName OpNames[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; for (auto [I, OpName] : enumerate(OpNames)) { @@ -6196,6 +6393,18 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && RI.isSGPRReg(MRI, MO.getReg())) return false; + + if (ST.hasFlatScratchHiInB64InstHazard() && + MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) { + if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) { + if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) == + 64) + return false; + } + if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64) + return false; + } + return true; } @@ -6213,8 +6422,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO) const { - constexpr const unsigned NumOps = 3; - constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + constexpr unsigned NumOps = 3; + constexpr AMDGPU::OpName OpNames[NumOps * 2] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; @@ -6245,7 +6454,7 @@ bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; @@ -6670,7 +6879,7 @@ 
Register SIInstrInfo::readlaneVGPRToSGPR( Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), SGPR) - .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); + .addReg(SrcReg, {}, RI.getSubRegFromChannel(i)); SRegs.push_back(SGPR); } @@ -6799,7 +7008,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, return; const TargetRegisterClass *DeclaredRC = - getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI); + getRegClass(MI.getDesc(), SAddr->getOperandNo()); Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC); SAddr->setReg(ToSGPR); @@ -6898,7 +7107,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, ScalarOp->setIsKill(); } else { SmallVector<Register, 8> ReadlanePieces; - unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); + RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); @@ -7141,7 +7350,7 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { MachineBasicBlock * SIInstrInfo::legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT) const { - MachineFunction &MF = *MI.getParent()->getParent(); + MachineFunction &MF = *MI.getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock *CreatedBB = nullptr; @@ -7169,44 +7378,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } - // Legalize REG_SEQUENCE and PHI - // The register class of the operands much be the same type as the register + // Legalize PHI + // The register class of the operands must be the same type as the register // class of the output. 
if (MI.getOpcode() == AMDGPU::PHI) { - const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; - for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { - if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) - continue; - const TargetRegisterClass *OpRC = - MRI.getRegClass(MI.getOperand(i).getReg()); - if (RI.hasVectorRegisters(OpRC)) { - VRC = OpRC; - } else { - SRC = OpRC; - } - } - - // If any of the operands are VGPR registers, then they all most be - // otherwise we will create illegal VGPR->SGPR copies when legalizing - // them. - if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { - if (!VRC) { - assert(SRC); - if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { - VRC = &AMDGPU::VReg_1RegClass; - } else - VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) - ? RI.getEquivalentAGPRClass(SRC) - : RI.getEquivalentVGPRClass(SRC); - } else { - VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) - ? RI.getEquivalentAGPRClass(VRC) - : RI.getEquivalentVGPRClass(VRC); - } - RC = VRC; - } else { - RC = SRC; - } + const TargetRegisterClass *VRC = getOpRegClass(MI, 0); + assert(!RI.isSGPRClass(VRC)); // Update all the operands so they have the same type. for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { @@ -7220,7 +7397,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // Avoid creating no-op copies with the same src and dst reg class. These // confuse some of the machine passes. 
- legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); + legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc()); } } @@ -7426,18 +7603,18 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 const DebugLoc &DL = MI.getDebugLoc(); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) - .addDef(CondReg0) - .addReg(RsrcPtr, 0, AMDGPU::sub0) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0) - .addImm(0); + .addDef(CondReg0) + .addReg(RsrcPtr, {}, AMDGPU::sub0) + .addReg(VAddr->getReg(), {}, AMDGPU::sub0) + .addImm(0); // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) - .addDef(CondReg1, RegState::Dead) - .addReg(RsrcPtr, 0, AMDGPU::sub1) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1) - .addReg(CondReg0, RegState::Kill) - .addImm(0); + .addDef(CondReg1, RegState::Dead) + .addReg(RsrcPtr, {}, AMDGPU::sub1) + .addReg(VAddr->getReg(), {}, AMDGPU::sub1) + .addReg(CondReg0, RegState::Kill) + .addImm(0); // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) @@ -7510,9 +7687,9 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(RsrcPtr, 0, AMDGPU::sub0) + .addReg(RsrcPtr, {}, AMDGPU::sub0) .addImm(AMDGPU::sub0) - .addReg(RsrcPtr, 0, AMDGPU::sub1) + .addReg(RsrcPtr, {}, AMDGPU::sub1) .addImm(AMDGPU::sub1); } else { // Legalize a VGPR Rsrc and soffset together. 
@@ -7630,6 +7807,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); + const DebugLoc &DL = Inst.getDebugLoc(); + // Handle some special cases switch (Opcode) { default: @@ -7781,6 +7960,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, Inst.eraseFromParent(); return; + case AMDGPU::S_ABSDIFF_I32: + lowerScalarAbsDiff(Worklist, Inst); + Inst.eraseFromParent(); + return; + case AMDGPU::S_CBRANCH_SCC0: case AMDGPU::S_CBRANCH_SCC1: { // Clear unused bits of vcc @@ -7867,7 +8051,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest0 = Inst.getOperand(0); MachineOperand &Dest1 = Inst.getOperand(1); MachineOperand &Src0 = Inst.getOperand(2); @@ -7887,12 +8070,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperands(*NewInstr, MDT); MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); Inst.eraseFromParent(); } return; + case AMDGPU::S_LSHL1_ADD_U32: + case AMDGPU::S_LSHL2_ADD_U32: + case AMDGPU::S_LSHL3_ADD_U32: + case AMDGPU::S_LSHL4_ADD_U32: { + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 + : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 + : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 
3 + : 4); + + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = + BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg) + .add(Src0) + .addImm(ShiftAmt) + .add(Src1); + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest.getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; case AMDGPU::S_CSELECT_B32: case AMDGPU::S_CSELECT_B64: lowerSelect(Worklist, Inst, MDT); @@ -7943,7 +8151,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7983,13 +8191,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; } case AMDGPU::S_CVT_HI_F32_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.useRealTrue16Insts()) { @@ -7997,7 +8204,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, .add(Inst.getOperand(1)); BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers - .addReg(TmpReg, 0, AMDGPU::hi16) + .addReg(TmpReg, {}, AMDGPU::hi16) .addImm(0) // clamp .addImm(0) // omod .addImm(0); // op_sel0 @@ -8019,7 +8226,6 @@ 
void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F32: case AMDGPU::S_MAXIMUM_F32: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers @@ -8037,7 +8243,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F16: case AMDGPU::S_MAXIMUM_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8061,7 +8266,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::V_S_RCP_F16_e64: case AMDGPU::V_S_RSQ_F16_e64: case AMDGPU::V_S_SQRT_F16_e64: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8110,26 +8314,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; } - if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && - NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { - // Instead of creating a copy where src and dst are the same register - // class, we just replace all uses of dst with src. These kinds of - // copies interfere with the heuristics MachineSink uses to decide - // whether or not to split a critical edge. Since the pass assumes - // that copies will end up as machine instructions and not be - // eliminated. 
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) { Register NewDstReg = Inst.getOperand(1).getReg(); - MRI.replaceRegWith(DstReg, NewDstReg); - MRI.clearKillFlags(NewDstReg); - Inst.getOperand(0).setReg(DstReg); - Inst.eraseFromParent(); - // Legalize t16 operand since replaceReg is called after addUsersToVALU - for (MachineOperand &MO : - make_early_inc_range(MRI.use_operands(NewDstReg))) { - legalizeOperandsVALUt16(*MO.getParent(), MRI); + const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg); + if (const TargetRegisterClass *CommonRC = + RI.getCommonSubClass(NewDstRC, SrcRC)) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. 
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, NewDstReg); + MRI.clearKillFlags(NewDstReg); + Inst.getOperand(0).setReg(DstReg); + + if (!MRI.constrainRegClass(NewDstReg, CommonRC)) + llvm_unreachable("failed to constrain register"); + + Inst.eraseFromParent(); + // Legalize t16 operand since replaceReg is called after addUsersToVALU + for (MachineOperand &MO : + make_early_inc_range(MRI.use_operands(NewDstReg))) { + legalizeOperandsVALUt16(*MO.getParent(), MRI); + } + + return; } - return; } // If this is a v2s copy between 16bit and 32bit reg, @@ -8181,7 +8393,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -8268,7 +8480,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, std::pair<bool, MachineBasicBlock *> SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { - if (ST.hasAddNoCarry()) { + if (ST.hasAddNoCarryInsts()) { // Assume there is no user of scc since we don't select this in that case. // Since scc isn't used, it doesn't really matter if the i32 or u32 variant // is used. 
@@ -8307,7 +8519,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); @@ -8388,15 +8600,15 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned SubOp = ST.hasAddNoCarry() ? - AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; + unsigned SubOp = ST.hasAddNoCarryInsts() ? 
AMDGPU::V_SUB_U32_e32 + : AMDGPU::V_SUB_CO_U32_e32; BuildMI(MBB, MII, DL, get(SubOp), TmpReg) .addImm(0) @@ -8410,6 +8622,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } +void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist, + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src1 = Inst.getOperand(1); + MachineOperand &Src2 = Inst.getOperand(2); + Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32 + : AMDGPU::V_SUB_CO_U32_e32; + + BuildMI(MBB, MII, DL, get(SubOp), SubResultReg) + .addReg(Src1.getReg()) + .addReg(Src2.getReg()); + + BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(SubResultReg) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -8541,7 +8784,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -8775,7 +9018,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = 
Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -8937,7 +9180,7 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo) - .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) + .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0) .addImm(0) .addImm(BitWidth); @@ -8961,14 +9204,14 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) - .addImm(31) - .addReg(Src.getReg(), 0, AMDGPU::sub0); + .addImm(31) + .addReg(Src.getReg(), {}, AMDGPU::sub0); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) - .addReg(Src.getReg(), 0, AMDGPU::sub0) - .addImm(AMDGPU::sub0) - .addReg(TmpReg) - .addImm(AMDGPU::sub1); + .addReg(Src.getReg(), {}, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(TmpReg) + .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); @@ -8993,8 +9236,8 @@ void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist, const MCInstrDesc &InstDesc = get(Opcode); bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32; - unsigned OpcodeAdd = - ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; + unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 + : AMDGPU::V_ADD_CO_U32_e32; const TargetRegisterClass *SrcRC = Src.isReg() ? 
MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; @@ -9072,6 +9315,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, MachineOperand &Src1 = Inst.getOperand(2); const DebugLoc &DL = Inst.getDebugLoc(); + if (ST.useRealTrue16Insts()) { + Register SrcReg0, SrcReg1; + if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) { + SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0); + } else { + SrcReg0 = Src0.getReg(); + } + + if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) { + SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1); + } else { + SrcReg1 = Src1.getReg(); + } + + bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass); + bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass); + + auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg); + switch (Inst.getOpcode()) { + case AMDGPU::S_PACK_LL_B32_B16: + NewMI + .addReg(SrcReg0, {}, + isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, {}, + isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_LH_B32_B16: + NewMI + .addReg(SrcReg0, {}, + isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, {}, AMDGPU::hi16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_HL_B32_B16: + NewMI.addReg(SrcReg0, {}, AMDGPU::hi16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, {}, + isSrc1Reg16 ? 
AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_HH_B32_B16: + NewMI.addReg(SrcReg0, {}, AMDGPU::hi16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, {}, AMDGPU::hi16) + .addImm(AMDGPU::hi16); + break; + default: + llvm_unreachable("unhandled s_pack_* instruction"); + } + + MachineOperand &Dest = Inst.getOperand(0); + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + return; + } + switch (Inst.getOpcode()) { case AMDGPU::S_PACK_LL_B32_B16: { Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -9136,7 +9440,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { @@ -9154,7 +9458,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false); if (SCCIdx != -1) { if (MI.isCopy()) { - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); Register DestReg = MI.getOperand(0).getReg(); MRI.replaceRegWith(DestReg, NewCond); @@ -9266,7 +9570,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, return SGPRReg; Register UsedSGPRs[3] = {Register()}; - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { int Idx = OpIndices[i]; @@ -9490,7 +9794,14 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { LiteralSize = 8; break; case AMDGPU::OPERAND_REG_IMM_INT64: - if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false)) + // A 32-bit literal is only valid when the value fits 
in BOTH signed + // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code + // emitter's getLit64Encoding logic. This is because of the lack of + // ability to tell signedness of the literal, therefore we need to + // be conservative and assume values outside this range require a + // 64-bit literal encoding (8 bytes). + if (!Op.isImm() || !isInt<32>(Op.getImm()) || + !isUInt<32>(Op.getImm())) LiteralSize = 8; break; } @@ -9516,7 +9827,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return getInstBundleSize(MI); case TargetOpcode::INLINEASM: case TargetOpcode::INLINEASM_BR: { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); const char *AsmStr = MI.getOperand(0).getSymbolName(); return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); } @@ -9628,6 +9939,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { {MONoClobber, "amdgpu-noclobber"}, {MOLastUse, "amdgpu-last-use"}, {MOCooperative, "amdgpu-cooperative"}, + {MOThreadPrivate, "amdgpu-thread-private"}, }; return ArrayRef(TargetFlags); @@ -9643,6 +9955,30 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, return AMDGPU::COPY; } +bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const { + uint32_t Opcode = MI.getOpcode(); + // Check if it is SGPR spill or wwm-register spill Opcode. + if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode)) + return true; + + const MachineFunction *MF = MI.getMF(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + // See if this is a live-range split instruction inserted for SGPR or + // wwm-register. The implicit def inserted for wwm-registers should also be + // included as they can appear at the bb begin.
+ bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit); + if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF) + return false; + + Register Reg = MI.getOperand(0).getReg(); + if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg))) + return IsLRSplitInst; + + return MFI->isWWMReg(Reg); +} + bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, Register Reg) const { // We need to handle instructions which may be inserted during register @@ -9651,20 +9987,16 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, // needed by the prolog. However, the insertions for scalar registers can // always be placed at the BB top as they are independent of the exec mask // value. - const MachineFunction *MF = MI.getParent()->getParent(); bool IsNullOrVectorRegister = true; if (Reg) { + const MachineFunction *MF = MI.getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)); } - uint16_t Opcode = MI.getOpcode(); - const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); return IsNullOrVectorRegister && - (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) || - (Opcode == AMDGPU::IMPLICIT_DEF && - MFI->isWWMReg(MI.getOperand(0).getReg())) || - (!MI.isTerminator() && Opcode != AMDGPU::COPY && + (canAddToBBProlog(MI) || + (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI))); } @@ -9673,7 +10005,7 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const { - if (ST.hasAddNoCarry()) + if (ST.hasAddNoCarryInsts()) return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -9689,7 +10021,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, const DebugLoc &DL, Register DestReg, RegScavenger &RS) const { - if (ST.hasAddNoCarry()) + if (ST.hasAddNoCarryInsts()) return BuildMI(MBB, 
I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); // If available, prefer to use vcc. @@ -9746,6 +10078,9 @@ void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { if (MI.isInlineAsm()) return; + if (MI.getNumOperands() < MI.getNumExplicitOperands()) + return; + for (auto &Op : MI.implicit_operands()) { if (Op.isReg() && Op.getReg() == AMDGPU::VCC) Op.setReg(AMDGPU::VCC_LO); @@ -9928,6 +10263,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) { case AMDGPUSubtarget::GFX12: return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250 : SIEncodingFamily::GFX12; + case AMDGPUSubtarget::GFX13: + return SIEncodingFamily::GFX13; } llvm_unreachable("Unknown subtarget generation!"); } @@ -9986,7 +10323,8 @@ static bool isRenamedInGFX9(int Opcode) { } int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { - Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); + assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) && + "SIInsertWaitcnts should have promoted soft waitcnt instructions!"); unsigned Gen = subtargetEncodingFamily(ST); @@ -10019,9 +10357,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { Opcode = MFMAOp; } - int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); + int64_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen); - if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts()) + if (MCOp == (uint32_t)-1 && ST.hasGFX1250Insts()) MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12); // -1 means that Opcode is already a native instruction. 
@@ -10029,20 +10367,20 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { return Opcode; if (ST.hasGFX90AInsts()) { - uint16_t NMCOp = (uint16_t)-1; + uint32_t NMCOp = (uint32_t)-1; if (ST.hasGFX940Insts()) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940); - if (NMCOp == (uint16_t)-1) + if (NMCOp == (uint32_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); - if (NMCOp == (uint16_t)-1) + if (NMCOp == (uint32_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); - if (NMCOp != (uint16_t)-1) + if (NMCOp != (uint32_t)-1) MCOp = NMCOp; } - // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // (uint32_t)-1 means that Opcode is a pseudo instruction that has // no encoding in the given subtarget generation. - if (MCOp == (uint16_t)-1) + if (MCOp == (uint32_t)-1) return -1; if (isAsmOnlyOpcode(MCOp)) @@ -10097,7 +10435,7 @@ static bool followSubRegDef(MachineInstr &MI, } MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { assert(MRI.isSSA()); if (!P.Reg.isVirtual()) return nullptr; @@ -10246,7 +10584,7 @@ MachineInstr *SIInstrInfo::createPHISourceCopy( InsPt++; return BuildMI(MBB, InsPt, DL, get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst) - .addReg(Src, 0, SrcSubReg) + .addReg(Src, {}, SrcSubReg) .addReg(AMDGPU::EXEC, RegState::Implicit); } return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, @@ -10310,6 +10648,14 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return SchedModel.computeInstrLatency(&MI); } +const MachineOperand & +SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const { + if (const MachineOperand *CallAddrOp = + getNamedOperand(MI, AMDGPU::OpName::src0)) + return *CallAddrOp; + return TargetInstrInfo::getCalleeOperand(MI); +} + InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { const 
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); @@ -10385,6 +10731,12 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::Default; } +const MIRFormatter *SIInstrInfo::getMIRFormatter() const { + if (!Formatter) + Formatter = std::make_unique<AMDGPUMIRFormatter>(ST); + return Formatter.get(); +} + InstructionUniformity SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { @@ -10438,7 +10790,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::Default; } - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo(); // FIXME: It's conceptually broken to report this for an instruction, and not @@ -10555,6 +10907,135 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +static bool isSCCDeadOnExit(MachineBasicBlock *MBB) { + for (MachineBasicBlock *S : MBB->successors()) { + if (S->isLiveIn(AMDGPU::SCC)) + return false; + } + return true; +} + +// Invert all uses of SCC following SCCDef because SCCDef may be deleted and +// (incoming SCC) = !(SCC defined by SCCDef). +// Return true if all uses can be re-written, false otherwise. +bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { + MachineBasicBlock *MBB = SCCDef->getParent(); + SmallVector<MachineInstr *> InvertInstr; + bool SCCIsDead = false; + + // Scan instructions for SCC uses that need to be inverted until SCC is dead. 
+ constexpr unsigned ScanLimit = 12; + unsigned Count = 0; + for (MachineInstr &MI : + make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) { + if (++Count > ScanLimit) + return false; + if (MI.readsRegister(AMDGPU::SCC, &RI)) { + if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 || + MI.getOpcode() == AMDGPU::S_CSELECT_B64 || + MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 || + MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1) + InvertInstr.push_back(&MI); + else + return false; + } + if (MI.definesRegister(AMDGPU::SCC, &RI)) { + SCCIsDead = true; + break; + } + } + if (!SCCIsDead && isSCCDeadOnExit(MBB)) + SCCIsDead = true; + + // SCC may have more uses. Can't invert all of them. + if (!SCCIsDead) + return false; + + // Invert uses + for (MachineInstr *MI : InvertInstr) { + if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 || + MI->getOpcode() == AMDGPU::S_CSELECT_B64) { + swapOperands(*MI); + } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 || + MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) { + MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 + ? AMDGPU::S_CBRANCH_SCC1 + : AMDGPU::S_CBRANCH_SCC0)); + } else { + llvm_unreachable("SCC used but no inversion handling"); + } + } + return true; +} + +// SCC is already valid after SCCValid. +// SCCRedefine will redefine SCC to the same value already available after +// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and +// update kill/dead flags if necessary. 
+bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + bool NeedInversion) const { + MachineInstr *KillsSCC = nullptr; + if (SCCValid->getParent() != SCCRedefine->getParent()) + return false; + for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), + SCCRedefine->getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + if (NeedInversion && !invertSCCUse(SCCRedefine)) + return false; + if (MachineOperand *SccDef = + SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + SCCRedefine->eraseFromParent(); + return true; +} + +static bool foldableSelect(const MachineInstr &Def) { + if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 && + Def.getOpcode() != AMDGPU::S_CSELECT_B64) + return false; + bool Op1IsNonZeroImm = + Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0; + if (!Op1IsNonZeroImm || !Op2IsZeroImm) + return false; + return true; +} + +static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, + unsigned &NewDefOpc) { + // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0. + // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead. 
+ if (Def.getOpcode() != AMDGPU::S_ADD_I32 && + Def.getOpcode() != AMDGPU::S_ADD_U32) + return false; + const MachineOperand &AddSrc1 = Def.getOperand(1); + const MachineOperand &AddSrc2 = Def.getOperand(2); + int64_t addend; + + if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) && + (!AddSrc2.isImm() || AddSrc2.getImm() != 1) && + (!getFoldableImm(&AddSrc1, addend) || addend != 1) && + (!getFoldableImm(&AddSrc2, addend) || addend != 1)) + return false; + + if (Def.getOpcode() == AMDGPU::S_ADD_I32) { + const MachineOperand *SccDef = + Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); + if (!SccDef->isDead()) + return false; + NewDefOpc = AMDGPU::S_ADD_U32; + } + NeedInversion = !NeedInversion; + return true; +} + bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, @@ -10565,6 +11046,72 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) return false; + const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, + this](bool NeedInversion) -> bool { + if (CmpValue != 0) + return false; + + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) + return false; + + // For S_OP that set SCC = DST!=0, do the transformation + // + // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...) + // + // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and + // do the transformation: + // + // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...) 
+ // + // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value + // for S_CSELECT* already has the same value that will be calculated by + // s_cmp_lg_* + // + // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* + // (non-zero imm), 0) + + unsigned NewDefOpc = Def->getOpcode(); + if (!setsSCCIfResultIsNonZero(*Def) && + !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) && + !foldableSelect(*Def)) + return false; + + if (!optimizeSCC(Def, &CmpInstr, NeedInversion)) + return false; + + if (NewDefOpc != Def->getOpcode()) + Def->setDesc(get(NewDefOpc)); + + // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit + // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a + // 64-bit foldableSelect then delete s_or_b32 in the sequence: + // sX = s_cselect_b64 (non-zero imm), 0 + // sLo = copy sX.sub0 + // sHi = copy sX.sub1 + // sY = s_or_b32 sLo, sHi + if (Def->getOpcode() == AMDGPU::S_OR_B32 && + MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { + const MachineOperand &OrOpnd1 = Def->getOperand(1); + const MachineOperand &OrOpnd2 = Def->getOperand(2); + if (OrOpnd1.isReg() && OrOpnd2.isReg()) { + MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg()); + MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg()); + if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 && + Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && + Def2->getOperand(1).isReg() && + Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && + Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && + Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { + MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); + if (Select && foldableSelect(*Select)) + optimizeSCC(Select, Def, /*NeedInversion=*/false); + } + } + } + return true; + }; + const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, this](int64_t ExpectedValue, unsigned SrcSize, bool IsReversible, bool IsSigned) -> bool { @@ -10591,8 
+11138,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; if (Def->getOpcode() != AMDGPU::S_AND_B32 && @@ -10639,17 +11186,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); - I != E; ++I) { - if (I->modifiesRegister(AMDGPU::SCC, &RI) || - I->killsRegister(AMDGPU::SCC, &RI)) - return false; - } - - MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); - SccDef->setIsDead(false); - CmpInstr.eraseFromParent(); + if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false)) + return false; if (!MRI->use_nodbg_empty(DefReg)) { assert(!IsReversedCC); @@ -10679,7 +11217,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_EQ_I32: case AMDGPU::S_CMPK_EQ_U32: case AMDGPU::S_CMPK_EQ_I32: - return optimizeCmpAnd(1, 32, true, false); + return optimizeCmpAnd(1, 32, true, false) || + optimizeCmpSelect(/*NeedInversion=*/true); case AMDGPU::S_CMP_GE_U32: case AMDGPU::S_CMPK_GE_U32: return optimizeCmpAnd(1, 32, false, false); @@ -10692,7 +11231,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_LG_I32: case AMDGPU::S_CMPK_LG_U32: case AMDGPU::S_CMPK_LG_I32: - return optimizeCmpAnd(0, 32, true, false); + return optimizeCmpAnd(0, 32, true, false) || + optimizeCmpSelect(/*NeedInversion=*/false); case AMDGPU::S_CMP_GT_U32: case AMDGPU::S_CMPK_GT_U32: return optimizeCmpAnd(0, 32, false, false); @@ -10700,7 +11240,8 @@ bool 
SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMPK_GT_I32: return optimizeCmpAnd(0, 32, false, true); case AMDGPU::S_CMP_LG_U64: - return optimizeCmpAnd(0, 64, true, false); + return optimizeCmpAnd(0, 64, true, false) || + optimizeCmpSelect(/*NeedInversion=*/false); } return false; @@ -10731,7 +11272,7 @@ void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI, MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass : &AMDGPU::VReg_64_Align2RegClass); BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR) - .addReg(DataReg, 0, Op.getSubReg()) + .addReg(DataReg, {}, Op.getSubReg()) .addImm(AMDGPU::sub0) .addReg(Undef) .addImm(AMDGPU::sub1); @@ -10751,7 +11292,7 @@ bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const { if (!isWMMA(MI) && !isSWMMAC(MI)) return false; - if (AMDGPU::isGFX1250(ST)) + if (ST.hasGFX1250Insts()) return AMDGPU::getWMMAIsXDL(MI.getOpcode()); return true; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e979eeb..0b54513 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -52,6 +52,11 @@ static const MachineMemOperand::Flags MOLastUse = static const MachineMemOperand::Flags MOCooperative = MachineMemOperand::MOTargetFlag3; +/// Mark the MMO of accesses to memory locations that are +/// never written to by other threads. +static const MachineMemOperand::Flags MOThreadPrivate = + MachineMemOperand::MOTargetFlag4; + /// Utility to store machine instructions worklist. 
struct SIInstrWorklist { SIInstrWorklist() = default; @@ -88,6 +93,8 @@ private: }; class SIInstrInfo final : public AMDGPUGenInstrInfo { + struct ThreeAddressUpdates; + private: const SIRegisterInfo RI; const GCNSubtarget &ST; @@ -123,6 +130,11 @@ public: unsigned SubIdx, const TargetRegisterClass *SubRC) const; private: + bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + bool NeedInversion) const; + + bool invertSCCUse(MachineInstr *SCCDef) const; + void swapOperands(MachineInstr &Inst) const; std::pair<bool, MachineBasicBlock *> @@ -134,6 +146,8 @@ private: void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const; void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, @@ -170,7 +184,7 @@ private: void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; @@ -190,6 +204,9 @@ private: bool resultDependsOnExec(const MachineInstr &MI) const; + MachineInstr *convertToThreeAddressImpl(MachineInstr &MI, + ThreeAddressUpdates &Updates) const; + protected: /// If the specific machine instruction is a instruction that moves/copies /// value from one register to another register return destination and source @@ -291,6 +308,8 @@ public: bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override; + std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const; + unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, @@ -302,22 +321,20 @@ public: void storeRegToStackSlot( MachineBasicBlock &MBB, 
MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, + unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp // instructions. Returns a pair of generated instructions. @@ -421,6 +438,9 @@ public: void removeModOperands(MachineInstr &MI) const; + void mutateAndCleanupImplicit(MachineInstr &MI, + const MCInstrDesc &NewDesc) const; + /// Return the extracted immediate value in a subregister use from a constant /// materialized in a super register. 
/// @@ -446,7 +466,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SALU; } - bool isSALU(uint16_t Opcode) const { + bool isSALU(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } @@ -454,7 +474,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VALU; } - bool isVALU(uint16_t Opcode) const { + bool isVALU(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VALU; } @@ -462,7 +482,7 @@ public: return isMIMG(MI) || isVSAMPLE(MI) || isVIMAGE(MI); } - bool isImage(uint16_t Opcode) const { + bool isImage(uint32_t Opcode) const { return isMIMG(Opcode) || isVSAMPLE(Opcode) || isVIMAGE(Opcode); } @@ -470,7 +490,7 @@ public: return isMUBUF(MI) || isMTBUF(MI) || isImage(MI) || isFLAT(MI); } - bool isVMEM(uint16_t Opcode) const { + bool isVMEM(uint32_t Opcode) const { return isMUBUF(Opcode) || isMTBUF(Opcode) || isImage(Opcode); } @@ -478,7 +498,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOP1; } - bool isSOP1(uint16_t Opcode) const { + bool isSOP1(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP1; } @@ -486,7 +506,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOP2; } - bool isSOP2(uint16_t Opcode) const { + bool isSOP2(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP2; } @@ -494,7 +514,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOPC; } - bool isSOPC(uint16_t Opcode) const { + bool isSOPC(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPC; } @@ -502,7 +522,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOPK; } - bool isSOPK(uint16_t Opcode) const { + bool isSOPK(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPK; } @@ -510,7 +530,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOPP; } - bool isSOPP(uint16_t Opcode) const { + bool isSOPP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPP; } @@ -518,7 +538,7 @@ public: return MI.getDesc().TSFlags & 
SIInstrFlags::IsPacked; } - bool isPacked(uint16_t Opcode) const { + bool isPacked(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsPacked; } @@ -526,7 +546,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VOP1; } - bool isVOP1(uint16_t Opcode) const { + bool isVOP1(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP1; } @@ -534,7 +554,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VOP2; } - bool isVOP2(uint16_t Opcode) const { + bool isVOP2(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP2; } @@ -544,13 +564,13 @@ public: static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); } - bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); } + bool isVOP3(uint32_t Opcode) const { return isVOP3(get(Opcode)); } static bool isSDWA(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SDWA; } - bool isSDWA(uint16_t Opcode) const { + bool isSDWA(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SDWA; } @@ -558,7 +578,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VOPC; } - bool isVOPC(uint16_t Opcode) const { + bool isVOPC(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOPC; } @@ -566,7 +586,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::MUBUF; } - bool isMUBUF(uint16_t Opcode) const { + bool isMUBUF(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MUBUF; } @@ -574,15 +594,19 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::MTBUF; } - bool isMTBUF(uint16_t Opcode) const { + bool isMTBUF(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MTBUF; } + static bool isBUF(const MachineInstr &MI) { + return isMUBUF(MI) || isMTBUF(MI); + } + static bool isSMRD(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SMRD; } - bool isSMRD(uint16_t Opcode) const { + bool isSMRD(uint32_t Opcode) const { return get(Opcode).TSFlags & 
SIInstrFlags::SMRD; } @@ -592,33 +616,35 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::DS; } - bool isDS(uint16_t Opcode) const { + bool isDS(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DS; } static bool isLDSDMA(const MachineInstr &MI) { - return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI)); + return (isVALU(MI) && (isMUBUF(MI) || isFLAT(MI))) || + (MI.getDesc().TSFlags & SIInstrFlags::TENSOR_CNT); } - bool isLDSDMA(uint16_t Opcode) { - return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode)); + bool isLDSDMA(uint32_t Opcode) { + return (isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode))) || + (get(Opcode).TSFlags & SIInstrFlags::TENSOR_CNT); } static bool isGWS(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::GWS; } - bool isGWS(uint16_t Opcode) const { + bool isGWS(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::GWS; } - bool isAlwaysGDS(uint16_t Opcode) const; + bool isAlwaysGDS(uint32_t Opcode) const; static bool isMIMG(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::MIMG; } - bool isMIMG(uint16_t Opcode) const { + bool isMIMG(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MIMG; } @@ -626,7 +652,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VIMAGE; } - bool isVIMAGE(uint16_t Opcode) const { + bool isVIMAGE(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VIMAGE; } @@ -634,7 +660,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VSAMPLE; } - bool isVSAMPLE(uint16_t Opcode) const { + bool isVSAMPLE(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VSAMPLE; } @@ -642,7 +668,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::Gather4; } - bool isGather4(uint16_t Opcode) const { + bool isGather4(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::Gather4; } @@ -657,7 +683,7 @@ public: return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch); } - bool 
isSegmentSpecificFLAT(uint16_t Opcode) const { + bool isSegmentSpecificFLAT(uint32_t Opcode) const { auto Flags = get(Opcode).TSFlags; return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch); } @@ -666,7 +692,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FlatGlobal; } - bool isFLATGlobal(uint16_t Opcode) const { + bool isFLATGlobal(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FlatGlobal; } @@ -674,20 +700,20 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FlatScratch; } - bool isFLATScratch(uint16_t Opcode) const { + bool isFLATScratch(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FlatScratch; } // Any FLAT encoded instruction, including global_* and scratch_*. - bool isFLAT(uint16_t Opcode) const { + bool isFLAT(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } - /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with - /// SCRATCH_ memory operands. + /// \returns true for SCRATCH_ instructions, or FLAT/BUF instructions unless + /// the MMOs do not include scratch. /// Conservatively correct; will return true if \p MI cannot be proven /// to not hit scratch. - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + bool mayAccessScratch(const MachineInstr &MI) const; /// \returns true for FLAT instructions that can access VMEM. bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; @@ -695,7 +721,7 @@ public: /// \returns true for FLAT instructions that can access LDS. 
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - static bool isBlockLoadStore(uint16_t Opcode) { + static bool isBlockLoadStore(uint32_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: @@ -709,6 +735,52 @@ public: } } + static bool setsSCCIfResultIsNonZero(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::S_ABSDIFF_I32: + case AMDGPU::S_ABS_I32: + case AMDGPU::S_AND_B32: + case AMDGPU::S_AND_B64: + case AMDGPU::S_ANDN2_B32: + case AMDGPU::S_ANDN2_B64: + case AMDGPU::S_ASHR_I32: + case AMDGPU::S_ASHR_I64: + case AMDGPU::S_BCNT0_I32_B32: + case AMDGPU::S_BCNT0_I32_B64: + case AMDGPU::S_BCNT1_I32_B32: + case AMDGPU::S_BCNT1_I32_B64: + case AMDGPU::S_BFE_I32: + case AMDGPU::S_BFE_I64: + case AMDGPU::S_BFE_U32: + case AMDGPU::S_BFE_U64: + case AMDGPU::S_LSHL_B32: + case AMDGPU::S_LSHL_B64: + case AMDGPU::S_LSHR_B32: + case AMDGPU::S_LSHR_B64: + case AMDGPU::S_NAND_B32: + case AMDGPU::S_NAND_B64: + case AMDGPU::S_NOR_B32: + case AMDGPU::S_NOR_B64: + case AMDGPU::S_NOT_B32: + case AMDGPU::S_NOT_B64: + case AMDGPU::S_OR_B32: + case AMDGPU::S_OR_B64: + case AMDGPU::S_ORN2_B32: + case AMDGPU::S_ORN2_B64: + case AMDGPU::S_QUADMASK_B32: + case AMDGPU::S_QUADMASK_B64: + case AMDGPU::S_WQM_B32: + case AMDGPU::S_WQM_B64: + case AMDGPU::S_XNOR_B32: + case AMDGPU::S_XNOR_B64: + case AMDGPU::S_XOR_B32: + case AMDGPU::S_XOR_B64: + return true; + default: + return false; + } + } + static bool isEXP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::EXP; } @@ -721,7 +793,7 @@ public: Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND1; } - bool isEXP(uint16_t Opcode) const { + bool isEXP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::EXP; } @@ -729,7 +801,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet; } - bool isAtomicNoRet(uint16_t Opcode) const { + bool isAtomicNoRet(uint32_t Opcode) const { return get(Opcode).TSFlags & 
SIInstrFlags::IsAtomicNoRet; } @@ -737,7 +809,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet; } - bool isAtomicRet(uint16_t Opcode) const { + bool isAtomicRet(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsAtomicRet; } @@ -746,13 +818,17 @@ public: SIInstrFlags::IsAtomicNoRet); } - bool isAtomic(uint16_t Opcode) const { + bool isAtomic(uint32_t Opcode) const { return get(Opcode).TSFlags & (SIInstrFlags::IsAtomicRet | SIInstrFlags::IsAtomicNoRet); } static bool mayWriteLDSThroughDMA(const MachineInstr &MI) { - return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; + unsigned Opc = MI.getOpcode(); + // Exclude instructions that read FROM LDS (not write to it) + return isLDSDMA(MI) && Opc != AMDGPU::BUFFER_STORE_LDS_DWORD && + Opc != AMDGPU::TENSOR_STORE_FROM_LDS && + Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2; } static bool isSBarrierSCCWrite(unsigned Opcode) { @@ -771,7 +847,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::WQM; } - bool isWQM(uint16_t Opcode) const { + bool isWQM(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::WQM; } @@ -779,7 +855,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM; } - bool isDisableWQM(uint16_t Opcode) const { + bool isDisableWQM(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DisableWQM; } @@ -794,7 +870,7 @@ public: (isSpill(MI) && isVALU(MI)); } - bool isVGPRSpill(uint16_t Opcode) const { + bool isVGPRSpill(uint32_t Opcode) const { return Opcode != AMDGPU::SI_SPILL_S32_TO_VGPR && Opcode != AMDGPU::SI_RESTORE_S32_FROM_VGPR && (isSpill(Opcode) && isVALU(Opcode)); @@ -806,13 +882,13 @@ public: (isSpill(MI) && isSALU(MI)); } - bool isSGPRSpill(uint16_t Opcode) const { + bool isSGPRSpill(uint32_t Opcode) const { return Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR || Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR || (isSpill(Opcode) && isSALU(Opcode)); } - bool isSpill(uint16_t Opcode) const { + bool isSpill(uint32_t 
Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::Spill; } @@ -822,7 +898,7 @@ public: static bool isSpill(const MachineInstr &MI) { return isSpill(MI.getDesc()); } - static bool isWWMRegSpillOpcode(uint16_t Opcode) { + static bool isWWMRegSpillOpcode(uint32_t Opcode) { return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE || Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE || @@ -838,7 +914,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::DPP; } - bool isDPP(uint16_t Opcode) const { + bool isDPP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DPP; } @@ -846,7 +922,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::TRANS; } - bool isTRANS(uint16_t Opcode) const { + bool isTRANS(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::TRANS; } @@ -854,7 +930,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VOP3P; } - bool isVOP3P(uint16_t Opcode) const { + bool isVOP3P(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP3P; } @@ -862,7 +938,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VINTRP; } - bool isVINTRP(uint16_t Opcode) const { + bool isVINTRP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VINTRP; } @@ -872,13 +948,18 @@ public: static bool isMAI(const MachineInstr &MI) { return isMAI(MI.getDesc()); } - bool isMAI(uint16_t Opcode) const { return isMAI(get(Opcode)); } + bool isMAI(uint32_t Opcode) const { return isMAI(get(Opcode)); } static bool isMFMA(const MachineInstr &MI) { return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; } + bool isMFMA(uint32_t Opcode) const { + return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64; + } + static bool isDOT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; } @@ -887,7 +968,7 @@ public: return MI.getDesc().TSFlags & 
SIInstrFlags::IsWMMA; } - bool isWMMA(uint16_t Opcode) const { + bool isWMMA(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsWMMA; } @@ -895,15 +976,19 @@ public: return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI); } + bool isMFMAorWMMA(uint32_t Opcode) const { + return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode); + } + static bool isSWMMAC(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC; } - bool isSWMMAC(uint16_t Opcode) const { + bool isSWMMAC(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsSWMMAC; } - bool isDOT(uint16_t Opcode) const { + bool isDOT(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } @@ -917,7 +1002,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR; } - bool isLDSDIR(uint16_t Opcode) const { + bool isLDSDIR(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::LDSDIR; } @@ -925,7 +1010,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VINTERP; } - bool isVINTERP(uint16_t Opcode) const { + bool isVINTERP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VINTERP; } @@ -941,6 +1026,14 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT; } + static bool usesASYNC_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::ASYNC_CNT; + } + + bool usesASYNC_CNT(uint32_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::ASYNC_CNT; + } + // Most sopk treat the immediate as a signed 16-bit, however some // use it as unsigned. 
static bool sopkIsZext(unsigned Opcode) { @@ -957,7 +1050,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE; } - bool isScalarStore(uint16_t Opcode) const { + bool isScalarStore(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE; } @@ -965,7 +1058,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE; } - bool isFixedSize(uint16_t Opcode) const { + bool isFixedSize(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE; } @@ -973,7 +1066,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FPClamp; } - bool hasFPClamp(uint16_t Opcode) const { + bool hasFPClamp(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FPClamp; } @@ -993,7 +1086,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding; } - bool usesFPDPRounding(uint16_t Opcode) const { + bool usesFPDPRounding(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding; } @@ -1001,7 +1094,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FPAtomic; } - bool isFPAtomic(uint16_t Opcode) const { + bool isFPAtomic(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FPAtomic; } @@ -1046,7 +1139,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead; } - bool doesNotReadTiedSource(uint16_t Opcode) const { + bool doesNotReadTiedSource(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead; } @@ -1114,13 +1207,13 @@ public: bool isVGPRCopy(const MachineInstr &MI) const { assert(isCopyInstr(MI)); Register Dest = MI.getOperand(0).getReg(); - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); return !RI.isSGPRReg(MRI, Dest); } bool hasVGPRUses(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = 
MF.getRegInfo(); return llvm::any_of(MI.explicit_uses(), [&MRI, this](const MachineOperand &MO) { @@ -1289,7 +1382,7 @@ public: /// Return the size in bytes of the operand OpNo on the given // instruction opcode. - unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { + unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const { const MCOperandInfo &OpInfo = get(Opcode).operands()[OpNo]; if (OpInfo.RegClass == -1) { @@ -1501,6 +1594,8 @@ public: bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg = Register()) const override; + bool canAddToBBProlog(const MachineInstr &MI) const; + MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, @@ -1562,10 +1657,6 @@ public: /// Return true if this opcode should not be used by codegen. bool isAsmOnlyOpcode(int MCOp) const; - const TargetRegisterClass * - getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; - void fixImplicitOperands(MachineInstr &MI) const; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, @@ -1579,22 +1670,21 @@ public: const MachineInstr &MI, unsigned *PredCost = nullptr) const override; + const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override; + InstructionUniformity - getInstructionUniformity(const MachineInstr &MI) const override final; + getInstructionUniformity(const MachineInstr &MI) const final; InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const; - const MIRFormatter *getMIRFormatter() const override { - if (!Formatter) - Formatter = std::make_unique<AMDGPUMIRFormatter>(); - return Formatter.get(); - } + const MIRFormatter *getMIRFormatter() const override; static unsigned getDSShaderTypeValue(const MachineFunction &MF); const TargetSchedModel &getSchedModel() const { return SchedModel; } + // FIXME: This should be removed // Enforce operand's \p OpName even alignment if required 
by target. // This is used if an operand is a 32 bit register but needs to be aligned // regardless. @@ -1627,7 +1717,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, /// skipping copy like instructions and subreg-manipulation pseudos. /// Following another subreg of a reg:subreg isn't supported. MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI); + const MachineRegisterInfo &MRI); /// \brief Return false if EXEC is not changed between the def of \p VReg at \p /// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not @@ -1647,86 +1737,86 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, namespace AMDGPU { LLVM_READONLY - int getVOPe64(uint16_t Opcode); + int64_t getVOPe64(uint32_t Opcode); LLVM_READONLY - int getVOPe32(uint16_t Opcode); + int64_t getVOPe32(uint32_t Opcode); LLVM_READONLY - int getSDWAOp(uint16_t Opcode); + int64_t getSDWAOp(uint32_t Opcode); LLVM_READONLY - int getDPPOp32(uint16_t Opcode); + int64_t getDPPOp32(uint32_t Opcode); LLVM_READONLY - int getDPPOp64(uint16_t Opcode); + int64_t getDPPOp64(uint32_t Opcode); LLVM_READONLY - int getBasicFromSDWAOp(uint16_t Opcode); + int64_t getBasicFromSDWAOp(uint32_t Opcode); LLVM_READONLY - int getCommuteRev(uint16_t Opcode); + int64_t getCommuteRev(uint32_t Opcode); LLVM_READONLY - int getCommuteOrig(uint16_t Opcode); + int64_t getCommuteOrig(uint32_t Opcode); LLVM_READONLY - int getAddr64Inst(uint16_t Opcode); + int64_t getAddr64Inst(uint32_t Opcode); /// Check if \p Opcode is an Addr64 opcode. /// /// \returns \p Opcode if it is an Addr64 opcode, otherwise -1. LLVM_READONLY - int getIfAddr64Inst(uint16_t Opcode); + int64_t getIfAddr64Inst(uint32_t Opcode); LLVM_READONLY - int getSOPKOp(uint16_t Opcode); + int64_t getSOPKOp(uint32_t Opcode); /// \returns SADDR form of a FLAT Global instruction given an \p Opcode /// of a VADDR form. 
LLVM_READONLY - int getGlobalSaddrOp(uint16_t Opcode); + int64_t getGlobalSaddrOp(uint32_t Opcode); /// \returns VADDR form of a FLAT Global instruction given an \p Opcode /// of a SADDR form. LLVM_READONLY - int getGlobalVaddrOp(uint16_t Opcode); + int64_t getGlobalVaddrOp(uint32_t Opcode); LLVM_READONLY - int getVCMPXNoSDstOp(uint16_t Opcode); + int64_t getVCMPXNoSDstOp(uint32_t Opcode); /// \returns ST form with only immediate offset of a FLAT Scratch instruction /// given an \p Opcode of an SS (SADDR) form. LLVM_READONLY - int getFlatScratchInstSTfromSS(uint16_t Opcode); + int64_t getFlatScratchInstSTfromSS(uint32_t Opcode); /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode /// of an SVS (SADDR + VADDR) form. LLVM_READONLY - int getFlatScratchInstSVfromSVS(uint16_t Opcode); + int64_t getFlatScratchInstSVfromSVS(uint32_t Opcode); /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode /// of an SV (VADDR) form. LLVM_READONLY - int getFlatScratchInstSSfromSV(uint16_t Opcode); + int64_t getFlatScratchInstSSfromSV(uint32_t Opcode); /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode /// of an SS (SADDR) form. LLVM_READONLY - int getFlatScratchInstSVfromSS(uint16_t Opcode); + int64_t getFlatScratchInstSVfromSS(uint32_t Opcode); /// \returns earlyclobber version of a MAC MFMA is exists. LLVM_READONLY - int getMFMAEarlyClobberOp(uint16_t Opcode); + int64_t getMFMAEarlyClobberOp(uint32_t Opcode); /// \returns Version of an MFMA instruction which uses AGPRs for srcC and /// vdst, given an \p Opcode of an MFMA which uses VGPRs for srcC/vdst. LLVM_READONLY - int getMFMASrcCVDstAGPROp(uint16_t Opcode); + int64_t getMFMASrcCVDstAGPROp(uint32_t Opcode); /// \returns v_cmpx version of a v_cmp instruction. 
LLVM_READONLY - int getVCMPXOpFromVCMP(uint16_t Opcode); + int64_t getVCMPXOpFromVCMP(uint32_t Opcode); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b7f63ec..f063b4e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -6,13 +6,6 @@ // //===----------------------------------------------------------------------===// -def isWave32 : Predicate<"Subtarget->isWave32()">, - AssemblerPredicate <(any_of FeatureWavefrontSize32, - FeatureAssemblerPermissiveWavesize)>; -def isWave64 : Predicate<"Subtarget->isWave64()">, - AssemblerPredicate <(any_of FeatureWavefrontSize64, - FeatureAssemblerPermissiveWavesize)>; - class AMDGPUMnemonicAlias<string From, string To, string VariantName = ""> : MnemonicAlias<From, To, VariantName>, PredicateControl; @@ -34,6 +27,7 @@ def SIEncodingFamily { int GFX11 = 10; int GFX12 = 11; int GFX1250 = 12; + int GFX13 = 13; } //===----------------------------------------------------------------------===// @@ -47,6 +41,7 @@ class GFXGen<Predicate pred, string dn, string suffix, int sub> { int Subtarget = sub; } +def GFX13Gen : GFXGen<isGFX13Only, "GFX13", "_gfx13", SIEncodingFamily.GFX13>; def GFX1250Gen : GFXGen<isGFX125xOnly, "GFX1250", "_gfx1250", SIEncodingFamily.GFX1250>; def GFX12Not12_50Gen : GFXGen<isGFX12Not12_50, "GFX12", "_gfx12", SIEncodingFamily.GFX12>; def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>; @@ -57,6 +52,8 @@ def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>; // SI DAG Nodes //===----------------------------------------------------------------------===// +// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output +// modifier behavior with dx10_enable. 
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; def SDTSBufferLoad : SDTypeProfile<1, 3, @@ -331,7 +328,7 @@ def mfma_f32_32x32x64_f8f6f4 : UnscaledMFMAOptimizationPat<int_amdgcn_mfma_scale //===----------------------------------------------------------------------===// class isIntType<ValueType SrcVT> { - bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value)); + bit ret = !and(SrcVT.isInteger, !ne(SrcVT, i1)); } def SDTSBufferPrefetch : SDTypeProfile<0, 3, @@ -776,11 +773,7 @@ def xnor : PatFrag < foreach I = 1-4 in { def shl#I#_add : PatFrag < (ops node:$src0, node:$src1), - (add (shl_oneuse $src0, (i32 I)), $src1)> { - // FIXME: Poor substitute for disabling pattern in SelectionDAG - let PredicateCode = [{return false;}]; - let GISelPredicateCode = [{return true;}]; -} + (add (shl_oneuse $src0, (i32 I)), $src1)>; } multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, @@ -818,6 +811,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">; defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">; +defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">; +defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">; defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; @@ -963,6 +958,9 @@ def InlineImmFP64 : FPImmLeaf<f64, [{ return isInlineImmediate(Imm); }]>; +def fpimm_pos_zero : FPImmLeaf<fAny, [{ + return Imm.isZero() && !Imm.isNegative(); +}]>; class VGPRImm <dag frag> : PatLeaf<frag, [{ return isVGPRImm(N); @@ -991,6 +989,11 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ return fp16SrcZerosHighBits(N->getOpcode()); }]>; + +class build_vector_fpimm_pos_zero_v2<VTVec vec> : PatLeaf< + (vec (build_vector (vec.ElementType fpimm_pos_zero), + (vec.ElementType fpimm_pos_zero)))>; + def 
MFMALdScaleXForm : SDNodeXForm<timm, [{ unsigned Val = N->getZExtValue(); unsigned New = 0; @@ -1001,11 +1004,13 @@ def MFMALdScaleXForm : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); }]>; -def is_canonicalized : PatLeaf<(fAny srcvalue:$src), [{ +def fcanonicalize_canonicalized + : PatFrag<(ops node:$op), (fcanonicalize node:$op), [{ const SITargetLowering &Lowering = *static_cast<const SITargetLowering *>(getTargetLowering()); - return Lowering.isCanonicalized(*CurDAG, Op); + return Lowering.isCanonicalized(*CurDAG, Op->getOperand(0), N->getFlags()); }]> { + // FIXME: This predicate for GlobalISel is dead code. let GISelPredicateCode = [{ const SITargetLowering *TLI = static_cast<const SITargetLowering *>( MF.getSubtarget().getTargetLowering()); @@ -1084,6 +1089,8 @@ def VReg32OrOffClass : AsmOperandClass { def SendMsg : CustomOperand<i32>; +def WaitEvent : CustomOperand<i16>; + def Swizzle : CustomOperand<i16, 1>; def Endpgm : CustomOperand<i16, 1>; @@ -1197,12 +1204,12 @@ class NamedIntOperand<string prefix, bit Optional = 1, string name = NAME> !if(AlwaysPrint, "true", "false")#"); }"; } -class NamedBitOperand<string Id, string Name = NAME> +class NamedBitOperand<string Id, string Name = NAME, bit AlwaysIgnoreNegative = 0> : CustomOperand<i1, 1, Name> { let PredicateMethod = "isImmTy<AMDGPUOperand::"#ImmTy#">"; let ParserMethod = "[this](OperandVector &Operands) -> ParseStatus { "# - "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy#"); }"; + "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy# !if(AlwaysIgnoreNegative, ", true", ", false")#"); }"; let PrintMethod = "[this](const MCInst *MI, unsigned OpNo, "# "const MCSubtargetInfo &STI, raw_ostream &O) { "# "printNamedBit(MI, OpNo, O, \""#Id#"\"); }"; @@ -1260,6 +1267,8 @@ def CPol_NonGLC : ValuePredicatedOperand<CPol, "!(Op.getImm() & CPol::GLC)", 1>; def CPol_GLC_WithDefault : DefaultOperand<CPol_GLC, !shl(1, CPolBit.GLC)>; def 
CPol_NonGLC_WithDefault : DefaultOperand<CPol_NonGLC, 0>; +def IsAsync : NamedBitOperand<"isasync">; + def TFE : NamedBitOperand<"tfe">; def UNorm : NamedBitOperand<"unorm">; def DA : NamedBitOperand<"da">; @@ -1267,8 +1276,10 @@ def R128A16 : CustomOperand<i1, 1>; def A16 : NamedBitOperand<"a16">; def D16 : NamedBitOperand<"d16">; def LWE : NamedBitOperand<"lwe">; -def exp_compr : NamedBitOperand<"compr", "ExpCompr">; -def exp_vm : NamedBitOperand<"vm", "ExpVM">; +def exp_compr : NamedBitOperand<"compr", "ExpCompr", 1>; +def exp_vm : NamedBitOperand<"vm", "ExpVM", 1>; +def exp_done : NamedBitOperand<"done", "Done", 1>; +def exp_row_en : NamedBitOperand<"row_en", "RowEn", 1>; def FORMAT : CustomOperand<i8>; @@ -1796,10 +1807,10 @@ class SIMCInstr <string pseudo, int subtarget> { class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = - !if (!eq(Src0.Value, untyped.Value), 0, - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 - !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3))); // VOP3 + !if (!eq(Src0, untyped), 0, + !if (!eq(Src1, untyped), 1, // VOP1 + !if (!eq(Src2, untyped), 2, // VOP2 + 3))); // VOP3 } // Returns the register class to use for the destination of VOP[123C] @@ -1868,17 +1879,17 @@ class getVCSrcForVT<ValueType VT> { !if(VT.isFP, !if(!eq(VT.Size, 64), VCSrc_f64, - !cond(!eq(VT.Value, f16.Value) : VCSrc_f16, - !eq(VT.Value, bf16.Value) : VCSrc_bf16, - !eq(VT.Value, v2f16.Value) : VCSrc_v2f16, - !eq(VT.Value, v2bf16.Value) : VCSrc_v2bf16, + !cond(!eq(VT, f16) : VCSrc_f16, + !eq(VT, bf16) : VCSrc_bf16, + !eq(VT, v2f16) : VCSrc_v2f16, + !eq(VT, v2bf16) : VCSrc_v2bf16, 1 : VCSrc_f32) ), !if(!eq(VT.Size, 64), VCSrc_b64, - !if(!eq(VT.Value, i16.Value), + !if(!eq(VT, i16), VCSrc_b16, - !if(!eq(VT.Value, v2i16.Value), + !if(!eq(VT, v2i16), VCSrc_v2b16, VCSrc_b32 ) @@ -2003,28 +2014,28 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { // Float or packed int class isModifierType<ValueType SrcVT> { - bit ret = 
!or(!eq(SrcVT.Value, f16.Value), - !eq(SrcVT.Value, bf16.Value), - !eq(SrcVT.Value, f32.Value), - !eq(SrcVT.Value, f64.Value), - !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v2i16.Value), - !eq(SrcVT.Value, v2bf16.Value), - !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v2i32.Value), - !eq(SrcVT.Value, v4f16.Value), - !eq(SrcVT.Value, v4i16.Value), - !eq(SrcVT.Value, v4bf16.Value), - !eq(SrcVT.Value, v4f32.Value), - !eq(SrcVT.Value, v4i32.Value), - !eq(SrcVT.Value, v8f16.Value), - !eq(SrcVT.Value, v8i16.Value), - !eq(SrcVT.Value, v8bf16.Value), - !eq(SrcVT.Value, v8f32.Value), - !eq(SrcVT.Value, v8i32.Value), - !eq(SrcVT.Value, v16f16.Value), - !eq(SrcVT.Value, v16i16.Value), - !eq(SrcVT.Value, v16bf16.Value)); + bit ret = !or(!eq(SrcVT, f16), + !eq(SrcVT, bf16), + !eq(SrcVT, f32), + !eq(SrcVT, f64), + !eq(SrcVT, v2f16), + !eq(SrcVT, v2i16), + !eq(SrcVT, v2bf16), + !eq(SrcVT, v2f32), + !eq(SrcVT, v2i32), + !eq(SrcVT, v4f16), + !eq(SrcVT, v4i16), + !eq(SrcVT, v4bf16), + !eq(SrcVT, v4f32), + !eq(SrcVT, v4i32), + !eq(SrcVT, v8f16), + !eq(SrcVT, v8i16), + !eq(SrcVT, v8bf16), + !eq(SrcVT, v8f32), + !eq(SrcVT, v8i32), + !eq(SrcVT, v16f16), + !eq(SrcVT, v16i16), + !eq(SrcVT, v16bf16)); } // Return type of input modifiers operand for specified input operand. 
@@ -2057,9 +2068,9 @@ class getSrcModDPP <ValueType VT> { class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> { Operand ret = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16_Lo128VRegInputMods<IsFake16>, FPVRegInputMods), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16_Lo128VRegInputMods<IsFake16>, IntVRegInputMods)); } @@ -2068,11 +2079,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> { class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> { Operand ret = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VCSrcInputMods<IsFake16>, - !if (!eq(VT.Value, f64.Value), FP64VCSrcInputMods, + !if (!eq(VT, f64), FP64VCSrcInputMods, FP32VCSrcInputMods)), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16VCSrcInputMods<IsFake16>, Int32VCSrcInputMods)); } @@ -2084,15 +2095,15 @@ class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> { class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> { defvar T16Dst = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VRegInputMods<IsFake16>, FPVRegT16DstInputMods), - !if (!eq(VT.Value, i16.Value), IntT16VRegInputMods<IsFake16>, + !if (!eq(VT, i16), IntT16VRegInputMods<IsFake16>, IntVRegT16DstInputMods)); defvar Normal = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VRegInputMods<IsFake16>, FPVRegInputMods), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16VRegInputMods<IsFake16>, IntVRegInputMods)); Operand ret = !if(!and(!not(IsFake16), !eq(DstVT.Size, 16)), T16Dst, Normal); @@ -2102,16 +2113,16 @@ class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> { // only operands (VOPD3 vsrc1 and vsrc2). 
class getSrcModVOP3V <ValueType VT> { Operand ret = - !if (!eq(VT.Value, f64.Value), FP64VRegSrcInputMods, + !if (!eq(VT, f64), FP64VRegSrcInputMods, FP32VRegSrcInputMods); } // Return type of input modifiers operand specified input operand for SDWA class getSrcModSDWA <ValueType VT> { - Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods, - !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods, - !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods, - !if(!eq(VT.Value, bf16.Value), FP16SDWAInputMods, + Operand ret = !if(!eq(VT, f16), FP16SDWAInputMods, + !if(!eq(VT, f32), FP32SDWAInputMods, + !if(!eq(VT, i16), Int16SDWAInputMods, + !if(!eq(VT, bf16), FP16SDWAInputMods, Int32SDWAInputMods)))); } @@ -2778,14 +2789,14 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit HasFP8ByteSel = !or(HasFP8SrcByteSel, HasFP8DstByteSel); field bit HasBitOp3 = 0; - field bit HasDst = !ne(DstVT.Value, untyped.Value); + field bit HasDst = !ne(DstVT, untyped); field bit HasDst32 = HasDst; field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case field bit EmitDstSel = EmitDst; field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; - field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value); - field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value); - field bit HasSrc2 = !ne(Src2VT.Value, untyped.Value); + field bit HasSrc0 = !ne(Src0VT, untyped); + field bit HasSrc1 = !ne(Src1VT, untyped); + field bit HasSrc2 = !ne(Src2VT, untyped); field bit HasSrc0FloatMods = Src0VT.isFP; field bit HasSrc1FloatMods = Src1VT.isFP; @@ -3364,7 +3375,8 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.GFX940)], [!cast<string>(SIEncodingFamily.GFX11)], [!cast<string>(SIEncodingFamily.GFX12)], - [!cast<string>(SIEncodingFamily.GFX1250)]]; + [!cast<string>(SIEncodingFamily.GFX1250)], + [!cast<string>(SIEncodingFamily.GFX13)]]; } // Get equivalent SOPK instruction. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 27e5ee9c..cde3523 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -131,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), // 64-bit vector move instruction. This is mainly used by the // SIFoldOperands pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst), +def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)> { let isReMaterializable = 1; let isAsCheapAsAMove = 1; @@ -328,7 +328,7 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), multiclass AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> { - let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { + let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, UseNamedOperandTable = 1, Uses = [EXEC] in { def !toupper(Op) #"_PSEUDO_" #DataType : VPseudoInstSI<(outs RetReg : $sdst), (ins Reg : $src, VSrc_b32 : $strategy), @@ -348,7 +348,7 @@ class WaveReduceOp<string OpName, string TypeStr, ValueType Ty, // Input list : [Operation_name, // type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B), -// bit-width +// input-type // output register class, // input register class] defvar Operations = [ @@ -371,6 +371,15 @@ defvar Operations = [ WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>, WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>, WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>, + + WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fmin", "F64", f64, SGPR_64, VSrc_b64>, + WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fmax", "F64", f64, SGPR_64, VSrc_b64>, + WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fadd", "F64", f64, SGPR_64, 
VSrc_b64>, + WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fsub", "F64", f64, SGPR_64, VSrc_b64>, ]; foreach Op = Operations in { @@ -791,6 +800,17 @@ def : GCNPat< (SI_CALL_ISEL $src0, (i64 0)) >; +// Funnel shift right (fshr) patterns for uniform inputs. +// These patterns implement this using scalar instructions by constructing a 64-bit +// value {a, b} and performing a single right shift. +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0)) +>; + +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc. def SI_CALL : SPseudoInstSI < @@ -804,9 +824,8 @@ def SI_CALL : SPseudoInstSI < let isConvergent = 1; } -class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs), - (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), - [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { +class SI_TCRETURN_Pseudo<RegisterClass rc, list<dag> pattern = []> + : SPseudoInstSI <(outs), (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), pattern> { let Size = 4; let FixedSize = 1; let isCall = 1; @@ -820,8 +839,15 @@ class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs), } // Tail call handling pseudo -def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>; -def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>; +def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, + [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>; +def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, + [(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>; + +// Tail call for chain calling conventions. 
+// Uses unrestricted SGPR_64 instead of CCR_SGPR_64 because chain calls +// never return and don't need to preserve any SGPRs. +def SI_TCRETURN_CHAIN : SI_TCRETURN_Pseudo<SGPR_64>; // Handle selecting indirect tail calls def : GCNPat< @@ -851,13 +877,13 @@ multiclass SI_CS_CHAIN_TC< // This is essentially a tail call, but it also takes a mask to put in EXEC // right before jumping to the callee. def NAME: SPseudoInstSI <(outs), - (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>; + (ins SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>; // Same as above, but it will first try to reallocate the VGPRs, and choose an // EXEC mask and a callee depending on the success of the reallocation attempt. def _DVGPR : SPseudoInstSI <(outs), - (ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec, - SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>; + (ins SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec, + SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)>; } // End FixedSize = 0 etc } @@ -869,7 +895,7 @@ multiclass si_cs_chain_tc_pattern< dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> { def : GCNPat< (AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec), - (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec) + (tc SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec) >; } @@ -896,14 +922,15 @@ multiclass si_cs_chain_tc_dvgpr_patterns< (AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec, i32:$numvgprs, execvt:$fbexec, i64:$fbcallee), - (tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec, - SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee) + (tc SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec, + SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee) >; } } defm : si_cs_chain_tc_dvgpr_patterns<i32>; // On GFX12, dVGPR mode is wave32-only. 
+let Defs = [SCC] in { def ADJCALLSTACKUP : SPseudoInstSI< (outs), (ins i32imm:$amt0, i32imm:$amt1), [(callseq_start timm:$amt0, timm:$amt1)], @@ -913,7 +940,6 @@ def ADJCALLSTACKUP : SPseudoInstSI< let hasSideEffects = 1; let usesCustomInserter = 1; let SchedRW = [WriteSALU]; - let Defs = [SCC]; } def ADJCALLSTACKDOWN : SPseudoInstSI< @@ -924,9 +950,16 @@ def ADJCALLSTACKDOWN : SPseudoInstSI< let hasSideEffects = 1; let usesCustomInserter = 1; let SchedRW = [WriteSALU]; - let Defs = [SCC]; } +// Get the offset of the base of the stack, skipping any reserved areas. +def GET_STACK_BASE : SPseudoInstSI<(outs SGPR_32:$dst), (ins), + [(set p5:$dst, (sponentry))]> { + let Size = 16; // Worst case (s_getreg, s_cmp, s_cselect + constant). + let SchedRW = [WriteSALU]; +} +} // End Defs = [SCC] + let Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1 in { @@ -947,7 +980,11 @@ class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI < def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; +def SI_INDIRECT_SRC_V3 : SI_INDIRECT_SRC<VReg_96>; def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; +def SI_INDIRECT_SRC_V5 : SI_INDIRECT_SRC<VReg_160>; +def SI_INDIRECT_SRC_V6 : SI_INDIRECT_SRC<VReg_192>; +def SI_INDIRECT_SRC_V7 : SI_INDIRECT_SRC<VReg_224>; def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>; def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>; @@ -958,7 +995,11 @@ def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>; def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; +def SI_INDIRECT_DST_V3 : SI_INDIRECT_DST<VReg_96>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; +def SI_INDIRECT_DST_V5 : SI_INDIRECT_DST<VReg_160>; +def SI_INDIRECT_DST_V6 : SI_INDIRECT_DST<VReg_192>; +def SI_INDIRECT_DST_V7 : SI_INDIRECT_DST<VReg_224>; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; def SI_INDIRECT_DST_V9 : 
SI_INDIRECT_DST<VReg_288>; def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>; @@ -1004,6 +1045,8 @@ def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo< def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>; +def V_INDIRECT_REG_WRITE_MOVREL_B32_V6 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_192>; +def V_INDIRECT_REG_WRITE_MOVREL_B32_V7 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_224>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>; @@ -1017,6 +1060,8 @@ def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo< def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>; +def S_INDIRECT_REG_WRITE_MOVREL_B32_V6 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_192>; +def S_INDIRECT_REG_WRITE_MOVREL_B32_V7 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_224>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>; @@ -1049,6 +1094,8 @@ def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VR def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : 
V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>; +def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_192>; +def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_224>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>; @@ -1069,6 +1116,8 @@ def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>; +def V_INDIRECT_REG_READ_GPR_IDX_B32_V6 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_192>; +def V_INDIRECT_REG_READ_GPR_IDX_B32_V7 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_224>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>; @@ -1319,22 +1368,22 @@ multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_ >; def : GCNPat < - (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), + (f32 (any_f16_to_fp (and_oneuse i32:$src0, 0x7fff))), (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0) >; def : GCNPat < - (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), + (f32 (any_f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0))) >; def : GCNPat < - (f32 (f16_to_fp (or_oneuse i32:$src0, 
0x8000))), + (f32 (any_f16_to_fp (or_oneuse i32:$src0, 0x8000))), (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0) >; def : GCNPat < - (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), + (f32 (any_f16_to_fp (xor_oneuse i32:$src0, 0x8000))), (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0) >; @@ -1429,7 +1478,7 @@ def : GCNPat < // Don't allow source modifiers. If there are any source modifiers then it's // better to select fma instead of fmac. -let SubtargetPredicate = HasFmaLegacy32 in +let SubtargetPredicate = HasFmacLegacy32 in def : GCNPat < (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0), (VOP3NoMods f32:$src1), @@ -2223,8 +2272,8 @@ def : GCNPat < def : GCNPat < (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)), - (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, - 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0), + (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src, + !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0), 0, 0, 0, 0, 0) > { let SubtargetPredicate = HasPackedFP32Ops; @@ -2238,12 +2287,34 @@ def : GCNPat < >; def : GCNPat < + (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), fp16vt:$src1), + (S_AND_B32 (S_MOV_B32 (i32 0x00008000)), SReg_32:$src1) +>; + +def : GCNPat < + (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src1) +>; + +def : GCNPat < (fcopysign f32:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, (V_LSHLREV_B32_e64 (i32 16), $src1)) >; def : GCNPat < + (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), fp16vt:$src1), + (S_AND_B32 (S_MOV_B32 (i32 0x80000000)), + (S_LSHL_B32 SReg_32:$src1, (i32 16))) +>; + +def : GCNPat < + (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)), + (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) +>; + +def : GCNPat < (fcopysign f64:$src0, fp16vt:$src1), (REG_SEQUENCE SReg_64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -2258,6 +2329,18 @@ def : 
GCNPat < >; def : GCNPat < + (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), f32:$src1), + (S_AND_B32 (S_MOV_B32 (i32 0x00008000)), + (S_LSHR_B32 SReg_32:$src1, (i32 16))) +>; + +def : GCNPat < + (fcopysign (fp16vt fpimm_pos_zero), f32:$src1), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)), + (V_LSHRREV_B32_e64 (i32 16), VGPR_32:$src1)) +>; + +def : GCNPat < (fcopysign fp16vt:$src0, f64:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) @@ -2271,12 +2354,27 @@ def : GCNPat < (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16) >; +// TODO: Scalar case for 0 magnitude special case +def : GCNPat < + (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1), + (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1) +>; + +// TODO: Handle 0 magnitude special case def : GCNPat < (fcopysign f32:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)) >; +// TODO: Scalar case for 0 magnitude special case +def : GCNPat < + (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1), + (REG_SEQUENCE VGPR_32, + (V_MOV_B16_t16_e64 0, (i16 0), 0), lo16, + (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1), hi16) +>; + def : GCNPat < (fcopysign f64:$src0, fp16vt:$src1), (REG_SEQUENCE VReg_64, @@ -2292,6 +2390,13 @@ def : GCNPat < (V_LSHRREV_B32_e64 (i32 16), $src1)), lo16) >; +// TODO: Scalar case for 0 magnitude special case +def : GCNPat < + (fcopysign (fp16vt fpimm_pos_zero), f32:$src1), + (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), + 0, (EXTRACT_SUBREG VGPR_32:$src1, hi16)) +>; + def : GCNPat < (fcopysign fp16vt:$src0, f64:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), @@ -2309,6 +2414,16 @@ def : GCNPat < (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1) >; +def : GCNPat < + (UniformBinFrag<fcopysign> build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1), + (S_AND_B32 
(S_MOV_B32 (i32 0x80008000)), SReg_32:$src1) +>; + +def : GCNPat < + (fcopysign build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src1) +>; + } /********** ================== **********/ @@ -2638,12 +2753,34 @@ def : AMDGPUPat < >; def : AMDGPUPat < + (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), (f32 SReg_32:$src1)), + (S_AND_B32 (S_MOV_B32 (i32 0x80000000)), $src1) +>; + +def : AMDGPUPat < + (fcopysign (f32 fpimm_pos_zero), (f32 VGPR_32:$src1)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)), $src1) +>; + +def : AMDGPUPat < (fcopysign f32:$src0, f64:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) >; def : AMDGPUPat < + (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), SReg_64:$src1), + (S_AND_B32 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) +>; + +def : AMDGPUPat < + (fcopysign (f32 fpimm_pos_zero), VReg_64:$src1), + (V_AND_B32_e32 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) +>; + +def : AMDGPUPat < (fcopysign f64:$src0, f64:$src1), (REG_SEQUENCE SReg_64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -2663,8 +2800,6 @@ def : AMDGPUPat < let True16Predicate = NotHasTrue16BitInsts in { let SubtargetPredicate = isNotGFX9Plus in { -def : ROTRPattern <V_ALIGNBIT_B32_e64>; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; @@ -2675,14 +2810,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm: } // isNotGFX9Plus let SubtargetPredicate = isGFX9GFX10 in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - foreach pat = [(i32 
(DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in def : GCNPat<pat, @@ -2704,15 +2831,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - (EXTRACT_SUBREG $src1, lo16), - /* clamp */ 0, /* op_sel */ 0) ->; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), @@ -2731,14 +2849,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = UseRealTrue16Insts let True16Predicate = UseFakeTrue16Insts in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), @@ -2784,7 +2894,11 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { } defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; +defm : SI_INDIRECT_Pattern<v3f32, f32, "V3">; defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; +defm : SI_INDIRECT_Pattern<v5f32, f32, "V5">; +defm : SI_INDIRECT_Pattern<v6f32, f32, "V6">; +defm : SI_INDIRECT_Pattern<v7f32, f32, "V7">; defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">; defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">; @@ -2794,7 +2908,11 @@ defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; defm : SI_INDIRECT_Pattern 
<v2i32, i32, "V2">; +defm : SI_INDIRECT_Pattern<v3i32, i32, "V3">; defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; +defm : SI_INDIRECT_Pattern<v5i32, i32, "V5">; +defm : SI_INDIRECT_Pattern<v6i32, i32, "V6">; +defm : SI_INDIRECT_Pattern<v7i32, i32, "V7">; defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">; defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">; @@ -2930,15 +3048,25 @@ def : GCNPat < >; def : GCNPat < - (i64 (zext i32:$src)), + (i64 (UniformUnaryFrag<zext> i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) >; def : GCNPat < - (i64 (anyext i32:$src)), + (i64 (zext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1) +>; + +def : GCNPat < + (i64 (UniformUnaryFrag<anyext> i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) >; +def : GCNPat < + (i64 (anyext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) +>; + class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, @@ -3459,10 +3587,7 @@ def : GCNPat< // If fcanonicalize's operand is implicitly canonicalized, we only need a copy. 
let AddedComplexity = 8 in { foreach vt = [f16, v2f16, f32, v2f32, f64] in { - def : GCNPat< - (fcanonicalize (vt is_canonicalized:$src)), - (COPY vt:$src) - >; + def : GCNPat<(fcanonicalize_canonicalized vt:$src), (COPY vt:$src)>; } } @@ -3481,30 +3606,6 @@ def : GCNPat< >; } // End True16Predicate -let True16Predicate = UseRealTrue16Insts in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) ->; - -def : GCNPat< - (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), - (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) ->; -} // End True16Predicate - -let True16Predicate = UseFakeTrue16Insts in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) ->; - -def : GCNPat< - (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), - (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) ->; -} // End True16Predicate - def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) @@ -3663,8 +3764,6 @@ def : GCNPat < SRCMODS.NONE, $src2) >; -// COPY is workaround tablegen bug from multiple outputs -// from S_LSHL_B32's multiple outputs from implicit scc def. 
let AddedComplexity = 1 in { def : GCNPat < (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), @@ -3683,7 +3782,7 @@ def : GCNPat < >; def : GCNPat < - (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), + (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 fpimm_pos_zero))), (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) >; } @@ -3694,7 +3793,7 @@ def : GCNPat < >; def : GCNPat < - (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 fpimm_pos_zero))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; @@ -3879,9 +3978,6 @@ let SubtargetPredicate = isGFX9Plus in { let True16Predicate = NotHasTrue16BitInsts in def : PackB32Pat<V_PACK_B32_F16_e64>; -let True16Predicate = UseRealTrue16Insts in - def : PackB32Pat<V_PACK_B32_F16_t16_e64>; - let True16Predicate = UseFakeTrue16Insts in def : PackB32Pat<V_PACK_B32_F16_fake16_e64>; } // End SubtargetPredicate = isGFX9Plus @@ -4551,6 +4647,7 @@ def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; @@ -4737,6 +4834,23 @@ def G_AMDGPU_READANYLANE : AMDGPUGenericInstruction { let hasSideEffects = 0; } +// llvm.sponentry +def G_AMDGPU_SPONENTRY : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins); + let hasSideEffects = 0; +} + +class LoadMonitorInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let 
InOperandList = (ins ptype1:$ptr); + let hasSideEffects = 0; + let mayLoad = 1; +} + +def G_AMDGPU_FLAT_LOAD_MONITOR : LoadMonitorInstruction; +def G_AMDGPU_GLOBAL_LOAD_MONITOR : LoadMonitorInstruction; + //============================================================================// // Dummy Instructions //============================================================================// @@ -4749,3 +4863,14 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> { let hasSideEffects = 1; let SubtargetPredicate = isGFX10Plus; } + +defvar VGPR32_Ptr_Opcodes = [LOAD_STACK_GUARD]; +defvar VGPR64_Ptr_Opcodes = !listremove(PseudosWithPtrOps, VGPR32_Ptr_Opcodes); + +foreach inst = VGPR32_Ptr_Opcodes in { + def : RemapPointerOperands<inst, VGPR_32>; +} + +foreach inst = VGPR64_Ptr_Opcodes in { + def : RemapPointerOperands<inst, VReg_64_AlignTarget>; +} diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 6537b79..83cf457 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/InitializePasses.h" @@ -32,6 +33,7 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineDominatorTree *MDT; + MachineLoopInfo *MLI; const AMDGPU::LaneMaskConstants &LMC; void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST, @@ -39,9 +41,10 @@ private: void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock); public: - SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT) + SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT, + MachineLoopInfo *MLI) : ST(ST), TII(ST.getInstrInfo()), TRI(&TII->getRegisterInfo()), MDT(MDT), - LMC(AMDGPU::LaneMaskConstants::get(ST)) {} 
+ MLI(MLI), LMC(AMDGPU::LaneMaskConstants::get(ST)) {} bool run(MachineFunction &MF); }; @@ -54,7 +57,9 @@ public: bool runOnMachineFunction(MachineFunction &MF) override { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); - return SILateBranchLowering(ST, MDT).run(MF); + auto *MLIWP = getAnalysisIfAvailable<MachineLoopInfoWrapperPass>(); + MachineLoopInfo *MLI = MLIWP ? &MLIWP->getLI() : nullptr; + return SILateBranchLowering(ST, MDT, MLI).run(MF); } StringRef getPassName() const override { @@ -64,6 +69,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineLoopInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -117,7 +123,7 @@ static void generateEndPgm(MachineBasicBlock &MBB, } static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, - MachineDominatorTree *MDT) { + MachineDominatorTree *MDT, MachineLoopInfo *MLI) { MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true); // Update dominator tree @@ -129,6 +135,12 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, } DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); MDT->applyUpdates(DTUpdates); + + // Update loop info if available + if (MLI) { + if (MachineLoop *Loop = MLI->getLoopFor(&MBB)) + Loop->addBasicBlockToLoop(SplitBB, *MLI); + } } static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB, @@ -186,20 +198,20 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI, for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx) MI.removeOperand(OpIdx); - MI.setDesc(TII->get(AMDGPU::SI_TCRETURN)); + MI.setDesc(TII->get(AMDGPU::SI_TCRETURN_CHAIN)); } void SILateBranchLowering::earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock) { MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc DL 
= MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0)) .addMBB(EarlyExitBlock); auto Next = std::next(MI.getIterator()); if (Next != MBB.end() && !Next->isTerminator()) - splitBlock(MBB, *BranchMI, MDT); + splitBlock(MBB, *BranchMI, MDT, MLI); MBB.addSuccessor(EarlyExitBlock); MDT->insertEdge(&MBB, EarlyExitBlock); @@ -210,11 +222,14 @@ llvm::SILateBranchLoweringPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF); - if (!SILateBranchLowering(ST, MDT).run(MF)) + auto *MLI = MFAM.getCachedResult<MachineLoopAnalysis>(MF); + if (!SILateBranchLowering(ST, MDT, MLI).run(MF)) return PreservedAnalyses::all(); - return getMachineFunctionPassPreservedAnalyses() - .preserve<MachineDominatorTreeAnalysis>(); + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserve<MachineDominatorTreeAnalysis>(); + PA.preserve<MachineLoopAnalysis>(); + return PA; } bool SILateBranchLowering::run(MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index f0d1117..0141c36 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -193,6 +193,8 @@ class SILoadStoreOptimizer { unsigned LoSubReg = 0; unsigned HiSubReg = 0; + // True when using V_ADD_U64_e64 pattern + bool UseV64Pattern = false; }; struct MemAddress { @@ -233,10 +235,11 @@ private: void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName, Register DestReg) const; + const DebugLoc &DL, AMDGPU::OpName OpName, + Register DestReg) const; Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName) const; + const DebugLoc &DL, AMDGPU::OpName 
OpName) const; unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; @@ -278,9 +281,12 @@ private: void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; + void updateAsyncLDSAddress(MachineInstr &MI, int32_t OffsetDiff) const; Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; - std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; + bool processBaseWithConstOffset64(MachineInstr *AddDef, + const MachineOperand &Base, + MemAddress &Addr) const; void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; /// Promotes constant offset to the immediate by adjusting the base. It /// tries to use a base from the nearby instructions that allows it to have @@ -1336,11 +1342,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), AMDGPU::OpName::data1); - const TargetRegisterClass *DataRC0 = - TII->getRegClass(Write2Opc, Data0Idx, TRI); + const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx); - const TargetRegisterClass *DataRC1 = - TII->getRegClass(Write2Opc, Data1Idx, TRI); + const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx); if (unsigned SubReg = Data0->getSubReg()) { DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), @@ -1367,10 +1371,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // Paired. 
void SILoadStoreOptimizer::copyToDestRegs( CombineInfo &CI, CombineInfo &Paired, - MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName, - Register DestReg) const { + MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, + AMDGPU::OpName OpName, Register DestReg) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1387,7 +1390,7 @@ void SILoadStoreOptimizer::copyToDestRegs( BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. - .addReg(DestReg, 0, SubRegIdx0); + .addReg(DestReg, {}, SubRegIdx0); BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); @@ -1398,9 +1401,9 @@ void SILoadStoreOptimizer::copyToDestRegs( Register SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, AMDGPU::OpName OpName) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1456,11 +1459,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); Register DestReg = MRI->createVirtualRegister(SuperRC); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); - unsigned BaseRegFlags = 0; + RegState BaseRegFlags = {}; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) @@ -1471,7 +1475,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) - 
.addReg(AddrReg->getReg(), 0, BaseSubReg) + .addReg(AddrReg->getReg(), {}, BaseSubReg) .addImm(0); // clamp bit BaseSubReg = 0; } @@ -1484,7 +1488,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1541,11 +1545,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); - unsigned BaseRegFlags = 0; + RegState BaseRegFlags = {}; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) @@ -1556,7 +1561,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg) + .addReg(AddrReg->getReg(), {}, BaseSubReg) .addImm(0); // clamp bit BaseSubReg = 0; } @@ -1582,7 +1587,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1607,7 +1614,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineInstr *New = 
MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1618,7 +1625,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1639,7 +1648,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( New.addImm(MergedOffset); New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1650,7 +1659,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1680,7 +1691,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1691,7 +1702,9 @@ MachineBasicBlock::iterator 
SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1731,7 +1744,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1742,12 +1755,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); @@ -1789,7 +1803,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1807,7 +1823,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( .addImm(CI.CPol) .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, 
AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1818,12 +1834,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) @@ -2094,12 +2112,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); @@ -2149,8 +2168,35 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, const MemAddress &Addr) const { MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock::iterator MBBI = MI.getIterator(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); + + LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); + // Use V_ADD_U64_e64 when the original pattern used it (gfx1250+) + if (Addr.Base.UseV64Pattern) { + Register FullDestReg = MRI->createVirtualRegister( + 
TII->getRegClass(TII->get(AMDGPU::V_ADD_U64_e64), 0)); + + // Load the 64-bit offset into an SGPR pair if needed + Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstr *MovOffset = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), + OffsetReg) + .addImm(Addr.Offset); + MachineInstr *Add64 = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_U64_e64), FullDestReg) + .addReg(Addr.Base.LoReg) + .addReg(OffsetReg, RegState::Kill) + .addImm(0); + (void)MovOffset; + (void)Add64; + LLVM_DEBUG(dbgs() << " " << *MovOffset << "\n"; + dbgs() << " " << *Add64 << "\n\n";); + + return FullDestReg; + } + + // Original carry-chain pattern (V_ADD_CO_U32 + V_ADDC_U32) assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || Addr.Base.LoSubReg) && "Expected 32-bit Base-Register-Low!!"); @@ -2159,7 +2205,6 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, Addr.Base.HiSubReg) && "Expected 32-bit Base-Register-Hi!!"); - LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); MachineOperand OffsetHi = createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); @@ -2171,23 +2216,19 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *LoHalf = - BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) - .addReg(CarryReg, RegState::Define) - .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) - .add(OffsetLo) - .addImm(0); // clamp bit - (void)LoHalf; - LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) + .addReg(CarryReg, RegState::Define) + .addReg(Addr.Base.LoReg, {}, Addr.Base.LoSubReg) + .add(OffsetLo) + .addImm(0); // clamp bit MachineInstr *HiHalf = - BuildMI(*MBB, MBBI, DL, 
TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) - .addReg(DeadCarryReg, RegState::Define | RegState::Dead) - .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) - .add(OffsetHi) - .addReg(CarryReg, RegState::Kill) - .addImm(0); // clamp bit - (void)HiHalf; - LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .addReg(Addr.Base.HiReg, {}, Addr.Base.HiSubReg) + .add(OffsetHi) + .addReg(CarryReg, RegState::Kill) + .addImm(0); // clamp bit Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); MachineInstr *FullBase = @@ -2196,8 +2237,13 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, .addImm(AMDGPU::sub0) .addReg(DestSub1) .addImm(AMDGPU::sub1); + + (void)LoHalf; + (void)HiHalf; (void)FullBase; - LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); + LLVM_DEBUG(dbgs() << " " << *LoHalf << "\n"; + dbgs() << " " << *HiHalf << "\n"; + dbgs() << " " << *FullBase << "\n\n";); return FullDestReg; } @@ -2212,20 +2258,33 @@ void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); } -std::optional<int32_t> -SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { - if (Op.isImm()) - return Op.getImm(); +// Helper to extract a 64-bit constant offset from a V_ADD_U64_e64 instruction. +// Returns true if successful, populating Addr with base register info and +// offset. 
+bool SILoadStoreOptimizer::processBaseWithConstOffset64( + MachineInstr *AddDef, const MachineOperand &Base, MemAddress &Addr) const { + if (!Base.isReg()) + return false; + + MachineOperand *Src0 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src1); - if (!Op.isReg()) - return std::nullopt; + const MachineOperand *BaseOp = nullptr; - MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); - if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || - !Def->getOperand(1).isImm()) - return std::nullopt; + auto Offset = TII->getImmOrMaterializedImm(*Src1); - return Def->getOperand(1).getImm(); + if (Offset) { + BaseOp = Src0; + Addr.Offset = *Offset; + } else { + // Both or neither are constants - can't handle this pattern + return false; + } + + // Now extract the base register (which should be a 64-bit VGPR). + Addr.Base.LoReg = BaseOp->getReg(); + Addr.Base.UseV64Pattern = true; + return true; } // Analyze Base and extracts: @@ -2238,14 +2297,27 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec // %Base:vreg_64 = // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 +// +// Also handles V_ADD_U64_e64 pattern (gfx1250+): +// %OFFSET:sreg_64 = S_MOV_B64_IMM_PSEUDO 256 +// %Base:vreg_64 = V_ADD_U64_e64 %BASE:vreg_64, %OFFSET:sreg_64, 0 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const { if (!Base.isReg()) return; MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); - if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE - || Def->getNumOperands() != 5) + if (!Def) + return; + + // Try V_ADD_U64_e64 pattern first (simpler, used on gfx1250+) + if (Def->getOpcode() == AMDGPU::V_ADD_U64_e64) { + if (processBaseWithConstOffset64(Def, Base, Addr)) + return; + } + + // Fall through to REG_SEQUENCE + V_ADD_CO_U32 + 
V_ADDC_U32 pattern + if (Def->getOpcode() != AMDGPU::REG_SEQUENCE || Def->getNumOperands() != 5) return; MachineOperand BaseLo = Def->getOperand(1); @@ -2260,14 +2332,14 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) return; - const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); - const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); + MachineOperand *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); - auto Offset0P = extractConstOffset(*Src0); + auto Offset0P = TII->getImmOrMaterializedImm(*Src0); if (Offset0P) BaseLo = *Src1; else { - if (!(Offset0P = extractConstOffset(*Src1))) + if (!(Offset0P = TII->getImmOrMaterializedImm(*Src1))) return; BaseLo = *Src0; } @@ -2297,6 +2369,32 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); } +// Maintain the correct LDS address for async loads and stores. +// It becomes incorrect when promoteConstantOffsetToImm adds an offset only +// meant for the global address operand. For async loads the LDS address is in +// vdst. For async stores, the LDS address is in vdata. 
+void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI, + int32_t OffsetDiff) const { + if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0) + return; + + MachineOperand *LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (!LDSAddr) + LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); + assert(LDSAddr); + + Register OldReg = LDSAddr->getReg(); + Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg)); + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg) + .addReg(OldReg) + .addImm(-OffsetDiff) + .addImm(0); + + LDSAddr->setReg(NewReg); +} + bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineInstr &MI, MemInfoMap &Visited, @@ -2426,7 +2524,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // Instead of moving up, just re-compute anchor-instruction's base address. Register Base = computeBase(MI, AnchorAddr); - updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); + int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset; + updateBaseAndOffset(MI, Base, OffsetDiff); + updateAsyncLDSAddress(MI, OffsetDiff); LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); for (auto [OtherMI, OtherOffset] : InstsWCommonBase) { @@ -2437,7 +2537,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( if (TLI->isLegalFlatAddressingMode(AM, AS)) { LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")"; OtherMI->dump()); - updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset); + int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset; + updateBaseAndOffset(*OtherMI, Base, OtherOffsetDiff); + updateAsyncLDSAddress(*OtherMI, OtherOffsetDiff); LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump()); } } diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 8586d6c..9cc86e8 100644 --- 
a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -56,6 +56,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -160,6 +161,7 @@ public: AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<LiveVariablesWrapperPass>(); + AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -742,6 +744,11 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { if (PDT) PDT->applyUpdates(DTUpdates); + if (MDT && MDT->getNode(&MBB)) + MDT->eraseNode(&MBB); + if (PDT && PDT->getNode(&MBB)) + PDT->eraseNode(&MBB); + MBB.clear(); MBB.eraseFromParent(); if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { @@ -880,5 +887,6 @@ SILowerControlFlowPass::run(MachineFunction &MF, PA.preserve<SlotIndexesAnalysis>(); PA.preserve<LiveIntervalsAnalysis>(); PA.preserve<LiveVariablesAnalysis>(); + PA.preserve<MachineBlockFrequencyAnalysis>(); return PA; } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 96131bd..0b8c71a 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { assert(Changed || ConstrainRegs.empty()); for (Register Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); + MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass()); ConstrainRegs.clear(); return Changed; @@ -417,7 +417,7 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() { // Copy into a 32-bit vector 
register. LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg)); assert(!MI.getOperand(0).getSubReg()); @@ -616,7 +616,7 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() { if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) continue; - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); Register SrcReg = MI.getOperand(1).getReg(); assert(!MI.getOperand(1).getSubReg()); @@ -881,18 +881,14 @@ SILowerI1CopiesPass::run(MachineFunction &MF, return PreservedAnalyses::all(); // TODO: Probably preserves most. - PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); - return PA; + return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); } class SILowerI1CopiesLegacy : public MachineFunctionPass { public: static char ID; - SILowerI1CopiesLegacy() : MachineFunctionPass(ID) { - initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry()); - } + SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 40eeeb8..cbd08f0 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -117,27 +117,26 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MachineFunction &MF = *SaveBlock.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *RI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); - if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) { for (const 
CalleeSavedInfo &CS : CSI) { // Insert the spill to the stack frame. MCRegister Reg = CS.getReg(); MachineInstrSpan MIS(I, &SaveBlock); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + const TargetRegisterClass *RC = RI->getMinimalPhysRegClass( Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); // If this value was already livein, we probably have a direct use of the // incoming register value, so don't kill at the spill point. This happens // since we pass some special inputs (workgroup IDs) in the callee saved // range. - const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI); + const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI); TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(), - RC, TRI, Register()); + RC, Register()); if (Indexes) { assert(std::distance(MIS.begin(), I) == 1); diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp index ef384c2..4aa4186 100644 --- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp @@ -53,9 +53,7 @@ class SILowerWWMCopiesLegacy : public MachineFunctionPass { public: static char ID; - SILowerWWMCopiesLegacy() : MachineFunctionPass(ID) { - initializeSILowerWWMCopiesLegacyPass(*PassRegistry::getPassRegistry()); - } + SILowerWWMCopiesLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index b398db4..af3226d 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -37,7 +37,7 @@ static cl::opt<bool, true> MFMAVGPRFormOpt( "amdgpu-mfma-vgpr-form", cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. 
If " "unspecified, default to compiler heuristics"), - cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false), + cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(true), cl::Hidden); const GCNTargetMachine &getTM(const GCNSubtarget *STI) { @@ -114,7 +114,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; - if (!ST.enableFlatScratch()) { + if (!ST.hasFlatScratchEnabled()) { // Non-entry functions have no special inputs for now, other registers // required for scratch access. ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; @@ -169,7 +169,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, if (WorkItemIDZ) WorkItemIDY = true; - if (!ST.flatScratchIsArchitected()) { + if (!ST.hasArchitectedFlatScratch()) { PrivateSegmentWaveByteOffset = true; // HS and GS always have the scratch wave offset in SGPR5 on GFX9. @@ -692,11 +692,10 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, if (Arg.isMasked()) SA.Mask = Arg.getMask(); - A = SA; + A = std::move(SA); return true; }; - // TODO: Need to serialize kernarg preloads. bool Any = false; Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer); Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr); @@ -718,6 +717,21 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY); Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ); + // Write FirstKernArgPreloadReg separately, since it's a Register, + // not ArgDescriptor. 
+ if (ArgInfo.FirstKernArgPreloadReg) { + Register Reg = ArgInfo.FirstKernArgPreloadReg; + assert(Reg.isPhysical() && + "FirstKernArgPreloadReg must be a physical register"); + + yaml::SIArgument SA = yaml::SIArgument::createArgument(true); + raw_string_ostream OS(SA.RegisterName.Value); + OS << printReg(Reg, &TRI); + + AI.FirstKernArgPreloadReg = SA; + Any = true; + } + if (Any) return AI; @@ -730,9 +744,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( : ExplicitKernArgSize(MFI.getExplicitKernArgSize()), MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()), GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()), - IsEntryFunction(MFI.isEntryFunction()), - NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), - MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), + IsEntryFunction(MFI.isEntryFunction()), MemoryBound(MFI.isMemoryBound()), + WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), HasSpilledVGPRs(MFI.hasSpilledVGPRs()), NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()), @@ -750,7 +763,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()), IsWholeWaveFunction(MFI.isWholeWaveFunction()), DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()), - ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) { + ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()), + NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); @@ -788,7 +802,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; Occupancy = YamlMFI.Occupancy; IsEntryFunction = YamlMFI.IsEntryFunction; - NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath; MemoryBound = YamlMFI.MemoryBound; WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; @@ -799,6 
+812,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; + UserSGPRInfo.allocKernargPreloadSGPRs(YamlMFI.NumKernargPreloadSGPRs); + if (YamlMFI.ScavengeFI) { auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo()); if (!FIOrErr) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 2c1a13c..617862d 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -170,6 +170,7 @@ struct SIArgumentInfo { std::optional<SIArgument> DispatchID; std::optional<SIArgument> FlatScratchInit; std::optional<SIArgument> PrivateSegmentSize; + std::optional<SIArgument> FirstKernArgPreloadReg; std::optional<SIArgument> WorkGroupIDX; std::optional<SIArgument> WorkGroupIDY; @@ -195,6 +196,7 @@ template <> struct MappingTraits<SIArgumentInfo> { YamlIO.mapOptional("dispatchID", AI.DispatchID); YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit); YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize); + YamlIO.mapOptional("firstKernArgPreloadReg", AI.FirstKernArgPreloadReg); YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX); YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY); @@ -265,7 +267,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { Align DynLDSAlign; bool IsEntryFunction = false; bool IsChainFunction = false; - bool NoSignedZerosFPMath = false; bool MemoryBound = false; bool WaveLimiter = false; bool HasSpilledSGPRs = false; @@ -305,13 +306,15 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { unsigned DynamicVGPRBlockSize = 0; unsigned ScratchReservedForDynamicVGPRs = 0; + unsigned NumKernargPreloadSGPRs = 0; + SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, const TargetRegisterInfo &TRI, const llvm::MachineFunction &MF); void mappingImpl(yaml::IO &YamlIO) 
override; - ~SIMachineFunctionInfo() = default; + ~SIMachineFunctionInfo() override = default; }; template <> struct MappingTraits<SIMachineFunctionInfo> { @@ -324,7 +327,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align()); YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false); YamlIO.mapOptional("isChainFunction", MFI.IsChainFunction, false); - YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false); YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false); YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); @@ -361,6 +363,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false); YamlIO.mapOptional("scratchReservedForDynamicVGPRs", MFI.ScratchReservedForDynamicVGPRs, 0); + YamlIO.mapOptional("numKernargPreloadSGPRs", MFI.NumKernargPreloadSGPRs, 0); YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false); } }; @@ -1014,7 +1017,9 @@ public: void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { - return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); + if (ArgInfo.PrivateSegmentWaveByteOffset) + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); + return MCRegister(); } /// Returns the physical register reserved for use as the resource diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index fd28abe..fb0c7e6 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -323,8 +323,8 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Do not Track Physical Registers, because it messes up. 
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - if (RegMaskPair.RegUnit.isVirtual()) - LiveInRegs.insert(RegMaskPair.RegUnit); + if (RegMaskPair.VRegOrUnit.isVirtualReg()) + LiveInRegs.insert(RegMaskPair.VRegOrUnit.asVirtualReg()); } LiveOutRegs.clear(); // There is several possibilities to distinguish: @@ -350,12 +350,13 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7 // The use of findDefBetween removes the case 4. for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { - Register Reg = RegMaskPair.RegUnit; - if (Reg.isVirtual() && - isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), + VirtRegOrUnit VRegOrUnit = RegMaskPair.VRegOrUnit; + if (VRegOrUnit.isVirtualReg() && + isDefBetween(VRegOrUnit.asVirtualReg(), + LIS->getInstructionIndex(*BeginBlock).getRegSlot(), LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, LIS)) { - LiveOutRegs.insert(Reg); + LiveOutRegs.insert(VRegOrUnit.asVirtualReg()); } } @@ -578,11 +579,11 @@ void SIScheduleBlock::printDebug(bool full) { << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n"; dbgs() << "LiveIns:\n"; for (Register Reg : LiveInRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; dbgs() << "\nLiveOuts:\n"; for (Register Reg : LiveOutRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; } dbgs() << "\nInstructions:\n"; @@ -921,7 +922,7 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { // combination of children. 
PendingColoring[SU->NodeNum] = NextNonReservedID++; } - CurrentColoring = PendingColoring; + CurrentColoring = std::move(PendingColoring); } @@ -1446,23 +1447,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, } #endif - std::set<Register> InRegs = DAG->getInRegs(); + std::set<VirtRegOrUnit> InRegs = DAG->getInRegs(); addLiveRegs(InRegs); // Increase LiveOutRegsNumUsages for blocks // producing registers consumed in another // scheduling region. - for (Register Reg : DAG->getOutRegs()) { + for (VirtRegOrUnit VRegOrUnit : DAG->getOutRegs()) { for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { // Do reverse traversal int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; SIScheduleBlock *Block = Blocks[ID]; const std::set<Register> &OutRegs = Block->getOutRegs(); - if (OutRegs.find(Reg) == OutRegs.end()) + if (!VRegOrUnit.isVirtualReg() || + OutRegs.find(VRegOrUnit.asVirtualReg()) == OutRegs.end()) continue; - ++LiveOutRegsNumUsages[ID][Reg]; + ++LiveOutRegsNumUsages[ID][VRegOrUnit.asVirtualReg()]; break; } } @@ -1565,15 +1567,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { maxVregUsage = VregCurrentUsage; if (SregCurrentUsage > maxSregUsage) maxSregUsage = SregCurrentUsage; - LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; - for (SIScheduleBlock *Block : ReadyBlocks) - dbgs() << Block->getID() << ' '; - dbgs() << "\nCurrent Live:\n"; - for (Register Reg : LiveRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; - dbgs() << '\n'; - dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; - dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); + LLVM_DEBUG({ + dbgs() << "Picking New Blocks\n"; + dbgs() << "Available: "; + for (SIScheduleBlock *Block : ReadyBlocks) + dbgs() << Block->getID() << ' '; + dbgs() << "\nCurrent Live:\n"; + for (Register Reg : LiveRegs) + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; + dbgs() << '\n'; + dbgs() << "Current VGPRs: " << VregCurrentUsage << 
'\n'; + dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; + }); Cand.Block = nullptr; for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), @@ -1625,13 +1630,13 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { // Tracking of currently alive registers to determine VGPR Usage. -void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) { - for (Register Reg : Regs) { +void SIScheduleBlockScheduler::addLiveRegs(std::set<VirtRegOrUnit> &Regs) { + for (VirtRegOrUnit VRegOrUnit : Regs) { // For now only track virtual registers. - if (!Reg.isVirtual()) + if (!VRegOrUnit.isVirtualReg()) continue; // If not already in the live set, then add it. - (void) LiveRegs.insert(Reg); + (void)LiveRegs.insert(VRegOrUnit.asVirtualReg()); } } @@ -1662,7 +1667,7 @@ void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { decreaseLiveRegs(Block, Block->getInRegs()); - addLiveRegs(Block->getOutRegs()); + LiveRegs.insert(Block->getOutRegs().begin(), Block->getOutRegs().end()); releaseBlockSuccs(Block); for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { // We produce this register, thus it must not be previously alive. @@ -1689,7 +1694,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs, continue; if (LiveRegsConsumers[Reg] > 1) continue; - PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { DiffSetPressure[*PSetI] -= PSetI.getWeight(); } @@ -1699,7 +1704,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs, // For now only track virtual registers. 
if (!Reg.isVirtual()) continue; - PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { DiffSetPressure[*PSetI] += PSetI.getWeight(); } @@ -1846,7 +1851,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, // For now only track virtual registers if (!Reg.isVirtual()) continue; - PSetIterator PSetI = MRI.getPressureSets(Reg); + PSetIterator PSetI = MRI.getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32) VgprUsage += PSetI.getWeight(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index b219cbd..1245774 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -389,7 +389,7 @@ private: SIBlockSchedCandidate &TryCand); SIScheduleBlock *pickBlock(); - void addLiveRegs(std::set<Register> &Regs); + void addLiveRegs(std::set<VirtRegOrUnit> &Regs); void decreaseLiveRegs(SIScheduleBlock *Block, std::set<Register> &Regs); void releaseBlockSuccs(SIScheduleBlock *Parent); void blockScheduled(SIScheduleBlock *Block); @@ -462,18 +462,18 @@ public: unsigned &VgprUsage, unsigned &SgprUsage); - std::set<Register> getInRegs() { - std::set<Register> InRegs; + std::set<VirtRegOrUnit> getInRegs() { + std::set<VirtRegOrUnit> InRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - InRegs.insert(RegMaskPair.RegUnit); + InRegs.insert(RegMaskPair.VRegOrUnit); } return InRegs; } - std::set<unsigned> getOutRegs() { - std::set<unsigned> OutRegs; + std::set<VirtRegOrUnit> getOutRegs() { + std::set<VirtRegOrUnit> OutRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { - OutRegs.insert(RegMaskPair.RegUnit); + OutRegs.insert(RegMaskPair.VRegOrUnit); } return OutRegs; }; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp 
b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 484861d..0daeecd 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" @@ -277,6 +278,12 @@ public: /// rmw operation, "std::nullopt" otherwise. std::optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; + + /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store, + /// along with an indication of whether this is a load or store. If it is not + /// a direct-to-LDS operation, returns std::nullopt. + std::optional<SIMemOpInfo> + getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const; }; class SICacheControl { @@ -295,16 +302,17 @@ protected: SICacheControl(const GCNSubtarget &ST); - /// Sets named bit \p BitName to "true" if present in instruction \p MI. + /// Sets CPol \p Bits to "true" if present in instruction \p MI. /// \returns Returns true if \p MI is modified, false otherwise. - bool enableNamedBit(const MachineBasicBlock::iterator MI, - AMDGPU::CPol::CPol Bit) const; + bool enableCPolBits(const MachineBasicBlock::iterator MI, + unsigned Bits) const; /// Check if any atomic operation on AS can affect memory accessible via the /// global address space. bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const; public: + using CPol = AMDGPU::CPol::CPol; /// Create a cache control for the subtarget \p ST. static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); @@ -360,11 +368,13 @@ public: /// between memory instructions to enforce the order they become visible as /// observed by other memory instructions executing in memory scope \p Scope. 
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between - /// address spaces. Returns true iff any instructions inserted. + /// address spaces. If \p AtomicsOnly is true, only insert waits for counters + /// that are used by atomic instructions. + /// Returns true iff any instructions inserted. virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const = 0; + AtomicOrdering Order, bool AtomicsOnly) const = 0; /// Inserts any necessary instructions at position \p Pos relative to /// instruction \p MI to ensure any subsequent memory instructions of this @@ -388,31 +398,17 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; - /// Inserts any necessary instructions before the barrier start instruction - /// \p MI in order to support pairing of barriers and fences. - virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { - return false; - }; + /// Handle operations that are considered non-volatile. + /// See \ref isNonVolatileMemoryAccess + virtual bool handleNonVolatile(MachineInstr &MI) const { return false; } /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; }; -class SIGfx6CacheControl : public SICacheControl { -protected: - - /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::GLC); - } - - /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SLC); - } - +/// Generates code sequences for the memory model of all GFX targets below +/// GFX10. 
+class SIGfx6CacheControl final : public SICacheControl { public: SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} @@ -437,7 +433,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -451,30 +447,26 @@ public: Position Pos) const override; }; -class SIGfx7CacheControl : public SIGfx6CacheControl { -public: - - SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - -}; - -class SIGfx90ACacheControl : public SIGfx7CacheControl { +/// Generates code sequences for the memory model of GFX10/11. +class SIGfx10CacheControl final : public SICacheControl { public: - - SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} + SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return false; + } + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; + SIAtomicAddrSpace AddrSpace) const override { + return false; + } bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, @@ -484,124 +476,27 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const 
override; - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const override; -}; - -class SIGfx940CacheControl : public SIGfx90ACacheControl { -protected: - - /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SC0); - } - - /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SC1); - } - - /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableNTBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::NT); - } - -public: - SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; bool 
insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, - Position Pos) const override; -}; - -class SIGfx10CacheControl : public SIGfx7CacheControl { -protected: - - /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::DLC); + Position Pos) const override { + return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); } - -public: - - SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; - - bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; -}; - -class SIGfx11CacheControl : public SIGfx10CacheControl { -public: - SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; }; -class SIGfx12CacheControl 
: public SIGfx11CacheControl { +class SIGfx12CacheControl final : public SICacheControl { protected: // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. // \returns Returns true if \p MI is modified, false otherwise. bool setTH(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const; + // Sets Scope policy to \p Value if CPol operand is present in instruction \p // MI. \returns Returns true if \p MI is modified, false otherwise. bool setScope(const MachineBasicBlock::iterator MI, @@ -620,16 +515,16 @@ protected: SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { - // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases - // the behavior is the same if assuming GFX12.0 in CU mode. - assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); + SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) { + // GFX120x and GFX125x memory models greatly overlap, and in some cases + // the behavior is the same if assuming GFX120x in CU mode. 
+ assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled()); } bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; @@ -641,7 +536,7 @@ public: bool finalizeStore(MachineInstr &MI, bool Atomic) const override; - virtual bool handleCooperativeAtomic(MachineInstr &MI) const override; + bool handleCooperativeAtomic(MachineInstr &MI) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -664,6 +559,8 @@ public: SIAtomicAddrSpace AddrSpace) const override { return setAtomicScope(MI, Scope, AddrSpace); } + + bool handleNonVolatile(MachineInstr &MI) const override; }; class SIMemoryLegalizer final { @@ -701,6 +598,9 @@ private: /// instructions are added/deleted or \p MI is modified, false otherwise. bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + /// Expands LDS DMA operation \p MI. Returns true if instructions are + /// added/deleted or \p MI is modified, false otherwise. 
+ bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); public: SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}; @@ -775,7 +675,7 @@ getSynchronizeAddrSpaceMD(const MachineInstr &MI) { void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, const char *Msg) const { - const Function &Func = MI->getParent()->getParent()->getFunction(); + const Function &Func = MI->getMF()->getFunction(); Func.getContext().diagnose( DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc())); } @@ -830,6 +730,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { return SIAtomicAddrSpace::SCRATCH; if (AS == AMDGPUAS::REGION_ADDRESS) return SIAtomicAddrSpace::GDS; + if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || + AS == AMDGPUAS::BUFFER_STRIDED_POINTER) + return SIAtomicAddrSpace::GLOBAL; return SIAtomicAddrSpace::OTHER; } @@ -879,6 +782,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( } } + // FIXME: The MMO of buffer atomic instructions does not always have an atomic + // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it + // here, but the lowering should really be cleaned up at some point. 
+ if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) && + SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic) + Ordering = AtomicOrdering::Monotonic; + SIAtomicScope Scope = SIAtomicScope::NONE; SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; bool IsCrossAddressSpaceOrdering = false; @@ -985,19 +895,41 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( return constructFromMIWithMMO(MI); } +std::optional<SIMemOpInfo> +SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!SIInstrInfo::isLDSDMA(*MI)) + return std::nullopt; + + return constructFromMIWithMMO(MI); +} + +/// \returns true if \p MI has one or more MMO, and all of them are fit for +/// being marked as non-volatile. This means that either they are accessing the +/// constant address space, are accessing a known invariant memory location, or +/// that they are marked with the non-volatile metadata/MMO flag. 
+static bool isNonVolatileMemoryAccess(const MachineInstr &MI) { + if (MI.getNumMemOperands() == 0) + return false; + return all_of(MI.memoperands(), [&](const MachineMemOperand *MMO) { + return MMO->getFlags() & (MOThreadPrivate | MachineMemOperand::MOInvariant); + }); +} + SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { TII = ST.getInstrInfo(); IV = getIsaVersion(ST.getCPU()); InsertCacheInv = !AmdgcnSkipCacheInvalidations; } -bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, - AMDGPU::CPol::CPol Bit) const { +bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI, + unsigned Bits) const { MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); if (!CPol) return false; - CPol->setImm(CPol->getImm() | Bit); + CPol->setImm(CPol->getImm() | Bits); return true; } @@ -1013,18 +945,10 @@ bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const { /* static */ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); - if (ST.hasGFX940Insts()) - return std::make_unique<SIGfx940CacheControl>(ST); - if (ST.hasGFX90AInsts()) - return std::make_unique<SIGfx90ACacheControl>(ST); - if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) - return std::make_unique<SIGfx7CacheControl>(ST); - if (Generation < AMDGPUSubtarget::GFX11) - return std::make_unique<SIGfx10CacheControl>(ST); + return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX12) - return std::make_unique<SIGfx11CacheControl>(ST); + return std::make_unique<SIGfx10CacheControl>(ST); return std::make_unique<SIGfx12CacheControl>(ST); } @@ -1033,33 +957,61 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - if 
(canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: + if (!canAffectGlobalAddrSpace(AddrSpace)) { + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + return false; + } + + bool Changed = false; + switch (Scope) { + case SIAtomicScope::SYSTEM: + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + break; + } + [[fallthrough]]; + case SIAtomicScope::AGENT: + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate agent scope. + Changed |= enableCPolBits(MI, CPol::SC1); + } else { // Set L1 cache policy to MISS_EVICT. // Note: there is no L2 cache bypass policy at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); + Changed |= enableCPolBits(MI, CPol::GLC); } + break; + case SIAtomicScope::WORKGROUP: + if (ST.hasGFX940Insts()) { + // In threadgroup split mode the waves of a work-group can be executing + // on different CUs. Therefore need to bypass the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. Setting + // SC bits to indicate work-group scope will do this automatically. + Changed |= enableCPolBits(MI, CPol::SC0); + } else if (ST.hasGFX90AInsts()) { + // In threadgroup split mode the waves of a work-group can be executing + // on different CUs. Therefore need to bypass the L1 which is per CU. 
+ // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. + if (ST.isTgSplitEnabled()) + Changed |= enableCPolBits(MI, CPol::GLC); + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - return Changed; } @@ -1070,8 +1022,39 @@ bool SIGfx6CacheControl::enableStoreCacheBypass( assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// The L1 cache is write through so does not need to be bypassed. There is no - /// bypass control for the L2 cache at the isa level. + /// For targets other than GFX940, the L1 cache is write through so does not + /// need to be bypassed. There is no bypass control for the L2 cache at the + /// isa level. + + if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + break; + case SIAtomicScope::AGENT: + // Set SC bits to indicate agent scope. + Changed |= enableCPolBits(MI, CPol::SC1); + break; + case SIAtomicScope::WORKGROUP: + // Set SC bits to indicate workgroup scope. + Changed |= enableCPolBits(MI, CPol::SC0); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Leave SC bits unset to indicate wavefront scope. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + } return Changed; } @@ -1083,10 +1066,31 @@ bool SIGfx6CacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically - /// bypassed, and the GLC bit is instead used to indicate if they are - /// return or no-return. - /// Note: there is no L2 cache coherent bypass control at the ISA level. + /// For targets other than GFX940, do not set GLC for RMW atomic operations as + /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to + /// indicate if they are return or no-return. Note: there is no L2 cache + /// coherent bypass control at the ISA level. + /// For GFX90A+, RMW atomics implicitly bypass the L1 cache. + + if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC1 bit to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC1); + break; + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // RMW atomic operations implicitly bypass the L1 cache and only use SC1 + // to indicate system or agent scope. The SC0 bit is used to indicate if + // they are return or no-return. Leave SC1 bit unset to indicate agent + // scope. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } return Changed; } @@ -1097,7 +1101,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. 
The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1108,11 +1112,15 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { - // Set L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache bypass policy at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + } else if (Op == SIMemOp::LOAD) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. + Changed |= enableCPolBits(MI, CPol::GLC); + } // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not @@ -1120,16 +1128,20 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } if (IsNonTemporal) { - // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT - // for both loads and stores, and the L2 cache policy to STREAM. 
- Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + if (ST.hasGFX940Insts()) { + Changed |= enableCPolBits(MI, CPol::NT); + } else { + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. + Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC); + } return Changed; } @@ -1140,15 +1152,36 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const { + AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; + // GFX90A+ + if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) { + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to wait for global or GDS memory operations + // to complete to ensure they are visible to waves in the other CUs. + // Otherwise in non-threadgroup split mode all waves of a work-group are on + // the same CU, so no need to wait for global memory as all waves in the + // work-group access the same the L1, nor wait for GDS as access are ordered + // on a CU. + if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | + SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && + (Scope == SIAtomicScope::WORKGROUP)) { + // Same as <GFX90A at AGENT scope; + Scope = SIAtomicScope::AGENT; + } + // In threadgroup split mode LDS cannot be allocated so no need to wait for + // LDS memory operations. 
+ AddrSpace &= ~SIAtomicAddrSpace::LDS; + } + bool VMCnt = false; bool LGKMCnt = false; @@ -1243,61 +1276,13 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) +static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) { + if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return false; - - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. 
- - if (Pos == Position::AFTER) - --MI; - - return Changed; -} - -bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + return !ST.isAmdPalOS() && !ST.isMesa3DOS(); } -bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, +bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const { @@ -1307,235 +1292,97 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); - - const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() - ? AMDGPU::BUFFER_WBINVL1 - : AMDGPU::BUFFER_WBINVL1_VOL; + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. 
- - if (Pos == Position::AFTER) - --MI; - - return Changed; -} - -bool SIGfx90ACacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; + const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST) + ? AMDGPU::BUFFER_WBINVL1_VOL + : AMDGPU::BUFFER_WBINVL1; if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - // Set the L1 cache policy to MISS_LRU. - // Note: there is no L2 cache bypass policy at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to bypass the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be bypassed. - if (ST.isTgSplitEnabled()) - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx90ACacheControl::enableRMWCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && MI->mayStore()); - bool Changed = false; + if (ST.hasGFX940Insts()) { + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW + // and CC will never be stale due to the local memory probes. 
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to + // remove any cache lines of earlier writes by the same wave and ensures + // later reads by the same wave will refetch the cache lines. + Changed = true; + break; + } - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: + if (ST.hasGFX90AInsts()) { + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW + // and CC will never be stale due to the local memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed + // to remove any cache lines of earlier writes by the same wave and + // ensures later reads by the same wave will refetch the cache lines. + Changed = true; + break; + } + [[fallthrough]]; case SIAtomicScope::AGENT: - /// Do not set glc for RMW atomic operations as they implicitly bypass - /// the L1 cache, and the glc bit is instead used to indicate if they are - /// return or no-return. - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. RMW atomics implicitly bypass the L1 cache. 
- break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - return Changed; -} - -bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache bypass policy at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); - - return Changed; - } - - if (IsNonTemporal) { - // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT - // for both loads and stores, and the L2 cache policy to STREAM. 
- Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); - return Changed; - } - - return Changed; -} - -bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsCrossAddrSpaceOrdering, - Position Pos, - AtomicOrdering Order) const { - if (ST.isTgSplitEnabled()) { - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to wait for global or GDS memory operations - // to complete to ensure they are visible to waves in the other CUs. - // Otherwise in non-threadgroup split mode all waves of a work-group are on - // the same CU, so no need to wait for global memory as all waves in the - // work-group access the same the L1, nor wait for GDS as access are ordered - // on a CU. - if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | - SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && - (Scope == SIAtomicScope::WORKGROUP)) { - // Same as GFX7 using agent scope. - Scope = SIAtomicScope::AGENT; - } - // In threadgroup split mode LDS cannot be allocated so no need to wait for - // LDS memory operations. - AddrSpace &= ~SIAtomicAddrSpace::LDS; - } - return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, - IsCrossAddrSpaceOrdering, Pos, Order); -} - -bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) - return false; - - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Ensures that following loads will not see stale remote VMEM data or - // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and - // CC will never be stale due to the local memory probes. 
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); - // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to - // remove any cache lines of earlier writes by the same wave and ensures - // later reads by the same wave will refetch the cache lines. + if (ST.hasGFX940Insts()) { + // Ensures that following loads will not see stale remote date or local + // MTYPE NC global data. Local MTYPE RW and CC memory will never be + // stale due to the memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware + // does not reorder memory operations with respect to preceeding buffer + // invalidate. The invalidate is guaranteed to remove any cache lines of + // earlier writes and ensures later writes will refetch the cache lines. + } else + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); Changed = true; break; - case SIAtomicScope::AGENT: - // Same as GFX7. - break; case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to invalidate the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be invalidated. if (ST.isTgSplitEnabled()) { - // Same as GFX7 using agent scope. - Scope = SIAtomicScope::AGENT; + if (ST.hasGFX940Insts()) { + // In threadgroup split mode the waves of a work-group can be + // executing on different CUs. Therefore need to invalidate the L1 + // which is per CU. Otherwise in non-threadgroup split mode all waves + // of a work-group are on the same CU, and so the L1 does not need to + // be invalidated. + + // Ensures L1 is invalidated if in threadgroup split mode. 
In + // non-threadgroup split mode it is a NOP, but no point generating it + // in that case if know not in that mode. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate work-group scope. + .addImm(AMDGPU::CPol::SC0); + // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware + // does not reorder memory operations with respect to preceeding + // buffer invalidate. The invalidate is guaranteed to remove any cache + // lines of earlier writes and ensures later writes will refetch the + // cache lines. + Changed = true; + } else if (ST.hasGFX90AInsts()) { + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); + Changed = true; + } } break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: - // Same as GFX7. + // For GFX940, we could generate "BUFFER_INV" but it would do nothing as + // there are no caches to invalidate. All other targets have no cache to + // invalidate. break; default: llvm_unreachable("Unsupported synchronization scope"); @@ -1552,366 +1399,76 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) --MI; - Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); - return Changed; } -bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - const DebugLoc &DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed - // to initiate writeback of any dirty cache lines of earlier writes by the - // same wave. 
A "S_WAITCNT vmcnt(0)" is needed after to ensure the - // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT - // vmcnt(0)" needed by the "BUFFER_WBL2". - Changed = true; - break; - case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Same as GFX7. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - if (Pos == Position::AFTER) - --MI; - - Changed |= - SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, - IsCrossAddrSpaceOrdering, Pos); - - return Changed; -} - -bool SIGfx940CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC bits to indicate system scope. - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - // Set SC bits to indicate agent scope. - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to bypass the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be bypassed. Setting SC - // bits to indicate work-group scope will do this automatically. - Changed |= enableSC0Bit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Leave SC bits unset to indicate wavefront scope. 
- break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx940CacheControl::enableStoreCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { - assert(!MI->mayLoad() && MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC bits to indicate system scope. - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - // Set SC bits to indicate agent scope. - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::WORKGROUP: - // Set SC bits to indicate workgroup scope. - Changed |= enableSC0Bit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Leave SC bits unset to indicate wavefront scope. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx940CacheControl::enableRMWCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC1 bit to indicate system scope. 
- Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // RMW atomic operations implicitly bypass the L1 cache and only use SC1 - // to indicate system or agent scope. The SC0 bit is used to indicate if - // they are return or no-return. Leave SC1 bit unset to indicate agent - // scope. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - return Changed; -} - -bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set SC bits to indicate system scope. - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. 
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); - - return Changed; - } - - if (IsNonTemporal) { - Changed |= enableNTBit(MI); - return Changed; - } - - return Changed; -} - -bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) - return false; - +bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { bool Changed = false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + if (ST.hasGFX90AInsts()) { + MachineBasicBlock &MBB = *MI->getParent(); + const DebugLoc &DL = MI->getDebugLoc(); - if (Pos == Position::AFTER) - ++MI; + if (Pos == Position::AFTER) + ++MI; - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Ensures that following loads will not see stale remote VMEM data or - // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and - // CC will never be stale due to the local memory probes. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to - // remove any cache lines of earlier writes by the same wave and ensures - // later reads by the same wave will refetch the cache lines. - Changed = true; - break; - case SIAtomicScope::AGENT: - // Ensures that following loads will not see stale remote date or local - // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale - // due to the memory probes. 
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate agent scope. - .addImm(AMDGPU::CPol::SC1); - // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware - // does not reorder memory operations with respect to preceeding buffer - // invalidate. The invalidate is guaranteed to remove any cache lines of - // earlier writes and ensures later writes will refetch the cache lines. - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to invalidate the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be invalidated. - if (ST.isTgSplitEnabled()) { - // Ensures L1 is invalidated if in threadgroup split mode. In - // non-threadgroup split mode it is a NOP, but no point generating it in - // that case if know not in that mode. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate work-group scope. - .addImm(AMDGPU::CPol::SC0); - // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware - // does not reorder memory operations with respect to preceeding buffer - // invalidate. The invalidate is guaranteed to remove any cache lines of - // earlier writes and ensures later writes will refetch the cache lines. + if (canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by + // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate system scope. 
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); Changed = true; + break; + case SIAtomicScope::AGENT: + if (ST.hasGFX940Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + + // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is + // SIAtomicScope::AGENT, the following insertWait will generate the + // required "S_WAITCNT vmcnt(0)". + Changed = true; + } + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it + // would writeback, and would require an otherwise unnecessary + // "S_WAITCNT vmcnt(0)". + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Could generate "BUFFER_INV" but it would do nothing as there are no - // caches to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. 
- if (Pos == Position::AFTER) - --MI; - - return Changed; -} - -bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed - // to initiate writeback of any dirty cache lines of earlier writes by the - // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the - // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is - // SIAtomicScope::SYSTEM, the following insertWait will generate the - // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". - Changed = true; - break; - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate agent scope. - .addImm(AMDGPU::CPol::SC1); - - // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is - // SIAtomicScope::AGENT, the following insertWait will generate the - // required "S_WAITCNT vmcnt(0)". - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Do not generate "BUFFER_WBL2" as there are no caches it would - // writeback, and would require an otherwise unnecessary - // "S_WAITCNT vmcnt(0)". 
- break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } + if (Pos == Position::AFTER) + --MI; } - if (Pos == Position::AFTER) - --MI; - // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other // S_WAITCNT needed. Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); return Changed; } bool SIGfx10CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; @@ -1922,8 +1479,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( case SIAtomicScope::AGENT: // Set the L0 and L1 cache policies to MISS_EVICT. // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableGLCBit(MI); - Changed |= enableDLCBit(MI); + // For GFX10, set GLC+DLC, for GFX11, only set GLC. + Changed |= + enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0)); break; case SIAtomicScope::WORKGROUP: // In WGP mode the waves of a work-group can be executing on either CU of @@ -1931,7 +1489,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( // CU mode all waves of a work-group are on the same CU, and so the L0 // does not need to be bypassed. if (!ST.isCuModeEnabled()) - Changed |= enableGLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC); break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: @@ -1959,7 +1517,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. 
- assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1974,17 +1532,21 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // and MISS_LRU for store instructions. // Note: there is no L2 cache coherent bypass control at the ISA level. if (Op == SIMemOp::LOAD) { - Changed |= enableGLCBit(MI); - Changed |= enableDLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC); } + // GFX11: Set MALL NOALLOC for both load and store instructions. + if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -1994,8 +1556,12 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // For stores setting both GLC and SLC configures L0 and L1 cache policy // to MISS_EVICT and the L2 cache policy to STREAM. if (Op == SIMemOp::STORE) - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC); + Changed |= enableCPolBits(MI, CPol::SLC); + + // GFX11: Set MALL NOALLOC for both load and store instructions. 
+ if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); return Changed; } @@ -2007,11 +1573,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; @@ -2035,8 +1602,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // the WGP. Therefore need to wait for operations to complete to ensure // they are visible to waves in the other CU as the L0 is per CU. // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. + if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) VMCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2143,7 +1713,7 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; @@ -2191,117 +1761,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx10CacheControl::insertBarrierStart( - MachineBasicBlock::iterator &MI) const { - // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU - // mode. 
This is because a CU mode release fence does not emit any wait, which - // is fine when only dealing with vmem, but isn't sufficient in the presence - // of barriers which do not go through vmem. - // GFX12.5 does not require this additional wait. - if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) - return false; - - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); - return true; -} - -bool SIGfx11CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - // Set the L0 and L1 cache policies to MISS_EVICT. - // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in - // CU mode all waves of a work-group are on the same CU, and so the L0 - // does not need to be bypassed. - if (!ST.isCuModeEnabled()) - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. 
- - return Changed; -} - -bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set L0 and L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache coherent bypass control at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableDLCBit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); - return Changed; - } - - if (IsNonTemporal) { - // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT - // and L2 cache policy to STREAM. - // For stores setting both GLC and SLC configures L0 and L1 cache policy - // to MISS_EVICT and the L2 cache policy to STREAM. 
- if (Op == SIMemOp::STORE) - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableDLCBit(MI); - return Changed; - } - - return Changed; -} - bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const { MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); @@ -2354,11 +1813,12 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); bool LOADCnt = false; bool DSCnt = false; @@ -2383,15 +1843,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // In WGP mode the waves of a work-group can be executing on either CU // of the WGP. Therefore need to wait for operations to complete to // ensure they are visible to waves in the other CU as the L0 is per CU. + // // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. // // GFX12.5: // CU$ has two ports. To ensure operations are visible at the workgroup // level, we need to ensure all operations in this port have completed // so the other SIMDs in the WG can see them. There is no ordering // guarantee between the ports. 
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() || + isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2444,7 +1909,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. - if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { + if (!AtomicsOnly && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2476,7 +1941,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); /// The scratch address space does not need the global memory cache /// to be flushed as all memory operations by the same thread are @@ -2527,6 +1992,17 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) --MI; + // Target requires a waitcnt to ensure that the proceeding INV has completed + // as it may get reorded with following load instructions. 
+ if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) { + insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD, + /*IsCrossAddrSpaceOrdering=*/false, Pos, AtomicOrdering::Acquire, + /*AtomicsOnly=*/false); + + if (Pos == Position::AFTER) + --MI; + } + return true; } @@ -2538,7 +2014,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); // The scratch address space does not need the global memory cache // writeback as all memory operations by the same thread are @@ -2554,19 +2030,15 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // // Emitting it for lower scopes is a slow no-op, so we omit it // for performance. + std::optional<AMDGPU::CPol::CPol> NeedsWB; switch (Scope) { case SIAtomicScope::SYSTEM: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_SYS); - Changed = true; + NeedsWB = AMDGPU::CPol::SCOPE_SYS; break; case SIAtomicScope::AGENT: // GFX12.5 may have >1 L2 per device so we must emit a device scope WB. - if (ST.hasGFX1250Insts()) { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_DEV); - Changed = true; - } + if (ST.hasGFX1250Insts()) + NeedsWB = AMDGPU::CPol::SCOPE_DEV; break; case SIAtomicScope::CLUSTER: case SIAtomicScope::WORKGROUP: @@ -2579,6 +2051,20 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, llvm_unreachable("Unsupported synchronization scope"); } + if (NeedsWB) { + // Target requires a waitcnt to ensure that the proceeding store + // proceeding store/rmw operations have completed in L2 so their data will + // be written back by the WB instruction. 
+ if (ST.hasINVWBL2WaitCntRequirement()) + insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + /*IsCrossAddrSpaceOrdering=*/false, Pos, + AtomicOrdering::Release, + /*AtomicsOnly=*/false); + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB); + Changed = true; + } + if (Pos == Position::AFTER) --MI; } @@ -2587,17 +2073,29 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), // we of course need to wait for that as well. Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); return Changed; } +bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const { + // On GFX12.5, set the NV CPol bit. + if (!ST.hasGFX1250Insts()) + return false; + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) + return false; + CPol->setImm(CPol->getImm() | AMDGPU::CPol::NV); + return true; +} + bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write instructions. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. 
The latter are always marked as volatile so cannot sensibly @@ -2618,13 +2116,21 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + if (ST.requiresWaitXCntForSingleAccessInstructions() && + SIInstrInfo::isVMEM(*MI)) { + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); + Changed = true; + } + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); } return Changed; @@ -2635,9 +2141,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { const bool IsRMW = (MI.mayLoad() && MI.mayStore()); bool Changed = false; - // GFX12.5 only: xcnt wait is needed before flat and global atomics - // stores/rmw. - if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { + if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() && + SIInstrInfo::isVMEM(MI)) { MachineBasicBlock &MBB = *MI.getParent(); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); Changed = true; @@ -2653,7 +2158,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. 
- if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS) + if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic && + Scope == CPol::SCOPE_SYS) Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator()); return Changed; @@ -2748,13 +2254,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), SIMemOp::LOAD | SIMemOp::STORE, MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE, Order); + Position::BEFORE, Order, /*AtomicsOnly=*/false); if (Order == AtomicOrdering::Acquire || Order == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); + // The wait below only needs to wait on the prior atomic. + Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER, Order, /*AtomicsOnly=*/true); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); @@ -2830,9 +2338,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); if (Order == AtomicOrdering::Acquire) { - Changed |= CC->insertWait( - MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order); + // Acquire fences only need to wait on the previous atomic they pair with. 
+ Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace, + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE, Order, /*AtomicsOnly=*/true); } if (Order == AtomicOrdering::Release || @@ -2897,10 +2407,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Order == AtomicOrdering::SequentiallyConsistent || MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), - isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); + // Only wait on the previous atomic. + Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, + Order, /*AtomicsOnly=*/true); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); @@ -2913,6 +2425,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && MI->mayStore()); + + // The volatility or nontemporal-ness of the operation is a + // function of the global memory, not the LDS. + SIMemOp OpKind = + SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE; + + // Handle volatile and/or nontemporal markers on direct-to-LDS loads and + // stores. The operation is treated as a volatile/nontemporal store + // to its second argument. 
+ return CC->enableVolatileAndOrNonTemporal( + MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(), + MOI.isNonTemporal(), MOI.isLastUse()); +} + bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) { const MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); @@ -2956,22 +2485,21 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } - if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { - Changed |= CC->insertBarrierStart(MI); - continue; + if (MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) { + if (const auto &MOI = MOA.getLoadInfo(MI)) + Changed |= expandLoad(*MOI, MI); + else if (const auto &MOI = MOA.getStoreInfo(MI)) + Changed |= expandStore(*MOI, MI); + else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) + Changed |= expandLDSDMA(*MOI, MI); + else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) + Changed |= expandAtomicFence(*MOI, MI); + else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) + Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) - continue; - - if (const auto &MOI = MOA.getLoadInfo(MI)) - Changed |= expandLoad(*MOI, MI); - else if (const auto &MOI = MOA.getStoreInfo(MI)) { - Changed |= expandStore(*MOI, MI); - } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) - Changed |= expandAtomicFence(*MOI, MI); - else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) - Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); + if (isNonVolatileMemoryAccess(*MI)) + Changed |= CC->handleNonVolatile(*MI); } } diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp index f9efee6..9a58382 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp @@ -28,19 +28,9 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F, DX10Clamp = DX10ClampAttr == "true"; } - StringRef DenormF32Attr = 
- F.getFnAttribute("denormal-fp-math-f32").getValueAsString(); - if (!DenormF32Attr.empty()) - FP32Denormals = parseDenormalFPAttribute(DenormF32Attr); - - StringRef DenormAttr = - F.getFnAttribute("denormal-fp-math").getValueAsString(); - if (!DenormAttr.empty()) { - DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr); - if (DenormF32Attr.empty()) - FP32Denormals = DenormMode; - FP64FP16Denormals = DenormMode; - } + DenormalFPEnv FPEnv = F.getDenormalFPEnv(); + FP64FP16Denormals = FPEnv.DefaultMode; + FP32Denormals = FPEnv.F32Mode; } using namespace AMDGPU; diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index aa028c8..47bc218 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -76,9 +76,7 @@ class SIOptimizeExecMaskingLegacy : public MachineFunctionPass { public: static char ID; - SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) { - initializeSIOptimizeExecMaskingLegacyPass(*PassRegistry::getPassRegistry()); - } + SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index c186f5a..ac24f2f 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -54,10 +54,7 @@ class SIOptimizeExecMaskingPreRALegacy : public MachineFunctionPass { public: static char ID; - SIOptimizeExecMaskingPreRALegacy() : MachineFunctionPass(ID) { - initializeSIOptimizeExecMaskingPreRALegacyPass( - *PassRegistry::getPassRegistry()); - } + SIOptimizeExecMaskingPreRALegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -473,6 +470,8 @@ bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) { assert(Idx != -1); if (SingleExecUser->getParent() == 
I->getParent() && !SingleExecUser->getOperand(Idx).isImplicit() && + static_cast<unsigned>(Idx) < + SingleExecUser->getDesc().getNumOperands() && TII->isOperandLegal(*SingleExecUser, Idx, &I->getOperand(1))) { LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n'); LIS->RemoveMachineInstrFromMaps(*I); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 41402bd..610a835 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -81,6 +81,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bfac639..926c52f 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -118,7 +118,7 @@ public: MachineInstr *getParentInst() const { return Target->getParent(); } MachineRegisterInfo *getMRI() const { - return &getParentInst()->getParent()->getParent()->getRegInfo(); + return &getParentInst()->getMF()->getRegInfo(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1284,7 +1284,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, // Clone the instruction to allow revoking changes // made to MI during the processing of the operands // if the conversion fails. 
- SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI); + SDWAInst = MI.getMF()->CloneMachineInstr(&MI); MI.getParent()->insert(MI.getIterator(), SDWAInst); } else { SDWAInst = createSDWAVersion(MI); @@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; for (MachineOperand &Op : MI.explicit_uses()) { - if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) - continue; - - unsigned I = Op.getOperandNo(); + if (Op.isReg()) { + if (TRI->isVGPR(*MRI, Op.getReg())) + continue; - int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); - if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) + if (ST.hasSDWAScalar() && ConstantBusCount == 0) { + ++ConstantBusCount; + continue; + } + } else if (!Op.isImm()) continue; - if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && - TRI->isSGPRReg(*MRI, Op.getReg())) { - ++ConstantBusCount; + unsigned I = Op.getOperandNo(); + const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I); + if (!OpRC || !TRI->isVSSuperClass(OpRC)) continue; - } Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), @@ -1355,8 +1356,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, if (Op.isImm()) Copy.addImm(Op.getImm()); else if (Op.isReg()) - Copy.addReg(Op.getReg(), Op.isKill() ? 
RegState::Kill : 0, - Op.getSubReg()); + Copy.addReg(Op.getReg(), getKillRegState(Op.isKill()), Op.getSubReg()); Op.ChangeToRegister(VGPR, false); } } diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp index 5720b97..787f7b3 100644 --- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -29,9 +29,7 @@ public: static char ID; public: - SIPostRABundlerLegacy() : MachineFunctionPass(ID) { - initializeSIPostRABundlerLegacyPass(*PassRegistry::getPassRegistry()); - } + SIPostRABundlerLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -110,7 +108,7 @@ void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI, "subregister indexes should not be present after RA"); for (MCRegUnit Unit : TRI->regunits(Reg)) - UsedRegUnits.set(Unit); + UsedRegUnits.set(static_cast<unsigned>(Unit)); } } diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index ecfaa5c..b9f2993 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -153,11 +153,13 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); for (unsigned Reg : RegsToRewrite) { - LIS->removeInterval(Reg); - const Register PhysReg = VRM->getPhys(Reg); assert(PhysReg != 0); + LiveInterval &LI = LIS->getInterval(Reg); + Matrix->unassign(LI, /*ClearAllReferencingSegments=*/true); + LIS->removeInterval(Reg); + MFI->reserveWWMRegister(PhysReg); } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 01a40c1..73aab4e 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -22,10 +22,11 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SetVector.h" 
+#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/BranchProbability.h" - using namespace llvm; #define DEBUG_TYPE "si-pre-emit-peephole" @@ -47,9 +48,6 @@ private: const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - // Check if the machine instruction being processed is a supported packed - // instruction. - bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. void collectUnpackingCandidates(MachineInstr &BeginMI, @@ -68,11 +66,11 @@ private: // this transformation. void performF32Unpacking(MachineInstr &I); // Select corresponding unpacked instruction - uint16_t mapToUnpackedOpcode(MachineInstr &I); + uint32_t mapToUnpackedOpcode(MachineInstr &I); // Creates the unpacked instruction to be inserted. Adds source modifiers to // the unpacked instructions based on the source modifiers in the packed // instruction. - MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode, + MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint32_t UnpackedOpcode, bool IsHiBits); // Process operands/source modifiers from packed instructions and insert the // appropriate source modifers and operands into the unpacked instructions. 
@@ -87,9 +85,7 @@ class SIPreEmitPeepholeLegacy : public MachineFunctionPass { public: static char ID; - SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) { - initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry()); - } + SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { return SIPreEmitPeephole().run(MF); @@ -156,11 +152,12 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { MachineOperand &Op1 = A->getOperand(1); MachineOperand &Op2 = A->getOperand(2); - if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { + if ((!Op1.isReg() || Op1.getReg() != ExecReg) && Op2.isReg() && + Op2.getReg() == ExecReg) { TII->commuteInstruction(*A); Changed = true; } - if (Op1.getReg() != ExecReg) + if (!Op1.isReg() || Op1.getReg() != ExecReg) return Changed; if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0)) return Changed; @@ -299,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: @@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return true; } -// If support is extended to new operations, add tests in -// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. 
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { - if (!TII->isNeverCoissue(MI)) - return false; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("Fully covered switch"); -} - bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); @@ -528,7 +508,7 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { return false; } -uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { +uint32_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { unsigned Opcode = I.getOpcode(); // Use 64 bit encoding to allow use of VOP3 instructions. // VOP3 e64 instructions allow source modifiers @@ -541,7 +521,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { case AMDGPU::V_PK_FMA_F32: return AMDGPU::V_FMA_F32_e64; default: - return std::numeric_limits<uint16_t>::max(); + return std::numeric_limits<uint32_t>::max(); } llvm_unreachable("Fully covered switch"); } @@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates( for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; + uint32_t UnpackedOpCode = mapToUnpackedOpcode(Instr); + bool IsUnpackable = + !(UnpackedOpCode == std::numeric_limits<uint32_t>::max()); if (Instr.isMetaInstruction()) continue; if ((Instr.isTerminator()) || - (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) || + (TII->isNeverCoissue(Instr) && !IsUnpackable) || (SIInstrInfo::modifiesModeRegister(Instr) && Instr.modifiesRegister(AMDGPU::EXEC, TRI))) return; @@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) return; } - if (!isUnpackingSupportedInstr(Instr)) + if (!IsUnpackable) continue; if 
(canUnpackingClobberRegister(Instr)) @@ -657,10 +640,10 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); - uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); - assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && + uint32_t UnpackedOpcode = mapToUnpackedOpcode(I); + assert(UnpackedOpcode != std::numeric_limits<uint32_t>::max() && "Unsupported Opcode"); MachineInstrBuilder Op0LOp1L = @@ -683,12 +666,12 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { } MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, - uint16_t UnpackedOpcode, + uint32_t UnpackedOpcode, bool IsHiBits) { MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0); - const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1); + const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0); + const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1); Register DstReg = I.getOperand(0).getReg(); unsigned OpCode = I.getOpcode(); Register UnpackedDstReg = IsHiBits ? 
TRI->getSubReg(DstReg, AMDGPU::sub1) @@ -702,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1); - addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2); + addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0); + addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1); if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - const MachineOperand *SrcMO3 = + const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src2); unsigned Src2Mods = TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm(); - addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3); + addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2); } NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 @@ -722,10 +705,17 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, PreservedAnalyses llvm::SIPreEmitPeepholePass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - if (!SIPreEmitPeephole().run(MF)) - return PreservedAnalyses::all(); + auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF); + auto *MPDT = MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF); + + if (SIPreEmitPeephole().run(MF)) + return getMachineFunctionPassPreservedAnalyses(); - return getMachineFunctionPassPreservedAnalyses(); + if (MDT) + MDT->updateBlockNumbers(); + if (MPDT) + MPDT->updateBlockNumbers(); + return PreservedAnalyses::all(); } bool SIPreEmitPeephole::run(MachineFunction &MF) { @@ -787,9 +777,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // TODO: Fold this into previous block, if possible. Evaluate and handle any // side effects. 
+ + // Perform the extra MF scans only for supported archs + if (!ST.hasGFX940Insts()) + return Changed; for (MachineBasicBlock &MBB : MF) { - // Unpack packed instructions overlapped by MFMAs. This allows the compiler - // to co-issue unpacked instructions with MFMA + // Unpack packed instructions overlapped by MFMAs. This allows the + // compiler to co-issue unpacked instructions with MFMA auto SchedModel = TII->getSchedModel(); SetVector<MachineInstr *> InstrsToUnpack; for (auto &MI : make_early_inc_range(MBB.instrs())) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ebd2e7e..ee46157 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -340,10 +340,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) "getNumCoveredRegs() will not work with generated subreg masks!"); RegPressureIgnoredUnits.resize(getNumRegUnits()); - RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin()); + RegPressureIgnoredUnits.set( + static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin())); for (auto Reg : AMDGPU::VGPR_16RegClass) { if (AMDGPU::isHi16Reg(Reg, *this)) - RegPressureIgnoredUnits.set(*regunits(Reg).begin()); + RegPressureIgnoredUnits.set( + static_cast<unsigned>(*regunits(Reg).begin())); } // HACK: Until this is fully tablegen'd. @@ -864,7 +866,8 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { [[fallthrough]]; } case AMDGPU::V_ADD_U32_e64: - // FIXME: This optimization is barely profitable enableFlatScratch as-is. + // FIXME: This optimization is barely profitable hasFlatScratchEnabled + // as-is. // // Much of the benefit with the MUBUF handling is we avoid duplicating the // shift of the frame register, which isn't needed with scratch. 
@@ -872,7 +875,7 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // materializeFrameBaseRegister doesn't know the register classes of the // uses, and unconditionally uses an s_add_i32, which will end up using a // copy for the vector uses. - return !ST.enableFlatScratch(); + return !ST.hasFlatScratchEnabled(); case AMDGPU::V_ADD_CO_U32_e32: if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 && !isFIPlusImmOrVGPR(*this, *MI)) @@ -912,12 +915,12 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MachineFunction *MF = MBB->getParent(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 - : AMDGPU::V_MOV_B32_e32; + unsigned MovOpc = + ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; Register BaseReg = MRI.createVirtualRegister( - ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass - : &AMDGPU::VGPR_32RegClass); + ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass + : &AMDGPU::VGPR_32RegClass); if (Offset == 0) { BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) @@ -927,16 +930,16 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - Register FIReg = MRI.createVirtualRegister( - ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass - : &AMDGPU::VGPR_32RegClass); + Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled() + ? &AMDGPU::SReg_32_XM0RegClass + : &AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(Offset); BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) .addFrameIndex(FrameIdx); - if (ST.enableFlatScratch() ) { + if (ST.hasFlatScratchEnabled()) { // FIXME: Make sure scc isn't live in. 
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) .addReg(OffsetReg, RegState::Kill) @@ -989,9 +992,9 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, MachineRegisterInfo &MRI = MF->getRegInfo(); // FIXME: materializeFrameBaseRegister does not know the register class of - // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit - // a copy so we have a legal operand and hope the register coalescer can - // clean it up. + // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled. + // Emit a copy so we have a legal operand and hope the register coalescer + // can clean it up. if (isSGPRReg(MRI, BaseReg)) { Register BaseRegVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -1407,7 +1410,7 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, unsigned Dst = IsStore ? Reg : ValueReg; unsigned Src = IsStore ? ValueReg : Reg; bool IsVGPR = TRI->isVGPR(MRI, Reg); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { // Spiller during regalloc may restore a spilled register to its superclass. // It could result in AGPR spills restored to VGPRs or the other way around, @@ -1546,7 +1549,10 @@ void SIRegisterInfo::buildSpillLoadStore( int64_t Offset = InstOffset + MFI.getObjectOffset(Index); int64_t MaterializedOffset = Offset; - int64_t MaxOffset = Offset + Size + RemSize - EltSize; + // Maxoffset is the starting offset for the last chunk to be spilled. + // In case of non-zero remainder element, max offset will be the + // last address(offset + Size) after spilling all the EltSize chunks. + int64_t MaxOffset = Offset + Size - (RemSize ? 
0 : EltSize); int64_t ScratchOffsetRegDelta = 0; if (IsFlat && EltSize > 4) { @@ -1730,8 +1736,8 @@ void SIRegisterInfo::buildSpillLoadStore( : Register(getSubReg(ValueReg, getSubRegFromChannel(RegOffset / 4, NumRegs))); - unsigned SOffsetRegState = 0; - unsigned SrcDstRegState = getDefRegState(!IsStore); + RegState SOffsetRegState = {}; + RegState SrcDstRegState = getDefRegState(!IsStore); const bool IsLastSubReg = i + 1 == e; const bool IsFirstSubReg = i == 0; if (IsLastSubReg) { @@ -1771,7 +1777,7 @@ void SIRegisterInfo::buildSpillLoadStore( } if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) { NeedSuperRegImpOperand = true; - unsigned State = SrcDstRegState; + RegState State = SrcDstRegState; if (!IsLastSubReg || (Lane != LaneE)) State &= ~RegState::Kill; if (!IsFirstSubReg || (Lane != LaneS)) @@ -1823,10 +1829,22 @@ void SIRegisterInfo::buildSpillLoadStore( } } + Register FinalValueReg = ValueReg; + if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) { + // If we are loading a 16-bit value with SRAMECC enabled we need a temp + 32-bit VGPR to load and extract 16-bits into the final register. + ValueReg = + RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); + SubReg = ValueReg; + IsKill = false; + } + + // Create the MMO, additionally set the NonVolatile flag as scratch memory + used for spills will not be used outside the thread. MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); - MachineMemOperand *NewMMO = - MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, - commonAlignment(Alignment, RegOffset)); + MachineMemOperand *NewMMO = MF->getMachineMemOperand( + PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize, + commonAlignment(Alignment, RegOffset)); auto MIB = BuildMI(MBB, MI, DL, *Desc) @@ -1863,6 +1881,17 @@ void SIRegisterInfo::buildSpillLoadStore( MIB.addImm(0); // swz MIB.addMemOperand(NewMMO); + if (FinalValueReg != ValueReg) { + // Extract 16-bit from the loaded 32-bit value. 
+ ValueReg = getSubReg(ValueReg, AMDGPU::lo16); + MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64)) + .addReg(FinalValueReg, getDefRegState(true)) + .addImm(0) + .addReg(ValueReg, getKillRegState(true)) + .addImm(0); + ValueReg = FinalValueReg; + } + if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -1873,10 +1902,14 @@ void SIRegisterInfo::buildSpillLoadStore( MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); } - bool IsSrcDstDef = SrcDstRegState & RegState::Define; + bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define); + bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore; if (NeedSuperRegImpOperand && - (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) + (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) { MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + if (PartialReloadCopy) + MIB.addReg(ValueReg, RegState::Implicit); + } // The epilog restore of a wwm-scratch register can cause undesired // optimization during machine-cp post PrologEpilogInserter if the same @@ -1924,7 +1957,7 @@ void SIRegisterInfo::buildSpillLoadStore( void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const { - const MachineFunction *MF = MIB->getParent()->getParent(); + const MachineFunction *MF = MIB->getMF(); const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg); Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0); @@ -1953,13 +1986,15 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, SB.EltSize, Alignment); if (IsLoad) { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR - : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + unsigned Opc = ST.hasFlatScratchEnabled() + ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); } else { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR - : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + unsigned Opc = ST.hasFlatScratchEnabled() + ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); // This only ever adds one VGPR spill @@ -2039,13 +2074,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, SB.prepare(); // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. - unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); + RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); // Per VGPR helper data auto PVD = SB.getPerVGPRData(); for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { - unsigned TmpVGPRFlags = RegState::Undef; + RegState TmpVGPRFlags = RegState::Undef; // Write sub registers into the VGPR for (unsigned i = Offset * PVD.PerVGPR, @@ -2062,7 +2097,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, .addReg(SubReg, SubKillState) .addImm(i % PVD.PerVGPR) .addReg(SB.TmpVGPR, TmpVGPRFlags); - TmpVGPRFlags = 0; + TmpVGPRFlags = {}; if (Indexes) { if (i == 0) @@ -2075,7 +2110,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, // TODO: Can we detect this and skip the spill? if (SB.NumSubRegs > 1) { // The last implicit use of the SB.SuperReg carries the "Kill" flag. 
- unsigned SuperKillState = 0; + RegState SuperKillState = {}; if (i + 1 == SB.NumSubRegs) SuperKillState |= getKillRegState(SB.IsKill); WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); @@ -2185,10 +2220,10 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, RS); SB.prepare(); // Generate the spill of SGPR to SB.TmpVGPR. - unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); + RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); auto PVD = SB.getPerVGPRData(); for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { - unsigned TmpVGPRFlags = RegState::Undef; + RegState TmpVGPRFlags = RegState::Undef; // Write sub registers into the VGPR for (unsigned i = Offset * PVD.PerVGPR, e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); @@ -2204,12 +2239,12 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, .addReg(SubReg, SubKillState) .addImm(i % PVD.PerVGPR) .addReg(SB.TmpVGPR, TmpVGPRFlags); - TmpVGPRFlags = 0; + TmpVGPRFlags = {}; // There could be undef components of a spilled super register. // TODO: Can we detect this and skip the spill? if (SB.NumSubRegs > 1) { // The last implicit use of the SB.SuperReg carries the "Kill" flag. 
- unsigned SuperKillState = 0; + RegState SuperKillState = {}; if (i + 1 == SB.NumSubRegs) SuperKillState |= getKillRegState(SB.IsKill); WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); @@ -2294,7 +2329,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { - MachineFunction *MF = MI->getParent()->getParent(); + MachineFunction *MF = MI->getMF(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); @@ -2415,13 +2450,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc; if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) { - assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); + assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!"); Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16; } else { Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR - : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR - : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; } auto *MBB = MI->getParent(); @@ -2500,13 +2535,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc; if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) { - assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); - Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; + assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!"); + Opc = ST.d16PreservesUnusedBits() + ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16 + : AMDGPU::SCRATCH_LOAD_USHORT_SADDR; } else { Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR - : ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR - : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; } auto *MBB = MI->getParent(); @@ -2585,7 +2622,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Offset = 0; } - if (FrameReg && !ST.enableFlatScratch()) { + if (FrameReg && !ST.hasFlatScratchEnabled()) { // We should just do an in-place update of the result register. However, // the value there may also be used by the add, in which case we need a // temporary register. @@ -2606,7 +2643,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) { - if (ST.enableFlatScratch() && + if (ST.hasFlatScratchEnabled() && !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) { // We didn't need the shift above, so we have an SGPR for the frame // register, but may have a VGPR only operand. @@ -2624,7 +2661,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR) .addReg(MaterializedReg, - MaterializedReg != FrameReg ? RegState::Kill : 0); + getKillRegState(MaterializedReg != FrameReg)); MaterializedReg = ScavengedVGPR; } @@ -2636,8 +2673,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (NumDefs == 2) AddI32.add(MI->getOperand(1)); - unsigned MaterializedRegFlags = - MaterializedReg != FrameReg ? 
RegState::Kill : 0; + RegState MaterializedRegFlags = + getKillRegState(MaterializedReg != FrameReg); if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) { // If we know we have a VGPR already, it's more likely the other @@ -2767,7 +2804,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg()) TmpReg = DstOp.getReg(); - if (FrameReg && !ST.enableFlatScratch()) { + if (FrameReg && !ST.hasFlatScratchEnabled()) { // FIXME: In the common case where the add does not also read its result // (i.e. this isn't a reg += fi), it's not finding the dest reg as // available. @@ -2852,7 +2889,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } int64_t Offset = FrameInfo.getObjectOffset(Index); - if (ST.enableFlatScratch()) { + if (ST.hasFlatScratchEnabled()) { if (TII->isFLATScratch(*MI)) { assert( (int16_t)FIOperandNum == @@ -2954,10 +2991,36 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI, false, 0, !UseSGPR); - // TODO: for flat scratch another attempt can be made with a VGPR index - // if no SGPRs can be scavenged. - if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) + if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) { + int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode()); + if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) { + Register TmpVGPR = RS->scavengeRegisterBackwards( + AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true); + + // Materialize the frame register. + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR); + if (FrameReg) + MIB.addReg(FrameReg); + else + MIB.addImm(Offset); + + // Add the offset to the frame register. 
+ if (FrameReg && Offset) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg) + .addReg(FrameReg, RegState::Kill) + .addImm(Offset); + + BuildMI(*MBB, MI, DL, TII->get(SVOpcode)) + .add(MI->getOperand(0)) // $vdata + .addReg(TmpVGPR) // $vaddr + .addImm(0) // Offset + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol)); + MI->eraseFromParent(); + return true; + } report_fatal_error("Cannot scavenge register in FI elimination!"); + } if (!TmpSReg) { // Use frame register and restore it after. @@ -3019,7 +3082,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (!IsMUBUF && !MFI->isBottomOfStack()) { // Convert to a swizzled stack address by scaling by the wave size. // In an entry function/kernel the offset is already swizzled. - bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); + bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum)); bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); const TargetRegisterClass *RC = IsSALU && !LiveSCC @@ -3531,6 +3594,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { } const TargetRegisterClass * +SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const { + // TODO: In principle this should use AV classes for gfx908 too. This is + // limited to 90a+ to avoid regressing special case copy optimizations which + // need new handling. The core issue is that it's not possible to directly + // copy between AGPRs on gfx908, and the current optimizations around that + // expect to see copies to VGPR. + return ST.hasGFX90AInsts() ? 
getVectorSuperClassForBitWidth(BitWidth) + : getVGPRClassForBitWidth(BitWidth); +} + +const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth == 16 || BitWidth == 32) return &AMDGPU::SReg_32RegClass; @@ -3601,6 +3675,14 @@ SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { } const TargetRegisterClass * +SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const { + unsigned Size = getRegSizeInBits(*SRC); + const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size); + assert(ARC && "Invalid register class size"); + return ARC; +} + +const TargetRegisterClass * SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { unsigned Size = getRegSizeInBits(*VRC); if (Size == 32) @@ -3707,27 +3789,6 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, return RC && isAGPRClass(RC); } -bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, - const TargetRegisterClass *SrcRC, - unsigned SubReg, - const TargetRegisterClass *DstRC, - unsigned DstSubReg, - const TargetRegisterClass *NewRC, - LiveIntervals &LIS) const { - unsigned SrcSize = getRegSizeInBits(*SrcRC); - unsigned DstSize = getRegSizeInBits(*DstRC); - unsigned NewSize = getRegSizeInBits(*NewRC); - - // Do not increase size of registers beyond dword, we would need to allocate - // adjacent registers and constraint regalloc more than needed. - - // Always allow dword coalescing. 
- if (SrcSize <= 32 || DstSize <= 32) - return true; - - return NewSize <= DstSize || NewSize <= SrcSize; -} - unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; @@ -3761,10 +3822,10 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, llvm_unreachable("Unexpected register pressure set!"); } -const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { +const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const { static const int Empty[] = { -1 }; - if (RegPressureIgnoredUnits[RegUnit]) + if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)]) return Empty; return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); @@ -3888,20 +3949,6 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } -const TargetRegisterClass * -SIRegisterInfo::getRegClass(unsigned RCID) const { - switch ((int)RCID) { - case AMDGPU::SReg_1RegClassID: - return getBoolRC(); - case AMDGPU::SReg_1_XEXECRegClassID: - return getWaveMaskRegClass(); - case -1: - return nullptr; - default: - return AMDGPUGenRegisterInfo::getRegClass(RCID); - } -} - // Find reaching register definition MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, @@ -3990,28 +4037,6 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { return true; } -const TargetRegisterClass * -SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { - if (!RC || !ST.needsAlignedVGPRs()) - return RC; - - unsigned Size = getRegSizeInBits(*RC); - if (Size <= 32) - return RC; - - if (RC == &AMDGPU::VS_64RegClass) - return &AMDGPU::VS_64_Align2RegClass; - - if (isVGPRClass(RC)) - return getAlignedVGPRClassForBitWidth(Size); - if (isAGPRClass(RC)) - return getAlignedAGPRClassForBitWidth(Size); - if (isVectorSuperClass(RC)) - return 
getAlignedVectorSuperClassForBitWidth(Size); - - return RC; -} - ArrayRef<MCPhysReg> SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7b91ba7..9d1a9ea 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -107,9 +107,7 @@ public: // Stack access is very expensive. CSRs are also the high registers, and we // want to minimize the number of used registers. - unsigned getCSRFirstUseCost() const override { - return 100; - } + unsigned getCSRCost() const override { return 100; } // When building a block VGPR load, we only really transfer a subset of the // registers in the block, based on a mask. Liveness analysis is not aware of @@ -216,6 +214,10 @@ public: getVectorSuperClassForBitWidth(unsigned BitWidth) const; LLVM_READONLY + const TargetRegisterClass * + getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); /// \returns true if this class contains only SGPR registers @@ -285,6 +287,10 @@ public: const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const; + /// \returns An AGPR+VGPR super reg class with the same width as \p SRC + const TargetRegisterClass * + getEquivalentAVClass(const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const; @@ -338,14 +344,6 @@ public: ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const; - bool shouldCoalesce(MachineInstr *MI, - const TargetRegisterClass *SrcRC, - unsigned SubReg, - const TargetRegisterClass *DstRC, - unsigned DstSubReg, - const TargetRegisterClass *NewRC, - LiveIntervals &LIS) 
const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -357,7 +355,7 @@ public: const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; - const int *getRegUnitPressureSets(unsigned RegUnit) const override; + const int *getRegUnitPressureSets(MCRegUnit RegUnit) const override; MCRegister getReturnAddressReg(const MachineFunction &MF) const; @@ -391,8 +389,6 @@ public: MCRegister getExec() const; - const TargetRegisterClass *getRegClass(unsigned RCID) const; - // Find reaching register definition MachineInstr *findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, @@ -433,11 +429,6 @@ public: // the subtarget. bool isProperlyAlignedRC(const TargetRegisterClass &RC) const; - // Given \p RC returns corresponding aligned register class if required - // by the subtarget. - const TargetRegisterClass * - getProperlyAlignedRC(const TargetRegisterClass *RC) const; - /// Return all SGPR128 which satisfy the waves per execution unit requirement /// of the subtarget. ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const; @@ -495,6 +486,17 @@ public: SmallVector<StringLiteral> getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override; + + float + getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override { + // Prioritize VGPR_32_Lo256 over other classes which may occupy registers + // beyond v256. + return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) * + ((RC == &AMDGPU::VGPR_32_Lo256RegClass || + RC == &AMDGPU::VReg_64_Lo256_Align2RegClass) + ? 
2.0 + : 1.0); + } }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index fc8f46a..493e267 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -614,9 +614,9 @@ def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 127), (sequence "VGPR%u_HI16", 0, 127)))> { + let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; - let isAllocatable = 0; // This is the base class for VGPR{0..127}_{LO16,HI16}. let BaseClassOrder = 16; @@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1 // Identical to VGPR_32 except it only contains the low 256 (Lo256) registers. def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v let Size = 64; } -def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_64_XEXEC, SReg_32_XEXEC)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, @@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; } +def SReg_1_XEXEC : SIRegisterClassLike<0, false, 
false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0? + SReg_32_XM0_XEXEC] +>; + +def SReg_1 : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64, + SReg_64, + SReg_64, + SReg_32, + SReg_32] +>; + //===----------------------------------------------------------------------===// // // AlignTarget classes. Artifical classes to swap between @@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 // //===----------------------------------------------------------------------===// +// We have 3 orthogonal properties to consider. Unfortunately we need +// to define the cross product of these states, minus unused +// combinations. 
+ def AV_LdSt_32_Target : RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, + VGPR_32, + AV_32, + VGPR_32, + VGPR_32]>, + SIRegisterClassLike<32, true, true> { let DecoderMethod = "decodeAVLdSt"; } foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; @@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [DefaultMode_Wave64, /*unused combination*/ AVAlign2LoadStoreMode, /*Unused combination*/ /*Unused combination*/], [!cast<RegisterClass>("AReg_"#RegSize), + /*unused combination*/ !cast<RegisterClass>("AReg_"#RegSize#_Align2) + /*Unused combination*/ /*Unused combination*/]> { let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; } def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave32, + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], 
[!cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; } def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("VReg_"#RegSize)]> { let DecoderMethod = "decodeAVLdSt"; } @@ -1276,11 +1323,22 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 
352, 384, 512, 10 def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VS_64, VS_64_Align2, VS_64_Align2]> { + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> { let DecoderMethod = "decodeSrcRegOrImm9"; } + +// Special case for DS_GWS instructions. The register input is really +// 32-bit, but it needs to be even aligned on targets with a VGPR +// alignment requirement. +def AV_LdSt_32_Align2 : SIRegisterClassLike</*Bitwidth=*/32, /*VGPR=*/true, /*AGPR=*/true>, + RegClassByHwMode< + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, VGPR_32, AV_64_Align2, VReg_64_Align2, VReg_64_Align2]> { + let DecoderMethod = "decodeAVLdSt<32>"; +} + class RegImmMatcher<string name> : AsmOperandClass { let Name = name; let RenderMethod = "addRegOrImmOperands"; @@ -1314,12 +1372,12 @@ class SrcRegOrImm9_t16<string operandType, RegisterClass regClass = VS_16> let EncoderMethod = "getMachineOpValueT16"; } -def SSrc_b16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT16">; -def SSrc_bf16: SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_BF16">; -def SSrc_f16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP16">; -def SSrc_b32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT32">; -def SSrc_f32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP32">; -def SSrc_b64 : SrcRegOrImm9 <SReg_64_Encodable, "OPERAND_REG_IMM_INT64">; +def SSrc_b16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT16">; +def SSrc_bf16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_BF16">; +def SSrc_f16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP16">; +def SSrc_b32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT32">; +def SSrc_f32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP32">; +def SSrc_b64 : SrcRegOrImm9 <SReg_64_Encodable, 
"OPERAND_REG_IMM_INT64">; def SSrcOrLds_b32 : SrcRegOrImm9 <SRegOrLds_32, "OPERAND_REG_IMM_INT32">; @@ -1335,35 +1393,35 @@ def SCSrc_b64 : SrcRegOrImm9 <SReg_64, "OPERAND_REG_INLINE_C_INT64">; //===----------------------------------------------------------------------===// // The current and temporary future default used case for VOP3. -def VSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT16">; -def VSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_BF16">; -def VSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP16">; +def VSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT16">; +def VSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_BF16">; +def VSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP16">; // True16 VOP3 operands. -def VSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16">; +def VSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16">; def VSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16">; -def VSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16">; +def VSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16">; // True16 VOP1/2/C operands. let DecoderMethodName = "decodeOperand_VSrcT16_Lo128", EncoderMethod = "getMachineOpValueT16Lo128" in { - def VSrcT_b16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16", VS_16_Lo128>; - def VSrcT_bf16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16", VS_16_Lo128>; - def VSrcT_f16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16", VS_16_Lo128>; + def VSrcT_b16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16", VS_16_Lo128>; + def VSrcT_bf16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16", VS_16_Lo128>; + def VSrcT_f16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16", VS_16_Lo128>; } // End DecoderMethodName = "decodeOperand_VSrcT16_Lo128", EncoderMethod = "getMachineOpValueT16Lo128" // The current and temporary future default used case for fake VOP1/2/C. // For VOP1,2,C True16 instructions. _Lo128 use first 128 32-bit VGPRs only. 
-def VSrcFake16_b16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_INT16">; +def VSrcFake16_b16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_INT16">; def VSrcFake16_bf16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_BF16">; -def VSrcFake16_f16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_FP16">; +def VSrcFake16_f16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_FP16">; -def VSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT32">; -def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">; -def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">; +def VSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT32">; +def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">; +def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">; def VSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2BF16">; -def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">; -def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">; -def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> { +def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">; +def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">; +def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> { let DecoderMethod = "decodeOperand_VSrc_f64"; } def VSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2INT32">; @@ -1371,6 +1429,8 @@ def VSrc_v2f32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2FP32">; def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; +def VSrc_v2f16_splat : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16_SPLAT">; + //===----------------------------------------------------------------------===// // VRegSrc_* Operands with a VGPR //===----------------------------------------------------------------------===// @@ -1381,15 +1441,15 @@ class SrcReg9<RegisterClassLike regClass> : RegisterOperand<regClass> { let DecoderMethod = 
"decodeSrcReg9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } -def VRegSrc_32 : SrcReg9<VGPR_32>; -def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>; -def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>; -def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>; -def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>; -def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>; -def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>; -def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>; -def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>; +def VRegSrc_32 : SrcReg9<VGPR_32>; +def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>; +def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>; +def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>; +def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>; +def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>; +def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>; +def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>; +def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>; // True 16 Operands @@ -1454,44 +1514,44 @@ def ARegSrc_32 : AVOperand<AGPR_32, "decodeSrcA9">; // VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// -def VCSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT16">; -def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">; -def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">; -def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">; -def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">; -def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">; -def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; -def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; -def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; -def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; -def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, 
"OPERAND_REG_INLINE_C_INT32">; -def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">; -def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT16">; +def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">; +def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">; +def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">; +def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">; +def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; +def VCSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; +def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; +def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">; +def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">; // True 16 Operands -def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; -def VCSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_BF16">; -def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">; +def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; +def VCSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_BF16">; +def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">; //===----------------------------------------------------------------------===// // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// -def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_64_b32 : 
SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_64_b32 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def 
VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; //===----------------------------------------------------------------------===// // AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR @@ -1500,13 +1560,13 @@ def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_ class AVSrcOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeSrcAV10">; -def AVSrc_32 : AVSrcOperand<AV_32>; -def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>; +def AVSrc_32 : AVSrcOperand<AV_32>; +def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>; def AVSrc_128 : AVSrcOperand<AV_128_AlignTarget>; def AVSrc_192 : AVSrcOperand<AV_192_AlignTarget>; def AVSrc_256 : AVSrcOperand<AV_256_AlignTarget>; -def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>; +def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>; def AVSrc_128_Align2 : AVSrcOperand<AV_128_Align2>; def AVSrc_192_Align2 : AVSrcOperand<AV_192_Align2>; def AVSrc_256_Align2 : AVSrcOperand<AV_256_Align2>; @@ -1528,11 +1588,22 @@ class AVLdStOperand<RegisterClassLike regClass> def AVLdSt_32 : AVLdStOperand<AV_LdSt_32_Target>; foreach size = ["64", "96", "128", "160", "256", "1024" ] in { - def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>; + def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>; def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align1)>; def 
AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>; } +def AV_LdSt_32_Align2_RegMatcher : AsmOperandClass { + let Name = "AV_LdSt_32_Align2_RegOp"; + let RenderMethod = "addRegOperands"; +} + +def AV_LdSt_32_Align2_RegOp : RegisterOperand<AV_LdSt_32_Align2> { + let ParserMatchClass = AV_LdSt_32_Align2_RegMatcher; + let PrintMethod = "printAVLdSt32Align2RegOp"; + let EncoderMethod = "getAVOperandEncoding"; +} + //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// @@ -1542,14 +1613,14 @@ class SrcRegOrImmA9<RegisterClassLike regClass, string operandType> let DecoderMethod = "decodeSrcRegOrImmA9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } -def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, 
"OPERAND_REG_INLINE_AC_FP32">; +def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; //===----------------------------------------------------------------------===// // Tablegen programming utilities diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 179ecba..14ed778 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -27,6 +27,8 @@ using namespace llvm; namespace { +enum ChangeKind { None, UpdateHint, UpdateInst }; + class SIShrinkInstructions { MachineFunction *MF; MachineRegisterInfo *MRI; @@ -41,10 +43,10 @@ class SIShrinkInstructions { bool isKUImmOperand(const MachineOperand &Src) const; bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; - void shrinkScalarCompare(MachineInstr &MI) const; - void shrinkMIMG(MachineInstr &MI) const; - void shrinkMadFma(MachineInstr &MI) const; - bool shrinkScalarLogicOp(MachineInstr &MI) const; + bool shrinkScalarCompare(MachineInstr &MI) const; + bool shrinkMIMG(MachineInstr &MI) const; + bool shrinkMadFma(MachineInstr &MI) const; + ChangeKind shrinkScalarLogicOp(MachineInstr &MI) const; bool tryReplaceDeadSDST(MachineInstr &MI) const; bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg, unsigned SubReg) const; @@ -241,27 +243,30 @@ void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI, } } -void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { +bool SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if (!ST->hasSCmpK()) - return; + return false; // cmpk instructions do scc = dst <cc op> imm16, so 
commute the instruction to // get constants on the RHS. - if (!MI.getOperand(0).isReg()) - TII->commuteInstruction(MI, false, 0, 1); + bool Changed = false; + if (!MI.getOperand(0).isReg()) { + if (TII->commuteInstruction(MI, false, 0, 1)) + Changed = true; + } // cmpk requires src0 to be a register const MachineOperand &Src0 = MI.getOperand(0); if (!Src0.isReg()) - return; + return Changed; MachineOperand &Src1 = MI.getOperand(1); if (!Src1.isImm()) - return; + return Changed; int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode()); if (SOPKOpc == -1) - return; + return Changed; // eq/ne is special because the imm16 can be treated as signed or unsigned, // and initially selected to the unsigned versions. @@ -275,9 +280,10 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { } MI.setDesc(TII->get(SOPKOpc)); + Changed = true; } - return; + return Changed; } const MCInstrDesc &NewDesc = TII->get(SOPKOpc); @@ -287,14 +293,16 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if (!SIInstrInfo::sopkIsZext(SOPKOpc)) Src1.setImm(SignExtend64(Src1.getImm(), 32)); MI.setDesc(NewDesc); + Changed = true; } + return Changed; } // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 
-void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { +bool SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); if (!Info) - return; + return false; uint8_t NewEncoding; switch (Info->MIMGEncoding) { @@ -305,7 +313,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { NewEncoding = AMDGPU::MIMGEncGfx11Default; break; default: - return; + return false; } int VAddr0Idx = @@ -359,7 +367,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { } else if (Vgpr == NextVgpr) { NextVgpr = Vgpr + Dwords; } else { - return; + return false; } if (!Op.isUndef()) @@ -369,7 +377,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { } if (VgprBase + NewAddrDwords > 256) - return; + return false; // Further check for implicit tied operands - this may be present if TFE is // enabled @@ -408,21 +416,22 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), ToUntie - (EndVAddr - 1)); } + return true; } // Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK. -void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { +bool SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so // there is no reason to try to shrink them. if (!ST->hasVOP3Literal()) - return; + return false; // There is no advantage to doing this pre-RA. 
if (!IsPostRA) - return; + return false; if (TII->hasAnyModifiersSet(MI)) - return; + return false; const unsigned Opcode = MI.getOpcode(); MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); @@ -439,7 +448,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg())) Swap = true; else - return; + return false; switch (Opcode) { default: @@ -477,7 +486,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { else if (Src0.isImm() && !TII->isInlineConstant(Src0)) Swap = true; else - return; + return false; switch (Opcode) { default: @@ -509,10 +518,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) - return; + return false; if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI)) - return; + return false; if (Swap) { // Swap Src0 and Src1 by building a new instruction. @@ -527,14 +536,17 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { TII->removeModOperands(MI); MI.setDesc(TII->get(NewOpcode)); } + return true; } /// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals. /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. /// If the inverse of the immediate is legal, use ANDN2, ORN2 or /// XNOR (as a ^ b == ~(a ^ ~b)). -/// \returns true if the caller should continue the machine function iterator -bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { +/// \return ChangeKind::None if no changes were made. +/// ChangeKind::UpdateHint if regalloc hints were updated. +/// ChangeKind::UpdateInst if the instruction was modified. 
+ChangeKind SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); @@ -544,13 +556,14 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { if (!SrcImm->isImm() || AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm())) - return false; + return ChangeKind::None; uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); uint32_t NewImm = 0; if (Opc == AMDGPU::S_AND_B32) { - if (isPowerOf2_32(~Imm)) { + if (isPowerOf2_32(~Imm) && + MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) { NewImm = llvm::countr_one(Imm); Opc = AMDGPU::S_BITSET0_B32; } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { @@ -558,7 +571,8 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { Opc = AMDGPU::S_ANDN2_B32; } } else if (Opc == AMDGPU::S_OR_B32) { - if (isPowerOf2_32(Imm)) { + if (isPowerOf2_32(Imm) && + MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) { NewImm = llvm::countr_zero(Imm); Opc = AMDGPU::S_BITSET1_B32; } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { @@ -578,13 +592,13 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { if (Dest->getReg().isVirtual() && SrcReg->isReg()) { MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); - return true; + return ChangeKind::UpdateHint; } if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { const bool IsUndef = SrcReg->isUndef(); const bool IsKill = SrcReg->isKill(); - MI.setDesc(TII->get(Opc)); + TII->mutateAndCleanupImplicit(MI, TII->get(Opc)); if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) { Src0->ChangeToImmediate(NewImm); @@ -596,10 +610,11 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { } else { 
SrcImm->setImm(NewImm); } + return ChangeKind::UpdateInst; } } - return false; + return ChangeKind::None; } // This is the same as MachineInstr::readsRegister/modifiesRegister except @@ -791,10 +806,10 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { Y1 = getSubRegForIndex(Y, Ysub, I); auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), TII->get(AMDGPU::V_SWAP_B32)) - .addDef(X1.Reg, 0, X1.SubReg) - .addDef(Y1.Reg, 0, Y1.SubReg) - .addReg(Y1.Reg, 0, Y1.SubReg) - .addReg(X1.Reg, 0, X1.SubReg) + .addDef(X1.Reg, {}, X1.SubReg) + .addDef(Y1.Reg, {}, Y1.SubReg) + .addReg(Y1.Reg, {}, Y1.SubReg) + .addReg(X1.Reg, {}, X1.SubReg) .getInstr(); Swaps.push_back(MIB); } @@ -854,6 +869,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { IsPostRA = MF.getProperties().hasNoVRegs(); unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + bool Changed = false; for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I, Next; @@ -877,6 +893,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { if (ModOpcode != 0) { MI.setDesc(TII->get(ModOpcode)); Src.setImm(static_cast<int64_t>(ModImm)); + Changed = true; continue; } } @@ -887,20 +904,35 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::COPY)) { if (auto *NextMI = matchSwap(MI)) { Next = NextMI->getIterator(); + Changed = true; continue; } } + // Shrink scalar logic operations. + if (MI.getOpcode() == AMDGPU::S_AND_B32 || + MI.getOpcode() == AMDGPU::S_OR_B32 || + MI.getOpcode() == AMDGPU::S_XOR_B32) { + ChangeKind CK = shrinkScalarLogicOp(MI); + if (CK == ChangeKind::UpdateHint) + continue; + Changed |= (CK == ChangeKind::UpdateInst); + } + // Try to use S_ADDK_I32 and S_MULK_I32. 
if (MI.getOpcode() == AMDGPU::S_ADD_I32 || - MI.getOpcode() == AMDGPU::S_MUL_I32) { + MI.getOpcode() == AMDGPU::S_MUL_I32 || + (MI.getOpcode() == AMDGPU::S_OR_B32 && + MI.getFlag(MachineInstr::MIFlag::Disjoint))) { const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); MachineOperand *Src1 = &MI.getOperand(2); if (!Src0->isReg() && Src1->isReg()) { - if (TII->commuteInstruction(MI, false, 1, 2)) + if (TII->commuteInstruction(MI, false, 1, 2)) { std::swap(Src0, Src1); + Changed = true; + } } // FIXME: This could work better if hints worked with subregisters. If @@ -911,22 +943,22 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } - if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { if (Src1->isImm() && isKImmOperand(*Src1)) { - unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? - AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; - + unsigned Opc = (MI.getOpcode() == AMDGPU::S_MUL_I32) + ? AMDGPU::S_MULK_I32 + : AMDGPU::S_ADDK_I32; Src1->setImm(SignExtend64(Src1->getImm(), 32)); MI.setDesc(TII->get(Opc)); MI.tieOperands(0, 1); + Changed = true; } } } // Try to use s_cmpk_* if (MI.isCompare() && TII->isSOPC(MI)) { - shrinkScalarCompare(MI); + Changed |= shrinkScalarCompare(MI); continue; } @@ -941,27 +973,21 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { if (isKImmOperand(Src)) { MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); Src.setImm(SignExtend64(Src.getImm(), 32)); + Changed = true; } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/true))) { MI.setDesc(TII->get(ModOpc)); Src.setImm(static_cast<int64_t>(ModImm)); + Changed = true; } } continue; } - // Shrink scalar logic operations. 
- if (MI.getOpcode() == AMDGPU::S_AND_B32 || - MI.getOpcode() == AMDGPU::S_OR_B32 || - MI.getOpcode() == AMDGPU::S_XOR_B32) { - if (shrinkScalarLogicOp(MI)) - continue; - } - if (IsPostRA && TII->isMIMG(MI.getOpcode()) && ST->getGeneration() >= AMDGPUSubtarget::GFX10) { - shrinkMIMG(MI); + Changed |= shrinkMIMG(MI); continue; } @@ -977,14 +1003,14 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 || (MI.getOpcode() == AMDGPU::V_FMA_F64_e64 && ST->hasFmaakFmamkF64Insts())) { - shrinkMadFma(MI); + Changed |= shrinkMadFma(MI); continue; } // If there is no chance we will shrink it and use VCC as sdst to get // a 32 bit form try to replace dead sdst with NULL. if (TII->isVOP3(MI.getOpcode())) { - tryReplaceDeadSDST(MI); + Changed |= tryReplaceDeadSDST(MI); if (!TII->hasVALU32BitEncoding(MI.getOpcode())) { continue; } @@ -995,9 +1021,12 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { // it. if (!MI.isCommutable() || !TII->commuteInstruction(MI) || !TII->canShrink(MI, *MRI)) { - tryReplaceDeadSDST(MI); + Changed |= tryReplaceDeadSDST(MI); continue; } + + // Operands were commuted. 
+ Changed = true; } int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); @@ -1101,9 +1130,10 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { foldImmediates(*Inst32); LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); + Changed = true; } } - return false; + return Changed; } bool SIShrinkInstructionsLegacy::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 6611e1e..5fd0c1e 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -188,8 +188,9 @@ private: void markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist); - void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, - unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); + void markDefs(const MachineInstr &UseMI, LiveRange &LR, + VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag, + std::vector<WorkItem> &Worklist); void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, std::vector<WorkItem> &Worklist); void markInstructionUses(const MachineInstr &MI, char Flag, @@ -318,8 +319,8 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, /// Mark all relevant definitions of register \p Reg in usage \p UseMI. void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, - Register Reg, unsigned SubReg, char Flag, - std::vector<WorkItem> &Worklist) { + VirtRegOrUnit VRegOrUnit, unsigned SubReg, + char Flag, std::vector<WorkItem> &Worklist) { LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI)); @@ -331,8 +332,9 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, // cover registers. const LaneBitmask UseLanes = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) - : (Reg.isVirtual() ? 
MRI->getMaxLaneMaskForVReg(Reg) - : LaneBitmask::getNone()); + : (VRegOrUnit.isVirtualReg() + ? MRI->getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg()) + : LaneBitmask::getNone()); // Perform a depth-first iteration of the LiveRange graph marking defs. // Stop processing of a given branch when all use lanes have been defined. @@ -382,11 +384,11 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, MachineInstr *MI = LIS->getInstructionFromIndex(Value->def); assert(MI && "Def has no defining instruction"); - if (Reg.isVirtual()) { + if (VRegOrUnit.isVirtualReg()) { // Iterate over all operands to find relevant definitions bool HasDef = false; for (const MachineOperand &Op : MI->all_defs()) { - if (Op.getReg() != Reg) + if (Op.getReg() != VRegOrUnit.asVirtualReg()) continue; // Compute lanes defined and overlap with use @@ -453,7 +455,7 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI, << " for " << MI); if (Reg.isVirtual()) { LiveRange &LR = LIS->getInterval(Reg); - markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist); + markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist); } else { // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, @@ -462,7 +464,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI, LiveRange &LR = LIS->getRegUnit(Unit); const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); if (Value) - markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist); + markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag, + Worklist); } } } @@ -1101,10 +1104,15 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( LiveRange &LR = LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin()); auto MBBE = MBB.end(); - SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First) - : LIS->getMBBEndIdx(&MBB); - SlotIndex LastIdx = - Last != MBBE ? 
LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB); + // Skip debug instructions when getting slot indices, as they don't have + // entries in the slot index map. + auto FirstNonDbg = skipDebugInstructionsForward(First, MBBE); + auto LastNonDbg = skipDebugInstructionsForward(Last, MBBE); + SlotIndex FirstIdx = FirstNonDbg != MBBE + ? LIS->getInstructionIndex(*FirstNonDbg) + : LIS->getMBBEndIdx(&MBB); + SlotIndex LastIdx = LastNonDbg != MBBE ? LIS->getInstructionIndex(*LastNonDbg) + : LIS->getMBBEndIdx(&MBB); SlotIndex Idx = PreferLast ? LastIdx : FirstIdx; const LiveRange::Segment *S; @@ -1121,8 +1129,8 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( } else { MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex()); assert(EndMI && "Segment does not end on valid instruction"); - auto NextI = std::next(EndMI->getIterator()); - if (NextI == MBB.end()) + auto NextI = next_nodbg(EndMI->getIterator(), MBB.instr_end()); + if (NextI == MBB.instr_end()) break; SlotIndex Next = LIS->getInstructionIndex(*NextI); if (Next > LastIdx) @@ -1176,16 +1184,17 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, } } + const DebugLoc &DL = MBB.findDebugLoc(Before); MachineInstr *MI; if (SaveWQM) { unsigned Opcode = IsTerminator ? LMC.AndSaveExecTermOpc : LMC.AndSaveExecOpc; - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) - .addReg(LiveMaskReg); + MI = + BuildMI(MBB, Before, DL, TII->get(Opcode), SaveWQM).addReg(LiveMaskReg); } else { unsigned Opcode = IsTerminator ? 
LMC.AndTermOpc : LMC.AndOpc; - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), LMC.ExecReg) + MI = BuildMI(MBB, Before, DL, TII->get(Opcode), LMC.ExecReg) .addReg(LMC.ExecReg) .addReg(LiveMaskReg); } @@ -1197,13 +1206,14 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SavedWQM) { + const DebugLoc &DL = MBB.findDebugLoc(Before); MachineInstr *MI; if (SavedWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), LMC.ExecReg) + MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::COPY), LMC.ExecReg) .addReg(SavedWQM); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg) + MI = BuildMI(MBB, Before, DL, TII->get(LMC.WQMOpc), LMC.ExecReg) .addReg(LMC.ExecReg); } @@ -1219,13 +1229,13 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, assert(StrictStateNeeded == StateStrictWWM || StrictStateNeeded == StateStrictWQM); + const DebugLoc &DL = MBB.findDebugLoc(Before); + if (StrictStateNeeded == StateStrictWWM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), - SaveOrig) + MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::ENTER_STRICT_WWM), SaveOrig) .addImm(-1); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM), - SaveOrig) + MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::ENTER_STRICT_WQM), SaveOrig) .addImm(-1); } LIS->InsertMachineInstrInMaps(*MI); @@ -1242,14 +1252,16 @@ void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, assert(CurrentStrictState == StateStrictWWM || CurrentStrictState == StateStrictWQM); + const DebugLoc &DL = MBB.findDebugLoc(Before); + if (CurrentStrictState == StateStrictWWM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), - LMC.ExecReg) - .addReg(SavedOrig); + MI = + BuildMI(MBB, Before, DL, TII->get(AMDGPU::EXIT_STRICT_WWM), LMC.ExecReg) + .addReg(SavedOrig); } else { - MI = BuildMI(MBB, 
Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM), - LMC.ExecReg) - .addReg(SavedOrig); + MI = + BuildMI(MBB, Before, DL, TII->get(AMDGPU::EXIT_STRICT_WQM), LMC.ExecReg) + .addReg(SavedOrig); } LIS->InsertMachineInstrInMaps(*MI); StateTransition[MI] = NonStrictState; @@ -1629,7 +1641,7 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { } // Insert instruction sequence at block beginning (before vector operations). - const DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const unsigned WavefrontSize = ST->getWavefrontSize(); const unsigned Mask = (WavefrontSize << 1) - 1; Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 781c61b0..ee8d29c 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -1464,7 +1464,7 @@ class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName, class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> : SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12, SGPR_NULL_gfx11plus> { - let AssemblerPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Only; let DecoderNamespace = "GFX12"; let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); @@ -1537,3 +1537,84 @@ multiclass SMEM_Real_Probe_gfx12<bits<6> op> { defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>; defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>; + +//===----------------------------------------------------------------------===// +// GFX13. 
+//===----------------------------------------------------------------------===// + +class SMEM_Real_gfx13<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> : + SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX13, + SGPR_NULL_gfx11plus> { + let AssemblerPredicate = isGFX13Plus; + let DecoderNamespace = "GFX13"; + + let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); + let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); +} + +class SMEM_Real_Prefetch_gfx13<bits<6> op, SM_Pseudo ps> : + SMEM_Real_gfx13<op, ps> { + bits<7> sdata; // Only 5 bits of sdata are supported. + + let sdst = ?; + let Inst{12-11} = 0; // Unused sdata bits. + let Inst{10-6} = !if(ps.has_sdst, sdata{4-0}, ?); +} + +class SMEM_Real_Load_gfx13<bits<6> op, string ps, string opName, OffsetMode offsets> : + SMEM_Real_gfx13<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> { + RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); + + let Inst{20} = cpol{CPolBit.NV}; // non-volatile + let Inst{22-21} = cpol{4-3}; // scope + let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported + let Inst{56} = cpol{CPolBit.SCAL}; // scale offset +} + +multiclass SM_Real_Loads_gfx13<bits<6> op, string ps = NAME> { + defvar opName = !tolower(NAME); + def _IMM_gfx13 : SMEM_Real_Load_gfx13<op, ps, opName, IMM_Offset>; + def _SGPR_IMM_gfx13 : SMEM_Real_Load_gfx13<op, ps, opName, SGPR_IMM_OptOffset>; +} + +defm S_LOAD_B32 : SM_Real_Loads_gfx13<0x00, "S_LOAD_DWORD">; +defm S_LOAD_B64 : SM_Real_Loads_gfx13<0x01, "S_LOAD_DWORDX2">; +defm S_LOAD_B96 : SM_Real_Loads_gfx13<0x0e, "S_LOAD_DWORDX3">; +defm S_LOAD_B128 : SM_Real_Loads_gfx13<0x02, "S_LOAD_DWORDX4">; +defm S_LOAD_B256 : SM_Real_Loads_gfx13<0x03, "S_LOAD_DWORDX8">; +defm S_LOAD_B512 : SM_Real_Loads_gfx13<0x04, "S_LOAD_DWORDX16">; + +defm S_LOAD_I8 : SM_Real_Loads_gfx13<0x30>; +defm S_LOAD_U8 : SM_Real_Loads_gfx13<0x31>; +defm S_LOAD_I16 : 
SM_Real_Loads_gfx13<0x32>; +defm S_LOAD_U16 : SM_Real_Loads_gfx13<0x33>; + +defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx13<0x08, "S_BUFFER_LOAD_DWORD">; +defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx13<0x09, "S_BUFFER_LOAD_DWORDX2">; +defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx13<0x0d, "S_BUFFER_LOAD_DWORDX3">; +defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx13<0x0a, "S_BUFFER_LOAD_DWORDX4">; +defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx13<0x0b, "S_BUFFER_LOAD_DWORDX8">; +defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx13<0x0c, "S_BUFFER_LOAD_DWORDX16">; + +defm S_BUFFER_LOAD_I8 : SM_Real_Loads_gfx13<0x34>; +defm S_BUFFER_LOAD_U8 : SM_Real_Loads_gfx13<0x35>; +defm S_BUFFER_LOAD_I16 : SM_Real_Loads_gfx13<0x36>; +defm S_BUFFER_LOAD_U16 : SM_Real_Loads_gfx13<0x37>; + +def S_DCACHE_INV_gfx13 : SMEM_Real_gfx13<0x020, S_DCACHE_INV>; + +def S_PREFETCH_INST_gfx13 : SMEM_Real_Prefetch_gfx13<0x22, S_PREFETCH_INST>; +def S_PREFETCH_INST_PC_REL_gfx13 : SMEM_Real_Prefetch_gfx13<0x23, S_PREFETCH_INST_PC_REL>; +def S_PREFETCH_DATA_gfx13 : SMEM_Real_Prefetch_gfx13<0x2c, S_PREFETCH_DATA>; +def S_BUFFER_PREFETCH_DATA_gfx13 : SMEM_Real_Prefetch_gfx13<0x2d, S_BUFFER_PREFETCH_DATA>; +def S_PREFETCH_DATA_PC_REL_gfx13 : SMEM_Real_Prefetch_gfx13<0x2e, S_PREFETCH_DATA_PC_REL>; + +multiclass SMEM_Real_Probe_gfx13<bits<6> op> { + defvar ps = NAME; + def _IMM_gfx13 : SMEM_Real_Prefetch_gfx13<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; + def _SGPR_IMM_gfx13 : SMEM_Real_Prefetch_gfx13<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_OPT_IMM)>; +} + +defm S_ATC_PROBE : SMEM_Real_Probe_gfx13<0x26>; +defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx13<0x27>; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 84287b6..ce6e862 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -433,8 +433,10 @@ let SubtargetPredicate = isGFX11Plus in { } // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = isGFX12Plus in { - let 
hasSideEffects = 1, Defs = [SCC] in { - def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">; + let hasSideEffects = 1, isConvergent = 1, Defs = [SCC] in { + def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr", + [(set SCC, (int_amdgcn_s_alloc_vgpr SSrc_b32:$src0))] + >; } } // End SubtargetPredicate = isGFX12Plus @@ -469,6 +471,25 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE], } // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE] // SchedRW = [WriteSFPU], isReMaterializable = 1 +let SubtargetPredicate = HasSALUFloatInsts, AddedComplexity = 9 in { + // Fallback patterns for f32->i16 conversion. + def : GCNPat<(i16 (UniformUnaryFrag<fp_to_sint> f32:$src0)), + (S_CVT_I32_F32 $src0)>; + def : GCNPat<(i16 (UniformUnaryFrag<fp_to_uint> f32:$src0)), + (S_CVT_U32_F32 $src0)>; + // f16 -> i32 : form chain f16 -> f32 -> i32 + def : GCNPat<(i32 (UniformUnaryFrag<fp_to_sint> f16:$src0)), + (S_CVT_I32_F32 (S_CVT_F32_F16 $src0))>; + def : GCNPat<(i32 (UniformUnaryFrag<fp_to_uint> f16:$src0)), + (S_CVT_U32_F32 (S_CVT_F32_F16 $src0))>; + + // i32 -> f16 : form chain i32 -> f32 -> f16 + def : GCNPat<(f16 (UniformUnaryFrag<sint_to_fp> i32:$src0)), + (S_CVT_F16_F32 (S_CVT_F32_I32 $src0))>; + def : GCNPat<(f16 (UniformUnaryFrag<uint_to_fp> i32:$src0)), + (S_CVT_F16_F32 (S_CVT_F32_U32 $src0))>; +} + let hasSideEffects = 1 in { let has_sdst = 0 in { let Uses = [M0] in { @@ -504,6 +525,12 @@ def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), let isConvergent = 1; } +def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; + let SubtargetPredicate = HasSWakeupBarrier; +} } // End Uses = [M0] def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), @@ -527,6 +554,12 @@ def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), let isConvergent = 1; } +def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs), + (ins SplitBarrier:$src0), "$src0", 
[]>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; + let SubtargetPredicate = HasSWakeupBarrier; +} } // End has_sdst = 0 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst), @@ -838,9 +871,10 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo < let SubtargetPredicate = isGFX6GFX7GFX8GFX9; } -let Defs = [SCC] in { -def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; -} // End Defs = [SCC] +let isCommutable = 1, Defs = [SCC] in +def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32", + [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))] +>; let SubtargetPredicate = isGFX8GFX9 in { def S_RFE_RESTORE_B64 : SOP2_Pseudo < @@ -1618,23 +1652,34 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm let isConvergent = 1; } + +let SchedRW = [WriteBarrier], isConvergent = 1 in { + let SubtargetPredicate = isGFX12Only in def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "", [(int_amdgcn_s_barrier_leave (i16 srcvalue))] > { - let SchedRW = [WriteBarrier]; - let simm16 = 0; - let fixed_imm = 1; - let isConvergent = 1; - let Defs = [SCC]; + let simm16 = 0; + let fixed_imm = 1; + let Defs = [SCC]; + } + + let SubtargetPredicate = HasSBarrierLeaveImm in + def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave", + (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>; } def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { - let SubtargetPredicate = isGFX8Plus; + let SubtargetPredicate = isGFX8GFX9GFX10GFX11GFX12; let simm16 = 0; let fixed_imm = 1; let mayLoad = 1; let mayStore = 1; } +let SubtargetPredicate = HasSWakeupImm in { + def S_WAKEUP_imm : SOPP_Pseudo <"s_wakeup", + (ins i16imm:$simm16), "$simm16">; +} // End SubtargetPredicate = HasSWakeupImm + let SubtargetPredicate = isNotGFX1250Plus in { def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; @@ -1667,11 +1712,21 @@ let SubtargetPredicate = HasWaitXcnt in 
{ // Represents the point at which a wave must wait for all outstanding direct loads to LDS. // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. - def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { let hasSideEffects = 0; } +let SubtargetPredicate = HasVMemToLDSLoad in { +def ASYNCMARK : SPseudoInstSI<(outs), (ins), + [(int_amdgcn_asyncmark)]> { + let maybeAtomic = 0; +} +def WAIT_ASYNCMARK : SOPP_Pseudo <"", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_wait_asyncmark timm:$simm16)]> { + let maybeAtomic = 0; +} +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> { @@ -1791,8 +1846,8 @@ let SubtargetPredicate = isGFX10Plus in { let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasExportInsts] in - def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16), - "$simm16"> { + def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins WaitEvent:$simm16), + "$simm16", [(int_amdgcn_s_wait_event timm:$simm16)]> { let hasSideEffects = 1; } def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins SDelayALU:$simm16), @@ -1915,9 +1970,7 @@ def : GCNPat< (S_SEXT_I32_I16 $src) >; -let SubtargetPredicate = isNotGFX12Plus in - def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>; -let SubtargetPredicate = isGFX12Plus in +let SubtargetPredicate = isGFX11Plus in def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 2))>; // The first 10 bits of the mode register are the core FP mode on all @@ -2091,7 +2144,34 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> { } //===----------------------------------------------------------------------===// -// SOP1 - GFX11, GFX12 +// SOP1 - GFX13 +//===----------------------------------------------------------------------===// + +multiclass SOP1_Real_gfx13<bits<8> op, string name = 
!tolower(NAME)> { + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx13 : SOP1_Real<op, ps, name>, + Select<GFX13Gen, ps.Mnemonic>; + if !ne(ps.Mnemonic, name) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>; +} + +multiclass SOP1_M0_Real_gfx13<bits<8> op> { + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx13 : SOP1_Real<op, ps>, Select<GFX13Gen, ps.PseudoInstr> { + let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0 + } +} + +multiclass SOP1_IMM_Real_gfx13<bits<8> op> { + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx13 : SOP1_Real<op, ps>, + Select<GFX13Gen, ps.PseudoInstr>; +} + +defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx13<0x011>; + +//===----------------------------------------------------------------------===// +// SOP1 - GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> { @@ -2110,23 +2190,29 @@ multiclass SOP1_Real_gfx12<bits<8> op, string name = !tolower(NAME)> { Select<GFX12Gen, ps.PseudoInstr>; if !ne(ps.Mnemonic, name) then def : AMDGPUMnemonicAlias<ps.Mnemonic, name> { - let AssemblerPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Only; } } multiclass SOP1_M0_Real_gfx12<bits<8> op> { - def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, - Select<GFX12Gen, !cast<SOP1_Pseudo>(NAME).PseudoInstr> { + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx12 : SOP1_Real<op, ps>, Select<GFX12Gen, ps.PseudoInstr> { let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0 } } +multiclass SOP1_M0_Real_gfx12_gfx13<bits<8> op> : + SOP1_M0_Real_gfx12<op>, SOP1_M0_Real_gfx13<op>; + multiclass SOP1_IMM_Real_gfx12<bits<8> op> { defvar ps = !cast<SOP1_Pseudo>(NAME); def _gfx12 : SOP1_Real<op, ps>, Select<GFX12Gen, ps.PseudoInstr>; } +multiclass SOP1_IMM_Real_gfx12_gfx13<bits<8> op> : + SOP1_IMM_Real_gfx12<op>, SOP1_IMM_Real_gfx13<op>; + multiclass SOP1_Real_gfx11_gfx12<bits<8> op, 
string name = !tolower(NAME)> : SOP1_Real_gfx11<op, name>, SOP1_Real_gfx12<op, name>; @@ -2139,6 +2225,12 @@ multiclass SOP1_Real_gfx1250<bits<8> op, string name = !tolower(NAME)> { def : AMDGPUMnemonicAlias<ps.Mnemonic, name>; } +multiclass SOP1_Real_gfx11_gfx12_gfx13<bits<8> op> : + SOP1_Real_gfx11<op>, SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>; + +multiclass SOP1_Real_gfx12_gfx13<bits<8> op> : + SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>; + defm S_MOV_B32 : SOP1_Real_gfx11_gfx12<0x000>; defm S_MOV_B64 : SOP1_Real_gfx11_gfx12<0x001>; defm S_CMOV_B32 : SOP1_Real_gfx11_gfx12<0x002>; @@ -2207,47 +2299,49 @@ defm S_GETPC_B64 : SOP1_Real_gfx1250<0x047, "s_get_pc_i64">; defm S_SETPC_B64 : SOP1_Real_gfx1250<0x048, "s_set_pc_i64">; defm S_SWAPPC_B64 : SOP1_Real_gfx1250<0x049, "s_swap_pc_i64">; defm S_RFE_B64 : SOP1_Real_gfx1250<0x04a, "s_rfe_i64">; -defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12<0x04c>; -defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>; -defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>; -defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>; -defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>; -defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>; -defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>; -defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>; -defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>; -defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; -defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>; -defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>; -defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; -defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; - -// GFX1250 +defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12_gfx13<0x04c>; +defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12_gfx13<0x04d>; +defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12_gfx13<0x04e>; +defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12_gfx13<0x04f>; +defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12_gfx13<0x050>; +defm S_BARRIER_INIT_M0 : 
SOP1_M0_Real_gfx12_gfx13<0x051>; +defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12_gfx13<0x052>; +defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12_gfx13<0x04e>; +defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12_gfx13<0x04f>; +defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12_gfx13<0x050>; +defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12_gfx13<0x051>; +defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12_gfx13<0x052>; +defm S_ALLOC_VGPR : SOP1_Real_gfx12_gfx13<0x053>; +defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12_gfx13<0x058>; + +// GFX1250, GFX13 defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>; -defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>; +defm S_ADD_PC_I64 : SOP1_Real_gfx12_gfx13<0x04b>; +defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12_gfx13<0x057>; +defm S_WAKEUP_BARRIER_IMM : SOP1_IMM_Real_gfx12_gfx13<0x057>; //===----------------------------------------------------------------------===// -// SOP1 - GFX1150, GFX12 +// SOP1 - GFX1150, GFX12, GFX13 //===----------------------------------------------------------------------===// -defm S_CEIL_F32 : SOP1_Real_gfx11_gfx12<0x060>; -defm S_FLOOR_F32 : SOP1_Real_gfx11_gfx12<0x061>; -defm S_TRUNC_F32 : SOP1_Real_gfx11_gfx12<0x062>; -defm S_RNDNE_F32 : SOP1_Real_gfx11_gfx12<0x063>; -defm S_CVT_F32_I32 : SOP1_Real_gfx11_gfx12<0x064>; -defm S_CVT_F32_U32 : SOP1_Real_gfx11_gfx12<0x065>; -defm S_CVT_I32_F32 : SOP1_Real_gfx11_gfx12<0x066>; -defm S_CVT_U32_F32 : SOP1_Real_gfx11_gfx12<0x067>; -defm S_CVT_F16_F32 : SOP1_Real_gfx11_gfx12<0x068>; -defm S_CVT_F32_F16 : SOP1_Real_gfx11_gfx12<0x069>; -defm S_CVT_HI_F32_F16 : SOP1_Real_gfx11_gfx12<0x06a>; -defm S_CEIL_F16 : SOP1_Real_gfx11_gfx12<0x06b>; -defm S_FLOOR_F16 : SOP1_Real_gfx11_gfx12<0x06c>; -defm S_TRUNC_F16 : SOP1_Real_gfx11_gfx12<0x06d>; -defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12<0x06e>; +defm S_CEIL_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x060>; +defm S_FLOOR_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x061>; +defm S_TRUNC_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x062>; +defm S_RNDNE_F32 : 
SOP1_Real_gfx11_gfx12_gfx13<0x063>; +defm S_CVT_F32_I32 : SOP1_Real_gfx11_gfx12_gfx13<0x064>; +defm S_CVT_F32_U32 : SOP1_Real_gfx11_gfx12_gfx13<0x065>; +defm S_CVT_I32_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x066>; +defm S_CVT_U32_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x067>; +defm S_CVT_F16_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x068>; +defm S_CVT_F32_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x069>; +defm S_CVT_HI_F32_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06a>; +defm S_CEIL_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06b>; +defm S_FLOOR_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06c>; +defm S_TRUNC_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06d>; +defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06e>; //===----------------------------------------------------------------------===// -// SOP1 - GFX10. +// SOP1 - GFX10, GFX13 //===----------------------------------------------------------------------===// multiclass SOP1_Real_gfx10<bits<8> op> { @@ -2256,30 +2350,33 @@ multiclass SOP1_Real_gfx10<bits<8> op> { Select<GFX10Gen, ps.PseudoInstr>; } -multiclass SOP1_Real_gfx10_gfx11_gfx12<bits<8> op> : - SOP1_Real_gfx10<op>, SOP1_Real_gfx11_gfx12<op>; +multiclass SOP1_Real_gfx10_gfx13<bits<8> op> : + SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op>; -defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>; -defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>; -defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>; -defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10<0x03a>; -defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10<0x03b>; -defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03c>; -defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03d>; -defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03e>; -defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03f>; -defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x040>; -defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x041>; -defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x042>; -defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x043>; -defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x044>; -defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x045>; -defm 
S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10<0x046>; -defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>; -defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>; +multiclass SOP1_Real_gfx10_Renamed_gfx13<bits<8> op, string gfx13_name> : + SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op, gfx13_name>; + +defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x37, "s_and_not0_saveexec_b64">; +defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x38, "s_or_not0_saveexec_b64">; +defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x39, "s_and_not0_wrexec_b64">; +defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x3a, "s_and_not1_wrexec_b64">; +defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10_gfx13<0x03b>; +defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03c>; +defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03d>; +defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03e>; +defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x3f, "s_and_not1_saveexec_b32">; +defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x40, "s_or_not1_saveexec_b32">; +defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x041>; +defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x042>; +defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x043>; +defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x44, "s_and_not0_saveexec_b32">; +defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x45, "s_or_not0_saveexec_b32">; +defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x46, "s_and_not0_wrexec_b32">; +defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x47, "s_and_not1_wrexec_b32">; +defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10_gfx13<0x049>; //===----------------------------------------------------------------------===// -// SOP1 - GFX6, GFX7, GFX10, GFX11. 
+// SOP1 - GFX6, GFX7, GFX10, GFX11, GFX13 //===----------------------------------------------------------------------===// @@ -2292,61 +2389,82 @@ multiclass SOP1_Real_gfx6_gfx7<bits<8> op> { multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> : SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>; -multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> : - SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11_gfx12<op>; +multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx13<bits<8> op> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op>; + +multiclass SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<bits<8> op, string gfx13_name> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op, gfx13_name>; + +multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<8> op> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx11<op>, + SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>; defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>; -defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>; -defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x004>; -defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x005>; -defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x006>; -defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x007>; -defm S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x008>; -defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x009>; -defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00a>; -defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00b>; -defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00c>; -defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00d>; -defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00e>; -defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00f>; -defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x010>; +defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x003>; +defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x004>; +defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x005>; +defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x006>; +defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x007>; +defm 
S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x008>; +defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x009>; +defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00a>; +defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00b>; +defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00c>; +defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00d>; +defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00e>; +defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00f>; +defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x010>; defm S_FF0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x011>; defm S_FF0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x012>; -defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x013>; -defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x014>; -defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x015>; -defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x016>; -defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x017>; -defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10<0x018>; -defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10<0x019>; -defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10<0x01a>; -defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01b>; -defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01c>; -defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01d>; -defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01e>; -defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01f>; -defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x020>; -defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x021>; -defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x022>; -defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x024>; -defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x025>; -defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x026>; -defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>; -defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>; -defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>; -defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>; -defm S_XNOR_SAVEEXEC_B64 
: SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>; -defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>; -defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>; -defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>; -defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02f>; -defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>; -defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>; -defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>; - -//===----------------------------------------------------------------------===// -// SOP2 - GFX12 +defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x013, "s_ctz_i32_b32">; +defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x014, "s_ctz_i32_b64">; +defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x015, "s_clz_i32_u32">; +defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x016, "s_clz_i32_u64">; +defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x017, "s_cls_i32">; +defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x018, "s_cls_i32_i64">; +defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x019>; +defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01a>; +defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01b>; +defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01c>; +defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01d>; +defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01e>; +defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x01f, "s_get_pc_i64">; +defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x020, "s_set_pc_i64">; +defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x021, "s_swap_pc_i64">; +defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x022, "s_rfe_i64">; +defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x024>; +defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x025>; +defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x026>; +defm S_ANDN2_SAVEEXEC_B64 : 
SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x027, "s_and_not1_saveexec_b64">; +defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x028, "s_or_not1_saveexec_b64">; +defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x029>; +defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02a>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x02b>; +defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02c>; +defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02d>; +defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02e>; +defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02f>; +defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x030>; +defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x031>; +defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x034>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX13. +//===----------------------------------------------------------------------===// + +multiclass SOP2_Real_gfx13<bits<7> op, string name = !tolower(NAME)> { + defvar ps = !cast<SOP2_Pseudo>(NAME); + def _gfx13 : SOP2_Real32<op, ps, name>, + Select<GFX13Gen, ps.Mnemonic>; + if !ne(ps.Mnemonic, name) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>; +} + +defm S_PACK_HL_B32_B16 : SOP2_Real_gfx13<0x37>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> { @@ -2355,17 +2473,23 @@ multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> { Select<GFX12Gen, ps.PseudoInstr>; if !ne(ps.Mnemonic, name) then def : AMDGPUMnemonicAlias<ps.Mnemonic, name> { - let AssemblerPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Only; } } -defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>; -defm S_MAXIMUM_F32 : SOP2_Real_gfx12<0x050>; -defm 
S_MINIMUM_F16 : SOP2_Real_gfx12<0x051>; -defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>; +multiclass SOP2_Real_gfx12_gfx13<bits<7> op, string name = !tolower(NAME)> : + SOP2_Real_gfx12<op, name>, SOP2_Real_gfx13<op, name>; + +defm S_MINIMUM_F32 : SOP2_Real_gfx12_gfx13<0x04f>; +defm S_MAXIMUM_F32 : SOP2_Real_gfx12_gfx13<0x050>; +defm S_MINIMUM_F16 : SOP2_Real_gfx12_gfx13<0x051>; +defm S_MAXIMUM_F16 : SOP2_Real_gfx12_gfx13<0x052>; +defm S_ADD_U64 : SOP2_Real_gfx12_gfx13<0x053, "s_add_nc_u64">; +defm S_SUB_U64 : SOP2_Real_gfx12_gfx13<0x054, "s_sub_nc_u64">; +defm S_MUL_U64 : SOP2_Real_gfx12_gfx13<0x055>; //===----------------------------------------------------------------------===// -// SOP2 - GFX11, GFX12. +// SOP2 - GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx11<bits<7> op, string name = !tolower(NAME)> { @@ -2424,14 +2548,19 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx11_gfx12<0x02e>; defm S_CSELECT_B32 : SOP2_Real_gfx11_gfx12<0x030>; defm S_CSELECT_B64 : SOP2_Real_gfx11_gfx12<0x031>; defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11_gfx12<0x035>; -defm S_ADD_U64 : SOP2_Real_gfx12<0x053, "s_add_nc_u64">; -defm S_SUB_U64 : SOP2_Real_gfx12<0x054, "s_sub_nc_u64">; -defm S_MUL_U64 : SOP2_Real_gfx12<0x055>; //===----------------------------------------------------------------------===// -// SOP2 - GFX1150, GFX12 +// SOP2 - GFX1150, GFX12, GFX13 //===----------------------------------------------------------------------===// +multiclass SOP2_Real_gfx11_gfx12_gfx13<bits<7> op> : + SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op>, SOP2_Real_gfx13<op>; + +multiclass SOP2_Real_FMAK_gfx13<bits<7> op> { + def _gfx13 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>, + Select<GFX13Gen, !cast<SOP2_Pseudo>(NAME).Mnemonic>; +} + multiclass SOP2_Real_FMAK_gfx12<bits<7> op> { def _gfx12 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>, Select<GFX12Gen, !cast<SOP2_Pseudo>(NAME).PseudoInstr>; @@ -2442,35 +2571,36 @@ multiclass 
SOP2_Real_FMAK_gfx11<bits<7> op> { Select<GFX11Gen, !cast<SOP2_Pseudo>(NAME).PseudoInstr>; } -multiclass SOP2_Real_FMAK_gfx11_gfx12<bits<7> op> : - SOP2_Real_FMAK_gfx11<op>, SOP2_Real_FMAK_gfx12<op>; +multiclass SOP2_Real_FMAK_gfx11_gfx12_gfx13<bits<7> op> : + SOP2_Real_FMAK_gfx11<op>, SOP2_Real_FMAK_gfx12<op>, SOP2_Real_FMAK_gfx13<op>; -defm S_ADD_F32 : SOP2_Real_gfx11_gfx12<0x040>; -defm S_SUB_F32 : SOP2_Real_gfx11_gfx12<0x041>; -defm S_MUL_F32 : SOP2_Real_gfx11_gfx12<0x044>; -defm S_FMAAK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x045>; -defm S_FMAMK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x046>; -defm S_FMAC_F32 : SOP2_Real_gfx11_gfx12<0x047>; -defm S_CVT_PK_RTZ_F16_F32 : SOP2_Real_gfx11_gfx12<0x048>; -defm S_ADD_F16 : SOP2_Real_gfx11_gfx12<0x049>; -defm S_SUB_F16 : SOP2_Real_gfx11_gfx12<0x04a>; -defm S_MUL_F16 : SOP2_Real_gfx11_gfx12<0x04d>; -defm S_FMAC_F16 : SOP2_Real_gfx11_gfx12<0x04e>; +defm S_ADD_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x040>; +defm S_SUB_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x041>; +defm S_MUL_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x044>; +defm S_FMAAK_F32 : SOP2_Real_FMAK_gfx11_gfx12_gfx13<0x045>; +defm S_FMAMK_F32 : SOP2_Real_FMAK_gfx11_gfx12_gfx13<0x046>; +defm S_FMAC_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x047>; +defm S_CVT_PK_RTZ_F16_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x048>; +defm S_ADD_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x049>; +defm S_SUB_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04a>; +defm S_MUL_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04d>; +defm S_FMAC_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04e>; //===----------------------------------------------------------------------===// -// SOP2 - GFX1150 +// SOP2 - GFX1150, GFX12, GFX13 //===----------------------------------------------------------------------===// -multiclass SOP2_Real_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> : - SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op, gfx12_name>; +multiclass SOP2_Real_gfx11_Renamed_gfx12_gfx13<bits<7> op, string gfx12_gfx13_name> : + SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op, 
gfx12_gfx13_name>, + SOP2_Real_gfx13<op, gfx12_gfx13_name>; -defm S_MIN_F32 : SOP2_Real_gfx11_Renamed_gfx12<0x042, "s_min_num_f32">; -defm S_MAX_F32 : SOP2_Real_gfx11_Renamed_gfx12<0x043, "s_max_num_f32">; -defm S_MIN_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04b, "s_min_num_f16">; -defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04c, "s_max_num_f16">; +defm S_MIN_F32 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x042, "s_min_num_f32">; +defm S_MAX_F32 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x043, "s_max_num_f32">; +defm S_MIN_F16 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x04b, "s_min_num_f16">; +defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x04c, "s_max_num_f16">; //===----------------------------------------------------------------------===// -// SOP2 - GFX10. +// SOP2 - GFX10, GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx10<bits<7> op> { @@ -2479,21 +2609,25 @@ multiclass SOP2_Real_gfx10<bits<7> op> { Select<GFX10Gen, ps.PseudoInstr>; } -multiclass SOP2_Real_gfx10_gfx11_gfx12<bits<7> op> : - SOP2_Real_gfx10<op>, SOP2_Real_gfx11_gfx12<op>; +multiclass SOP2_Real_gfx10_gfx13<bits<7> op> : + SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op>; -defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>; -defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>; -defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>; -defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>; -defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x032>; -defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x033>; -defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x034>; -defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>; -defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>; +multiclass SOP2_Real_gfx10_gfx11_gfx12_gfx13<bits<7> op> : + SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op>, + SOP2_Real_gfx13<op>; + +defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10_gfx13<0x02e>; +defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10_gfx13<0x02f>; +defm S_LSHL3_ADD_U32 : 
SOP2_Real_gfx10_gfx13<0x030>; +defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10_gfx13<0x031>; +defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x032>; +defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x033>; +defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x034>; +defm S_MUL_HI_U32 : SOP2_Real_gfx10_gfx13<0x035>; +defm S_MUL_HI_I32 : SOP2_Real_gfx10_gfx13<0x036>; //===----------------------------------------------------------------------===// -// SOP2 - GFX6, GFX7. +// SOP2 - GFX6, GFX7, GFX10, GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx6_gfx7<bits<7> op> { @@ -2502,57 +2636,105 @@ multiclass SOP2_Real_gfx6_gfx7<bits<7> op> { Select_gfx6_gfx7<ps.PseudoInstr>; } -multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> : - SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>; +multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx13<bits<7> op> : + SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op>; + +multiclass SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<bits<7> op, string gfx13_name> : + SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op, gfx13_name>; -multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> : +multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<bits<7> op, string gfx12_gfx13_name> : SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>, - SOP2_Real_gfx12<op, gfx12_name>; + SOP2_Real_gfx12<op, gfx12_gfx13_name>, SOP2_Real_gfx13<op, gfx12_gfx13_name>; defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>; -defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x000, "s_add_co_u32">; -defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x001, "s_sub_co_u32">; -defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x002, "s_add_co_i32">; -defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x003, "s_sub_co_i32">; -defm S_ADDC_U32 : 
SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x004, "s_add_co_ci_u32">; -defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x005, "s_sub_co_ci_u32">; -defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>; -defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>; -defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>; -defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x009>; -defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00a>; -defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00b>; -defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00e>; -defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00f>; -defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x010>; -defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x011>; -defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x012>; -defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x013>; -defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x014>; -defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x015>; -defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x016>; -defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x017>; -defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x018>; -defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x019>; -defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01a>; -defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01b>; -defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01c>; -defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01d>; -defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01e>; -defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01f>; -defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x020>; -defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x021>; -defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x022>; -defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x023>; -defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x024>; -defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x025>; -defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x026>; -defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x027>; -defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x028>; -defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10<0x029>; -defm S_BFE_I64 : 
SOP2_Real_gfx6_gfx7_gfx10<0x02a>; -defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>; +defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x000, "s_add_co_u32">; +defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x001, "s_sub_co_u32">; +defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x002, "s_add_co_i32">; +defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x003, "s_sub_co_i32">; +defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x004, "s_add_co_ci_u32">; +defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x005, "s_sub_co_ci_u32">; +defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x006>; +defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x007>; +defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x008>; +defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x009>; +defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00a>; +defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00b>; +defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00e>; +defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00f>; +defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x010>; +defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x011>; +defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x012>; +defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x013>; +defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x014, "s_and_not1_b32">; +defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x015, "s_and_not1_b64">; +defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x016, "s_or_not1_b32">; +defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x017, "s_or_not1_b64">; +defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x018>; +defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x019>; +defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01a>; +defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01b>; +defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01c>; 
+defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01d>; +defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01e>; +defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01f>; +defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x020>; +defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x021>; +defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x022>; +defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x023>; +defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x024>; +defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x025>; +defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x026>; +defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x027>; +defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x028>; +defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x029>; +defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x02a>; +defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x02c>; + + +//===----------------------------------------------------------------------===// +// SOPK - GFX10 Only +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx10<bits<5> op> { + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx10 : SOPK_Real32<op, ps>, + Select<GFX10Gen, ps.PseudoInstr>; +} + +multiclass SOPK_Real64_gfx10<bits<5> op> { + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx10 : SOPK_Real64<op, ps>, + Select<GFX10Gen, ps.PseudoInstr>; +} + +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>; +defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>; +defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>; + +//===----------------------------------------------------------------------===// +// SOPK - GFX11 Only +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx11<bits<5> op> { + def _gfx11 : SOPK_Real32<op, 
!cast<SOPK_Pseudo>(NAME)>, + Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; +} + +multiclass SOPK_Real64_gfx11<bits<5> op> { + def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, + Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; +} + +defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>; +defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>; +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>; //===----------------------------------------------------------------------===// // SOPK - GFX11, GFX12. @@ -2568,21 +2750,11 @@ multiclass SOPK_Real32_gfx12<bits<5> op, string name = !tolower(NAME)> { } } -multiclass SOPK_Real32_gfx11<bits<5> op> { - def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, - Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; -} - multiclass SOPK_Real64_gfx12<bits<5> op> { def _gfx12 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, Select<GFX12Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; } -multiclass SOPK_Real64_gfx11<bits<5> op> { - def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, - Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; -} - multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> : SOPK_Real32_gfx11<op>, SOPK_Real32_gfx12<op>; @@ -2604,43 +2776,39 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11_gfx12<0x013>; let OtherPredicates = [isNotGFX1250Plus] in defm S_CALL_B64 : SOPK_Real32_gfx11_gfx12<0x014>; defm S_CALL_B64 : SOPK_Real32_gfx1250<0x014, "s_call_i64">; -defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>; -defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>; -defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>; -defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>; -defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>; -defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>; //===----------------------------------------------------------------------===// -// SOPK 
- GFX10. +// SOPK - GFX10, GFX11, GFX12, GFX13. //===----------------------------------------------------------------------===// -multiclass SOPK_Real32_gfx10<bits<5> op> { +multiclass SOPK_Real32_gfx13<bits<5> op, string name = !tolower(NAME)> { defvar ps = !cast<SOPK_Pseudo>(NAME); - def _gfx10 : SOPK_Real32<op, ps>, - Select<GFX10Gen, ps.PseudoInstr>; + def _gfx13 : SOPK_Real32<op, ps, name>, + Select<GFX13Gen, ps.Mnemonic>; + if !ne(ps.Mnemonic, name) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>; } -multiclass SOPK_Real64_gfx10<bits<5> op> { +multiclass SOPK_Real64_gfx13<bits<5> op> { defvar ps = !cast<SOPK_Pseudo>(NAME); - def _gfx10 : SOPK_Real64<op, ps>, - Select<GFX10Gen, ps.PseudoInstr>; + def _gfx13 : SOPK_Real64<op, ps>, + Select<GFX13Gen, ps.Mnemonic>; } -multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> : - SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>; +multiclass SOPK_Real32_gfx10_gfx11_gfx12_gfx13<bits<5> op> : + SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, SOPK_Real32_gfx12<op>, + SOPK_Real32_gfx13<op>; -multiclass SOPK_Real32_gfx10_gfx11_gfx12<bits<5> op> : - SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11_gfx12<op>; +defm S_VERSION : SOPK_Real32_gfx10_gfx11_gfx12_gfx13<0x001>; -defm S_VERSION : SOPK_Real32_gfx10_gfx11_gfx12<0x001>; -defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>; -defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; -defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; -defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>; -defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>; -defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>; -defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>; +//===----------------------------------------------------------------------===// +// SOPK - GFX10, GFX13. 
+//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx10_Renamed_gfx13<bits<5> op, string gfx13_name> : + SOPK_Real32_gfx10<op>, SOPK_Real32_gfx13<op, gfx13_name>; + +defm S_CALL_B64 : SOPK_Real32_gfx10_Renamed_gfx13<0x016, "s_call_i64">; //===----------------------------------------------------------------------===// // SOPK - GFX6, GFX7. @@ -2652,32 +2820,15 @@ multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> { Select_gfx6_gfx7<ps.PseudoInstr>; } -multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> { - defvar ps = !cast<SOPK_Pseudo>(NAME); - def _gfx6_gfx7 : SOPK_Real64<op, ps>, - Select_gfx6_gfx7<ps.PseudoInstr>; -} - -multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> : - SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>; +defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>; -multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> : - SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>; +//===----------------------------------------------------------------------===// +// SOPK - GFX6, GFX7, GFX10, GFX11. 
+//===----------------------------------------------------------------------===// multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11<bits<5> op> : - SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11<op>; + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>; -multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<bits<5> op> : - SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11_gfx12<op>; - -multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<bits<5> op, string gfx12_name> : - SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, - SOPK_Real32_gfx12<op, gfx12_name>; - -defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>; - -defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>; -defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>; defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x003>; defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x004>; defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x005>; @@ -2690,11 +2841,71 @@ defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00b>; defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00c>; defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00d>; defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00e>; -defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x00f, "s_addk_co_i32">; -defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x010>; -defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>; -defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>; -defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>; + +//===----------------------------------------------------------------------===// +// SOPK - GFX6, GFX7, GFX10, GFX13. 
+//===----------------------------------------------------------------------===// + +multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> { + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx6_gfx7 : SOPK_Real64<op, ps>, + Select_gfx6_gfx7<ps.PseudoInstr>; +} + +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx13<bits<5> op> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx13<op>; + +multiclass SOPK_Real64_gfx6_gfx7_gfx10_gfx13<bits<5> op> : + SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>, SOPK_Real64_gfx13<op>; + +defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx13<0x012>; +defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx13<0x013>; +defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10_gfx13<0x015>; + +//===----------------------------------------------------------------------===// +// SOPK - GFX6, GFX7, GFX10, GFX11, GFX12, GFX13. +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<5> op> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, + SOPK_Real32_gfx12<op>, SOPK_Real32_gfx13<op>; + +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<bits<5> op, string gfx12_gfx13_name> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, + SOPK_Real32_gfx12<op, gfx12_gfx13_name>, SOPK_Real32_gfx13<op, gfx12_gfx13_name>; + +defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x000>; +defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x002>; +defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x00f, "s_addk_co_i32">; +defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x010>; + + +//===----------------------------------------------------------------------===// +// SOPP - GFX13 only +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx13<bits<7> op, string name = 
!cast<SOPP_Pseudo>(NAME).Mnemonic, bit compat_alias = 1> { + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx13 : SOPP_Real_32<op, ps, name>, + Select<GFX13Gen, ps.Mnemonic>, + SOPPRelaxTable<0, ps.KeyName, "_gfx13">; + if !and(compat_alias, !ne(ps.Mnemonic, name)) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>; +} + +multiclass SOPP_Real_64_gfx13<bits<7> op> { + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx13 : SOPP_Real_64<op, ps, ps.Mnemonic>, + Select<GFX13Gen, ps.Mnemonic>, + SOPPRelaxTable<1, ps.KeyName, "_gfx13">; +} + +defm S_WAKEUP_imm : SOPP_Real_32_gfx13<0x003>; +defm S_BARRIER_WAIT : SOPP_Real_32_gfx13<0x2b>; +defm S_MONITOR_SLEEP : SOPP_Real_32_gfx13<0x2c>; +defm S_DELAY_ALU : SOPP_Real_32_gfx13<0x2e>; +defm S_WAIT_EVENT : SOPP_Real_32_gfx13<0x2f>; +defm S_BARRIER_LEAVE_IMM : SOPP_Real_32_gfx13<0x31>; //===----------------------------------------------------------------------===// // SOPP - GFX12 only. @@ -2706,35 +2917,23 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> { Select<GFX12Gen, ps.PseudoInstr>; if !ne(ps.Mnemonic, name) then def : AMDGPUMnemonicAlias<ps.Mnemonic, name> { - let AssemblerPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Only; } } +multiclass SOPP_Real_64_gfx12<bits<7> op> { + def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, + Select<GFX12Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>, + SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">; +} + defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>; defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>; -defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>; -defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>; -defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>; defm S_WAIT_BVHCNT : SOPP_Real_32_gfx12<0x043>; -defm S_WAIT_EXPCNT : SOPP_Real_32_gfx12<0x044>; -defm S_WAIT_DSCNT : SOPP_Real_32_gfx12<0x046>; -defm S_WAIT_KMCNT : SOPP_Real_32_gfx12<0x047>; -defm S_WAIT_LOADCNT_DSCNT : 
SOPP_Real_32_gfx12<0x048>; -defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>; //===----------------------------------------------------------------------===// -// SOPP - GFX1250 only. +// SOPP - GFX11 only. //===----------------------------------------------------------------------===// -defm S_SET_VGPR_MSB : SOPP_Real_32_gfx12<0x006>; -defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>; -defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>; -defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12<0x04a>; -defm S_WAIT_TENSORCNT : SOPP_Real_32_gfx12<0x04b>; - -//===----------------------------------------------------------------------===// -// SOPP - GFX11, GFX12. -//===----------------------------------------------------------------------===// - multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> { defvar ps = !cast<SOPP_Pseudo>(NAME); @@ -2747,94 +2946,91 @@ multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> { } } -multiclass SOPP_Real_64_gfx12<bits<7> op> { - def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, - Select<GFX12Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>, - SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">; -} - multiclass SOPP_Real_64_gfx11<bits<7> op> { def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, Select<GFX11Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>, SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">; } -multiclass SOPP_Real_32_gfx11_gfx12<bits<7> op> : - SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>; - -multiclass SOPP_Real_32_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> : - SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op, gfx12_name>; - -multiclass SOPP_Real_With_Relaxation_gfx12<bits<7> op> { - defm "" : SOPP_Real_32_gfx12<op>; - let isCodeGenOnly = 1 in - defm _pad_s_nop : SOPP_Real_64_gfx12<op>; -} - multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> { defm "" : SOPP_Real_32_gfx11<op>; let 
isCodeGenOnly = 1 in defm _pad_s_nop : SOPP_Real_64_gfx11<op>; } -multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> : - SOPP_Real_With_Relaxation_gfx11<op>, SOPP_Real_With_Relaxation_gfx12<op>; - -defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>; -defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>; -defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>; defm S_INST_PREFETCH : SOPP_Real_32_gfx11<0x004, "s_set_inst_prefetch_distance">; -defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>; -defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>; -defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11_Renamed_gfx12<0x008, "s_wait_alu">; -defm S_WAITCNT : SOPP_Real_32_gfx11_gfx12<0x009>; -defm S_WAIT_IDLE : SOPP_Real_32_gfx11_gfx12<0x00a>; -defm S_WAIT_EVENT : SOPP_Real_32_gfx11_gfx12<0x00b>; -defm S_TRAP : SOPP_Real_32_gfx11_gfx12<0x010>; -defm S_ROUND_MODE : SOPP_Real_32_gfx11_gfx12<0x011>; -defm S_DENORM_MODE : SOPP_Real_32_gfx11_gfx12<0x012>; -defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11_gfx12<0x020>; -defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x021>; -defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x022>; -defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x023>; -defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x024>; -defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x025>; -defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x026>; defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx11<0x027>; defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx11<0x028>; defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx11<0x029>; defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx11<0x02a>; -defm S_ENDPGM : SOPP_Real_32_gfx11_gfx12<0x030>; -defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11_gfx12<0x031>; defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx11<0x032>; -defm S_WAKEUP : SOPP_Real_32_gfx11_gfx12<0x034>; -defm S_SETPRIO : SOPP_Real_32_gfx11_gfx12<0x035>; -defm S_SENDMSG : 
SOPP_Real_32_gfx11_gfx12<0x036>; -defm S_SENDMSGHALT : SOPP_Real_32_gfx11_gfx12<0x037>; -defm S_INCPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x038>; -defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>; -defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>; -defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>; -defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>; - defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>; //===----------------------------------------------------------------------===// -// SOPP - GFX1250. +// SOPP - GFX10 only. //===----------------------------------------------------------------------===// -defm S_MONITOR_SLEEP : SOPP_Real_32_gfx12<0x004>; +multiclass SOPP_Real_32_gfx10<bits<7> op> { + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx10 : SOPP_Real_32<op, ps>, + Select<GFX10Gen, ps.PseudoInstr>, + SOPPRelaxTable<0, ps.KeyName, "_gfx10">; +} + +defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>; //===----------------------------------------------------------------------===// -// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10 +// SOPP - GFX12, GFX13. 
//===----------------------------------------------------------------------===// -multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> { - defvar ps = !cast<SOPP_Pseudo>(NAME); - def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>, - Select_gfx6_gfx7<ps.PseudoInstr>, - SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">; -} +multiclass SOPP_Real_32_gfx12_gfx13<bits<7> op> : + SOPP_Real_32_gfx12<op>, SOPP_Real_32_gfx13<op>; + +defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12_gfx13<0x040>; +defm S_WAIT_STORECNT : SOPP_Real_32_gfx12_gfx13<0x041>; +defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12_gfx13<0x042>; +defm S_WAIT_EXPCNT : SOPP_Real_32_gfx12_gfx13<0x044>; +defm S_WAIT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x046>; +defm S_WAIT_KMCNT : SOPP_Real_32_gfx12_gfx13<0x047>; +defm S_WAIT_LOADCNT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x048>; +defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x049>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX1250 only. 
+//===----------------------------------------------------------------------===// + +defm S_MONITOR_SLEEP : SOPP_Real_32_gfx12<0x004>; +defm S_SET_VGPR_MSB : SOPP_Real_32_gfx12<0x006>; +defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX1250, GFX13 +//===----------------------------------------------------------------------===// + +defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12_gfx13<0x03e>; +defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12_gfx13<0x04a>; +defm S_WAIT_TENSORCNT : SOPP_Real_32_gfx12_gfx13<0x04b>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX10, GFX13 +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx10_gfx13<bits<7> op> : + SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx13<op>; + +multiclass SOPP_Real_32_gfx10_Renamed_gfx13<bits<7> op, string gfx13_name> : + SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx13<op, gfx13_name>; + +defm S_CLAUSE : SOPP_Real_32_gfx10_gfx13<0x021>; +defm S_WAIT_IDLE : SOPP_Real_32_gfx10_gfx13<0x022>; +defm S_ROUND_MODE : SOPP_Real_32_gfx10_gfx13<0x024>; +defm S_DENORM_MODE : SOPP_Real_32_gfx10_gfx13<0x025>; +defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10_gfx13<0x028>; +defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx10_Renamed_gfx13<0x023, "s_wait_alu">; + +//===----------------------------------------------------------------------===// +// SOPP - GFX8, GFX9. 
+//===----------------------------------------------------------------------===// multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> { defvar ps = !cast<SOPP_Pseudo>(NAME); @@ -2843,27 +3039,46 @@ multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> { SOPPRelaxTable<0, ps.KeyName, "_vi">; } -multiclass SOPP_Real_32_gfx10<bits<7> op> { +defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>; +defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10. +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> { defvar ps = !cast<SOPP_Pseudo>(NAME); - def _gfx10 : SOPP_Real_32<op, ps>, - Select<GFX10Gen, ps.PseudoInstr>, - SOPPRelaxTable<0, ps.KeyName, "_gfx10">; + def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>, + Select_gfx6_gfx7<ps.PseudoInstr>, + SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">; } -multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op> : - SOPP_Real_32_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>; - multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op> : SOPP_Real_32_gfx6_gfx7<op>, SOPP_Real_32_gfx8_gfx9<op>; multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>; -multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> : - SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>; +defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>; +defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, GFX12, GFX13. 
+//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<bits<7> op> : + SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11<op>, + SOPP_Real_32_gfx12<op>, SOPP_Real_32_gfx13<op>; + +defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x000>; -multiclass SOPP_Real_32_gfx10_gfx11_gfx12<bits<7> op> : - SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>; +//===----------------------------------------------------------------------===// +// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10, GFX13. +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> : + SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>, + SOPP_Real_32_gfx13<op>; //64 bit encodings, for Relaxation multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op> { @@ -2890,6 +3105,44 @@ multiclass SOPP_Real_64_gfx10<bits<7> op> { multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op> : SOPP_Real_64_gfx6_gfx7<op>, SOPP_Real_64_gfx8_gfx9<op>; +multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> : + SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_64_gfx10<op>, + SOPP_Real_64_gfx13<op>; + +multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> { + defm "" : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<op>; + let isCodeGenOnly = 1 in + defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<op>; +} + +defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x001>; +defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00d>; +defm S_SETKILL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00b>; +defm S_SLEEP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00e>; +defm S_SETPRIO : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00f>; +defm S_SENDMSG : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x010>; +defm S_SENDMSGHALT : 
SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x011>; +defm S_TRAP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x012>; +defm S_ICACHE_INV : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x013>; +defm S_INCPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x014>; +defm S_DECPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x015>; +defm S_TTRACEDATA : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x016>; +defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x01B>; + +let isBranch = 1 in { +defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x002>; +defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x004>; +defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x005>; +defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x006>; +defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x007>; +defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x008>; +defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x009>; +} + +//===----------------------------------------------------------------------===// +// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10. 
+//===----------------------------------------------------------------------===// + multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_64_gfx10<op>; @@ -2900,43 +3153,7 @@ multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> { defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>; } -defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x000>; -defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001>; -defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>; -defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>; -defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>; -defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00d>; -defm S_SETKILL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00b>; -defm S_SLEEP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00e>; -defm S_SETPRIO : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00f>; -defm S_SENDMSG : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x010>; -defm S_SENDMSGHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x011>; -defm S_TRAP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x012>; -defm S_ICACHE_INV : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x013>; -defm S_INCPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x014>; -defm S_DECPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x015>; -defm S_TTRACEDATA : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x016>; -defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>; -defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>; -defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>; -defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>; -defm S_CODE_END : SOPP_Real_32_gfx10_gfx11_gfx12<0x01f>; -defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>; -defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>; -defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>; -defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx10<0x023>; -defm S_ROUND_MODE : SOPP_Real_32_gfx10<0x024>; -defm S_DENORM_MODE 
: SOPP_Real_32_gfx10<0x025>; -defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10<0x028>; - let isBranch = 1 in { -defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>; -defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>; -defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>; -defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>; -defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>; -defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>; -defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>; defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>; defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>; defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>; @@ -2944,6 +3161,77 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_ } //===----------------------------------------------------------------------===// +// SOPP - GFX10, GFX11, GFX12, GFX13. +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx10_gfx11_gfx12_gfx13<bits<7> op> : + SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>, + SOPP_Real_32_gfx13<op>; + +defm S_CODE_END : SOPP_Real_32_gfx10_gfx11_gfx12_gfx13<0x01f>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX11, GFX12. 
+//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx11_gfx12<bits<7> op> : + SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>; + +multiclass SOPP_Real_32_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> : + SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op, gfx12_name>; + +multiclass SOPP_Real_With_Relaxation_gfx12<bits<7> op> { + defm "" : SOPP_Real_32_gfx12<op>; + let isCodeGenOnly = 1 in + defm _pad_s_nop : SOPP_Real_64_gfx12<op>; +} + +multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> : + SOPP_Real_With_Relaxation_gfx11<op>, SOPP_Real_With_Relaxation_gfx12<op>; + +defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>; +defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>; +defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>; +defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>; +defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>; +defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11_Renamed_gfx12<0x008, "s_wait_alu">; +defm S_WAITCNT : SOPP_Real_32_gfx11_gfx12<0x009>; +defm S_WAIT_IDLE : SOPP_Real_32_gfx11_gfx12<0x00a>; +defm S_WAIT_EVENT : SOPP_Real_32_gfx11_gfx12<0x00b>; +defm S_TRAP : SOPP_Real_32_gfx11_gfx12<0x010>; +defm S_ROUND_MODE : SOPP_Real_32_gfx11_gfx12<0x011>; +defm S_DENORM_MODE : SOPP_Real_32_gfx11_gfx12<0x012>; +defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11_gfx12<0x020>; +defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x021>; +defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x022>; +defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x023>; +defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x024>; +defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x025>; +defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x026>; +defm S_ENDPGM : SOPP_Real_32_gfx11_gfx12<0x030>; +defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11_gfx12<0x031>; +defm S_WAKEUP : SOPP_Real_32_gfx11_gfx12<0x034>; +defm S_SETPRIO : SOPP_Real_32_gfx11_gfx12<0x035>; +defm S_SENDMSG : 
SOPP_Real_32_gfx11_gfx12<0x036>; +defm S_SENDMSGHALT : SOPP_Real_32_gfx11_gfx12<0x037>; +defm S_INCPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x038>; +defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>; +defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>; +defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>; +defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX8, GFX9, GFX10. +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op> : + SOPP_Real_32_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>; + +defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>; +defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>; + + +//===----------------------------------------------------------------------===// // SOPC - GFX11, GFX12. //===----------------------------------------------------------------------===// @@ -2964,41 +3252,61 @@ defm S_CMP_EQ_U64 : SOPC_Real_gfx11_gfx12<0x10>; defm S_CMP_LG_U64 : SOPC_Real_gfx11_gfx12<0x11>; //===----------------------------------------------------------------------===// -// SOPC - GFX1150, GFX12 +// SOPC - GFX1150, GFX12, GFX13 //===----------------------------------------------------------------------===// -defm S_CMP_LT_F32 : SOPC_Real_gfx11_gfx12<0x41>; -defm S_CMP_EQ_F32 : SOPC_Real_gfx11_gfx12<0x42>; -defm S_CMP_LE_F32 : SOPC_Real_gfx11_gfx12<0x43>; -defm S_CMP_GT_F32 : SOPC_Real_gfx11_gfx12<0x44>; -defm S_CMP_LG_F32 : SOPC_Real_gfx11_gfx12<0x45>; -defm S_CMP_GE_F32 : SOPC_Real_gfx11_gfx12<0x46>; -defm S_CMP_O_F32 : SOPC_Real_gfx11_gfx12<0x47>; -defm S_CMP_U_F32 : SOPC_Real_gfx11_gfx12<0x48>; -defm S_CMP_NGE_F32 : SOPC_Real_gfx11_gfx12<0x49>; -defm S_CMP_NLG_F32 : SOPC_Real_gfx11_gfx12<0x4a>; -defm S_CMP_NGT_F32 : SOPC_Real_gfx11_gfx12<0x4b>; -defm S_CMP_NLE_F32 : SOPC_Real_gfx11_gfx12<0x4c>; -defm S_CMP_NEQ_F32 : SOPC_Real_gfx11_gfx12<0x4d>; -defm 
S_CMP_NLT_F32 : SOPC_Real_gfx11_gfx12<0x4e>; +multiclass SOPC_Real_gfx13<bits<7> op> { + def _gfx13 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>, + Select<GFX13Gen, !cast<SOPC_Pseudo>(NAME).Mnemonic>; +} -defm S_CMP_LT_F16 : SOPC_Real_gfx11_gfx12<0x51>; -defm S_CMP_EQ_F16 : SOPC_Real_gfx11_gfx12<0x52>; -defm S_CMP_LE_F16 : SOPC_Real_gfx11_gfx12<0x53>; -defm S_CMP_GT_F16 : SOPC_Real_gfx11_gfx12<0x54>; -defm S_CMP_LG_F16 : SOPC_Real_gfx11_gfx12<0x55>; -defm S_CMP_GE_F16 : SOPC_Real_gfx11_gfx12<0x56>; -defm S_CMP_O_F16 : SOPC_Real_gfx11_gfx12<0x57>; -defm S_CMP_U_F16 : SOPC_Real_gfx11_gfx12<0x58>; -defm S_CMP_NGE_F16 : SOPC_Real_gfx11_gfx12<0x59>; -defm S_CMP_NLG_F16 : SOPC_Real_gfx11_gfx12<0x5a>; -defm S_CMP_NGT_F16 : SOPC_Real_gfx11_gfx12<0x5b>; -defm S_CMP_NLE_F16 : SOPC_Real_gfx11_gfx12<0x5c>; -defm S_CMP_NEQ_F16 : SOPC_Real_gfx11_gfx12<0x5d>; -defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12<0x5e>; +multiclass SOPC_Real_gfx11_gfx12_gfx13<bits<7> op> : + SOPC_Real_gfx11<op>, SOPC_Real_gfx12<op>, SOPC_Real_gfx13<op>; + +defm S_CMP_LT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x41>; +defm S_CMP_EQ_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x42>; +defm S_CMP_LE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x43>; +defm S_CMP_GT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x44>; +defm S_CMP_LG_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x45>; +defm S_CMP_GE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x46>; +defm S_CMP_O_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x47>; +defm S_CMP_U_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x48>; +defm S_CMP_NGE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x49>; +defm S_CMP_NLG_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4a>; +defm S_CMP_NGT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4b>; +defm S_CMP_NLE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4c>; +defm S_CMP_NEQ_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4d>; +defm S_CMP_NLT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4e>; + +defm S_CMP_LT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x51>; +defm S_CMP_EQ_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x52>; +defm S_CMP_LE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x53>; +defm 
S_CMP_GT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x54>; +defm S_CMP_LG_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x55>; +defm S_CMP_GE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x56>; +defm S_CMP_O_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x57>; +defm S_CMP_U_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x58>; +defm S_CMP_NGE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x59>; +defm S_CMP_NLG_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5a>; +defm S_CMP_NGT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5b>; +defm S_CMP_NLE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5c>; +defm S_CMP_NEQ_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5d>; +defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5e>; //===----------------------------------------------------------------------===// -// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10 +// SOPC - GFX8, GFX9. +//===----------------------------------------------------------------------===// + +multiclass SOPC_Real_gfx8_gfx9<bits<7> op> { + defvar ps = !cast<SOPC_Pseudo>(NAME); + def _vi : SOPC_Real<op, ps>, + Select_vi<ps.PseudoInstr>; +} + +defm S_SET_GPR_IDX_ON : SOPC_Real_gfx8_gfx9<0x11>; + +//===----------------------------------------------------------------------===// +// SOPC - GFX6, GFX7, GFX8, GFX9. 
//===----------------------------------------------------------------------===// multiclass SOPC_Real_gfx6_gfx7<bits<7> op> { @@ -3007,11 +3315,14 @@ multiclass SOPC_Real_gfx6_gfx7<bits<7> op> { Select_gfx6_gfx7<ps.PseudoInstr>; } -multiclass SOPC_Real_gfx8_gfx9<bits<7> op> { - defvar ps = !cast<SOPC_Pseudo>(NAME); - def _vi : SOPC_Real<op, ps>, - Select_vi<ps.PseudoInstr>; -} +multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9<bits<7> op> : + SOPC_Real_gfx6_gfx7<op>, SOPC_Real_gfx8_gfx9<op>; + +defm S_SETVSKIP : SOPC_Real_gfx6_gfx7_gfx8_gfx9<0x10>; + +//===----------------------------------------------------------------------===// +// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10, GFX13 +//===----------------------------------------------------------------------===// multiclass SOPC_Real_gfx10<bits<7> op> { defvar ps = !cast<SOPC_Pseudo>(NAME); @@ -3019,36 +3330,36 @@ multiclass SOPC_Real_gfx10<bits<7> op> { Select<GFX10Gen, ps.PseudoInstr>; } -multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> : - SOPC_Real_gfx8_gfx9<op>, SOPC_Real_gfx10<op>; +multiclass SOPC_Real_gfx8_gfx9_gfx10_gfx13<bits<7> op> : + SOPC_Real_gfx8_gfx9<op>, SOPC_Real_gfx10<op>, SOPC_Real_gfx13<op>; -multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9<bits<7> op> : - SOPC_Real_gfx6_gfx7<op>, SOPC_Real_gfx8_gfx9<op>; +defm S_CMP_EQ_U64 : SOPC_Real_gfx8_gfx9_gfx10_gfx13<0x12>; +defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10_gfx13<0x13>; + +//===----------------------------------------------------------------------===// +// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, GFX12, GFX13 +//===----------------------------------------------------------------------===// -multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> : +multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<bits<7> op> : SOPC_Real_gfx6_gfx7_gfx8_gfx9<op>, SOPC_Real_gfx10<op>, SOPC_Real_gfx11<op>, - SOPC_Real_gfx12<op>; - -defm S_CMP_EQ_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x00>; -defm S_CMP_LG_I32 : 
SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x01>; -defm S_CMP_GT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x02>; -defm S_CMP_GE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x03>; -defm S_CMP_LT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x04>; -defm S_CMP_LE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x05>; -defm S_CMP_EQ_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x06>; -defm S_CMP_LG_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x07>; -defm S_CMP_GT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x08>; -defm S_CMP_GE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x09>; -defm S_CMP_LT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0a>; -defm S_CMP_LE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0b>; -defm S_BITCMP0_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0c>; -defm S_BITCMP1_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0d>; -defm S_BITCMP0_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0e>; -defm S_BITCMP1_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0f>; -defm S_SETVSKIP : SOPC_Real_gfx6_gfx7_gfx8_gfx9<0x10>; -defm S_SET_GPR_IDX_ON : SOPC_Real_gfx8_gfx9<0x11>; -defm S_CMP_EQ_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x12>; -defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x13>; + SOPC_Real_gfx12<op>, SOPC_Real_gfx13<op>; + +defm S_CMP_EQ_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x00>; +defm S_CMP_LG_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x01>; +defm S_CMP_GT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x02>; +defm S_CMP_GE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x03>; +defm S_CMP_LT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x04>; +defm S_CMP_LE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x05>; +defm S_CMP_EQ_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x06>; +defm S_CMP_LG_U32 : 
SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x07>; +defm S_CMP_GT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x08>; +defm S_CMP_GE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x09>; +defm S_CMP_LT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0a>; +defm S_CMP_LE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0b>; +defm S_BITCMP0_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0c>; +defm S_BITCMP1_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0d>; +defm S_BITCMP0_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0e>; +defm S_BITCMP1_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0f>; //===----------------------------------------------------------------------===// // GFX8 (VI), GFX9. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 6489e63..fddd9c7 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -99,7 +99,6 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10}, {{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus}, {{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10}, - {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250}, {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, {{"MSG_SYSMSG"}, ID_SYSMSG}, {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus}, @@ -111,7 +110,8 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_RTN_GET_TBA_TO_PC"}, ID_RTN_GET_TBA_TO_PC, isGFX11Plus}, {{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus}, {{"MSG_RTN_GET_CLUSTER_BARRIER_STATE"}, ID_RTN_GET_CLUSTER_BARRIER_STATE, - isGFX1250}, + isGFX1250Plus}, + {{"MSG_RTN_SAVE_WAVE_HAS_TDM"}, ID_RTN_SAVE_WAVE_HAS_TDM, isGFX1250Plus} }; static constexpr CustomOperand SysMsgOperands[] = { @@ -156,6 +156,26 @@ StringRef getMsgOpName(int64_t MsgId, 
uint64_t Encoding, } // namespace SendMsg +namespace WaitEvent { + +// clang-format off +static constexpr CustomOperand WaitEventOperands[] = { + {{"{ export_ready: 0 }"}, 0, isGFX12Plus}, + {{"{ dont_wait_export_ready: 0 }"}, 0, isGFX11}, + {{"{ dont_wait_export_ready: 1 }"}, DONT_WAIT_EXPORT_READY, isGFX11}, + {{"{ export_ready: 1 }"}, EXPORT_READY, isGFX12Plus} +}; +// clang-format on + +int64_t getWaitEventMask(StringRef Name, const MCSubtargetInfo &STI) { + return getEncodingFromOperandTable(WaitEventOperands, Name, STI); +} + +StringRef getWaitEventMaskName(uint64_t Encoding, const MCSubtargetInfo &STI) { + return getNameFromOperandTable(WaitEventOperands, Encoding, STI); +} +} // namespace WaitEvent + namespace Hwreg { // Disable lint checking for this block since it makes the table unreadable. @@ -211,8 +231,9 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, + {{"HW_REG_WAVE_SCHED_MODE"}, ID_SCHED_MODE, isGFX12Plus}, {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, - {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, + {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250Plus}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus}, {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, @@ -220,8 +241,8 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, {{"HW_REG_WAVE_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus}, {{"HW_REG_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus}, - {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250}, - {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250}, + {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250Plus}, + {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250Plus}, }; // 
clang-format on diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index c84c1a7..5916e27 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -84,6 +84,11 @@ StringRef getMsgOpName(int64_t MsgId, uint64_t Encoding, } // namespace SendMsg +namespace WaitEvent { +int64_t getWaitEventMask(StringRef Name, const MCSubtargetInfo &STI); +StringRef getWaitEventMaskName(uint64_t Encoding, const MCSubtargetInfo &STI); +} // namespace WaitEvent + namespace Hwreg { // Symbolic names for the hwreg(...) syntax. int64_t getHwregId(StringRef Name, const MCSubtargetInfo &STI); @@ -127,6 +132,20 @@ ArrayRef<GFXVersion> getGFXVersions(); } // namespace UCVersion +namespace WMMAMods { +// These should match enum values in SIDefines.h + +constexpr const char *const ModMatrixFmt[] = { + "MATRIX_FMT_FP8", "MATRIX_FMT_BF8", "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", + "MATRIX_FMT_FP4"}; + +constexpr const char *const ModMatrixScale[] = {"MATRIX_SCALE_ROW0", + "MATRIX_SCALE_ROW1"}; + +constexpr const char *const ModMatrixScaleFmt[] = { + "MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"}; +} // namespace WMMAMods + } // namespace AMDGPU } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3e1b058..3f32d11 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -177,7 +177,13 @@ inline unsigned getVaSsrcBitWidth() { return 1; } inline unsigned getVaSsrcBitShift() { return 8; } /// \returns HoldCnt bit shift -inline unsigned getHoldCntWidth() { return 1; } +inline unsigned getHoldCntWidth(unsigned VersionMajor, unsigned VersionMinor) { + static constexpr const unsigned MinMajor = 10; + static constexpr const unsigned MinMinor = 3; + return std::tie(VersionMajor, VersionMinor) >= std::tie(MinMajor, MinMinor) + ? 
1 + : 0; +} /// \returns HoldCnt bit shift inline unsigned getHoldCntBitShift() { return 7; } @@ -188,6 +194,10 @@ namespace llvm { namespace AMDGPU { +iota_range<InstCounterType> inst_counter_types(InstCounterType MaxCounter) { + return enum_seq(LOAD_CNT, MaxCounter); +} + /// \returns true if the target supports signed immediate offset for SMRD /// instructions. bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { @@ -349,8 +359,8 @@ unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, } struct MUBUFInfo { - uint16_t Opcode; - uint16_t BaseOpcode; + uint32_t Opcode; + uint32_t BaseOpcode; uint8_t elements; bool has_vaddr; bool has_srsrc; @@ -360,8 +370,8 @@ struct MUBUFInfo { }; struct MTBUFInfo { - uint16_t Opcode; - uint16_t BaseOpcode; + uint32_t Opcode; + uint32_t BaseOpcode; uint8_t elements; bool has_vaddr; bool has_srsrc; @@ -369,25 +379,25 @@ struct MTBUFInfo { }; struct SMInfo { - uint16_t Opcode; + uint32_t Opcode; bool IsBuffer; }; struct VOPInfo { - uint16_t Opcode; + uint32_t Opcode; bool IsSingle; }; struct VOPC64DPPInfo { - uint16_t Opcode; + uint32_t Opcode; }; struct VOPCDPPAsmOnlyInfo { - uint16_t Opcode; + uint32_t Opcode; }; struct VOP3CDPPAsmOnlyInfo { - uint16_t Opcode; + uint32_t Opcode; }; struct VOPDComponentInfo { @@ -398,7 +408,7 @@ struct VOPDComponentInfo { }; struct VOPDInfo { - uint16_t Opcode; + uint32_t Opcode; uint16_t OpX; uint16_t OpY; uint16_t Subtarget; @@ -406,7 +416,7 @@ struct VOPDInfo { }; struct VOPTrue16Info { - uint16_t Opcode; + uint32_t Opcode; bool IsTrue16; }; @@ -414,16 +424,18 @@ struct VOPTrue16Info { #define GET_FP4FP8DstByteSelTable_IMPL struct DPMACCInstructionInfo { - uint16_t Opcode; + uint32_t Opcode; bool IsDPMACCInstruction; }; struct FP4FP8DstByteSelInfo { - uint16_t Opcode; + uint32_t Opcode; bool HasFP8DstByteSel; bool HasFP4DstByteSel; }; +#define GET_DPMACCInstructionTable_DECL +#define GET_DPMACCInstructionTable_IMPL #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL 
#define GET_MUBUFInfoTable_DECL @@ -729,6 +741,8 @@ bool isGenericAtomic(unsigned Opc) { Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 || Opc == AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG; } @@ -780,6 +794,11 @@ FPType getFPDstSelType(unsigned Opc) { return FPType::None; } +bool isDPMACCInstruction(unsigned Opc) { + const DPMACCInstructionInfo *Info = getDPMACCInstructionHelper(Opc); + return Info && Info->IsDPMACCInstruction; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? Info->Opcode3Addr : ~0u; @@ -793,7 +812,7 @@ unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) { // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. 
-int getMCOpcode(uint16_t Opcode, unsigned Gen) { +int64_t getMCOpcode(uint32_t Opcode, unsigned Gen) { return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); } @@ -897,7 +916,7 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { } std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR, bool VOPD3) const { @@ -914,12 +933,13 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( BaseX = X; if (!BaseY) BaseY = Y; - if ((BaseX & BanksMask) == (BaseY & BanksMask)) + if ((BaseX.id() & BanksMask) == (BaseY.id() & BanksMask)) return true; if (BaseX != X /* This is 64-bit register */ && - ((BaseX + 1) & BanksMask) == (BaseY & BanksMask)) + ((BaseX.id() + 1) & BanksMask) == (BaseY.id() & BanksMask)) return true; - if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask)) + if (BaseY != Y && + (BaseX.id() & BanksMask) == ((BaseY.id() + 1) & BanksMask)) return true; // If both are 64-bit bank conflict will be detected yet while checking @@ -968,7 +988,7 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( // if the operand is not a register or not a VGPR. InstInfo::RegIndices InstInfo::getRegIndices(unsigned CompIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const { assert(CompIdx < COMPONENTS_NUM); @@ -983,7 +1003,7 @@ InstInfo::getRegIndices(unsigned CompIdx, Comp.hasRegSrcOperand(CompSrcIdx) ? 
GetRegIdx(CompIdx, Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3)) - : 0; + : MCRegister(); } return RegIndices; } @@ -1709,6 +1729,30 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) { return false; } +raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait) { + ListSeparator LS; + if (Wait.LoadCnt != ~0u) + OS << LS << "LoadCnt: " << Wait.LoadCnt; + if (Wait.ExpCnt != ~0u) + OS << LS << "ExpCnt: " << Wait.ExpCnt; + if (Wait.DsCnt != ~0u) + OS << LS << "DsCnt: " << Wait.DsCnt; + if (Wait.StoreCnt != ~0u) + OS << LS << "StoreCnt: " << Wait.StoreCnt; + if (Wait.SampleCnt != ~0u) + OS << LS << "SampleCnt: " << Wait.SampleCnt; + if (Wait.BvhCnt != ~0u) + OS << LS << "BvhCnt: " << Wait.BvhCnt; + if (Wait.KmCnt != ~0u) + OS << LS << "KmCnt: " << Wait.KmCnt; + if (Wait.XCnt != ~0u) + OS << LS << "XCnt: " << Wait.XCnt; + if (LS.unused()) + OS << "none"; + OS << '\n'; + return OS; +} + unsigned getVmcntBitMask(const IsaVersion &Version) { return (1 << (getVmcntBitWidthLo(Version.Major) + getVmcntBitWidthHi(Version.Major))) - @@ -1751,6 +1795,25 @@ unsigned getStorecntBitMask(const IsaVersion &Version) { return (1 << getStorecntBitWidth(Version.Major)) - 1; } +HardwareLimits::HardwareLimits(const IsaVersion &IV) { + bool HasExtendedWaitCounts = IV.Major >= 12; + if (HasExtendedWaitCounts) { + LoadcntMax = getLoadcntBitMask(IV); + DscntMax = getDscntBitMask(IV); + } else { + LoadcntMax = getVmcntBitMask(IV); + DscntMax = getLgkmcntBitMask(IV); + } + ExpcntMax = getExpcntBitMask(IV); + StorecntMax = getStorecntBitMask(IV); + SamplecntMax = getSamplecntBitMask(IV); + BvhcntMax = getBvhcntBitMask(IV); + KmcntMax = getKmcntBitMask(IV); + XcntMax = getXcntBitMask(IV); + VaVdstMax = DepCtr::getVaVdstBitMask(); + VmVsrcMax = DepCtr::getVmVsrcBitMask(); +} + unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major), getVmcntBitWidthLo(Version.Major)); @@ -2019,6 +2082,22 @@ int 
encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask, STI); } +unsigned getVaVdstBitMask() { return (1 << getVaVdstBitWidth()) - 1; } + +unsigned getVaSdstBitMask() { return (1 << getVaSdstBitWidth()) - 1; } + +unsigned getVaSsrcBitMask() { return (1 << getVaSsrcBitWidth()) - 1; } + +unsigned getHoldCntBitMask(const IsaVersion &Version) { + return (1 << getHoldCntWidth(Version.Major, Version.Minor)) - 1; +} + +unsigned getVmVsrcBitMask() { return (1 << getVmVsrcBitWidth()) - 1; } + +unsigned getVaVccBitMask() { return (1 << getVaVccBitWidth()) - 1; } + +unsigned getSaSdstBitMask() { return (1 << getSaSdstBitWidth()) - 1; } + unsigned decodeFieldVmVsrc(unsigned Encoded) { return unpackBits(Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); } @@ -2043,64 +2122,74 @@ unsigned decodeFieldVaSsrc(unsigned Encoded) { return unpackBits(Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth()); } -unsigned decodeFieldHoldCnt(unsigned Encoded) { - return unpackBits(Encoded, getHoldCntBitShift(), getHoldCntWidth()); +unsigned decodeFieldHoldCnt(unsigned Encoded, const IsaVersion &Version) { + return unpackBits(Encoded, getHoldCntBitShift(), + getHoldCntWidth(Version.Major, Version.Minor)); } unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) { return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); } -unsigned encodeFieldVmVsrc(unsigned VmVsrc) { - return encodeFieldVmVsrc(0xffff, VmVsrc); +unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVmVsrc(Encoded, VmVsrc); } unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) { return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth()); } -unsigned encodeFieldVaVdst(unsigned VaVdst) { - return encodeFieldVaVdst(0xffff, VaVdst); +unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return 
encodeFieldVaVdst(Encoded, VaVdst); } unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) { return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth()); } -unsigned encodeFieldSaSdst(unsigned SaSdst) { - return encodeFieldSaSdst(0xffff, SaSdst); +unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldSaSdst(Encoded, SaSdst); } unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) { return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth()); } -unsigned encodeFieldVaSdst(unsigned VaSdst) { - return encodeFieldVaSdst(0xffff, VaSdst); +unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaSdst(Encoded, VaSdst); } unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) { return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth()); } -unsigned encodeFieldVaVcc(unsigned VaVcc) { - return encodeFieldVaVcc(0xffff, VaVcc); +unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaVcc(Encoded, VaVcc); } unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) { return packBits(VaSsrc, Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth()); } -unsigned encodeFieldVaSsrc(unsigned VaSsrc) { - return encodeFieldVaSsrc(0xffff, VaSsrc); +unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaSsrc(Encoded, VaSsrc); } -unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) { - return packBits(HoldCnt, Encoded, getHoldCntBitShift(), getHoldCntWidth()); +unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt, + const IsaVersion &Version) { + return packBits(HoldCnt, Encoded, getHoldCntBitShift(), + getHoldCntWidth(Version.Major, 
Version.Minor)); } -unsigned encodeFieldHoldCnt(unsigned HoldCnt) { - return encodeFieldHoldCnt(0xffff, HoldCnt); +unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldHoldCnt(Encoded, HoldCnt, getIsaVersion(STI.getCPU())); } } // namespace DepCtr @@ -2450,7 +2539,7 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) { } unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { - if (isGFX1250(STI)) + if (isGFX1250Plus(STI)) return 32; return 16; } @@ -2517,14 +2606,26 @@ bool isGFX12(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX12]; } -bool isGFX12Plus(const MCSubtargetInfo &STI) { return isGFX12(STI); } +bool isGFX12Plus(const MCSubtargetInfo &STI) { + return isGFX12(STI) || isGFX13Plus(STI); +} bool isNotGFX12Plus(const MCSubtargetInfo &STI) { return !isGFX12Plus(STI); } bool isGFX1250(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts] && !isGFX13(STI); +} + +bool isGFX1250Plus(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts]; } +bool isGFX13(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX13]; +} + +bool isGFX13Plus(const MCSubtargetInfo &STI) { return isGFX13(STI); } + bool supportsWGP(const MCSubtargetInfo &STI) { if (isGFX1250(STI)) return false; @@ -2578,7 +2679,7 @@ bool hasMAIInsts(const MCSubtargetInfo &STI) { } bool hasVOPD(const MCSubtargetInfo &STI) { - return STI.hasFeature(AMDGPU::FeatureVOPD); + return STI.hasFeature(AMDGPU::FeatureVOPDInsts); } bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) { @@ -2697,8 +2798,8 @@ MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) { MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG } -bool isInlineValue(unsigned Reg) { - switch (Reg) { +bool isInlineValue(MCRegister Reg) { + switch (Reg.id()) { case AMDGPU::SRC_SHARED_BASE_LO: case 
AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_SHARED_LIMIT_LO: @@ -2743,6 +2844,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: @@ -3104,6 +3206,34 @@ std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) { return getInlineEncodingV216(true, Literal); } +// Encoding of the literal as an inline constant for V_PK_FMAC_F16 instruction +// or nullopt. This accounts for different inline constant behavior: +// - Pre-GFX11: fp16 inline constants have the value in low 16 bits, 0 in high +// - GFX11+: fp16 inline constants are duplicated into both halves +std::optional<unsigned> getPKFMACF16InlineEncoding(uint32_t Literal, + bool IsGFX11Plus) { + // Pre-GFX11 behavior: f16 in low bits, 0 in high bits + if (!IsGFX11Plus) + return getInlineEncodingV216(/*IsFloat=*/true, Literal); + + // GFX11+ behavior: f16 duplicated in both halves + // First, check for sign-extended integer inline constants (-16 to 64) + // These work the same across all generations + int32_t Signed = static_cast<int32_t>(Literal); + if (Signed >= 0 && Signed <= 64) + return 128 + Signed; + + if (Signed >= -16 && Signed <= -1) + return 192 + std::abs(Signed); + + // For float inline constants on GFX11+, both halves must be equal + uint16_t Lo = static_cast<uint16_t>(Literal); + uint16_t Hi = static_cast<uint16_t>(Literal >> 16); + if (Lo != Hi) + return std::nullopt; + return getInlineEncodingV216(/*IsFloat=*/true, Lo); +} + // Whether the given literal can be inlined for a V_PK_* instruction. 
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { switch (OpType) { @@ -3113,6 +3243,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return getInlineEncodingV216(true, Literal).has_value(); + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: + llvm_unreachable("OPERAND_REG_IMM_V2FP16_SPLAT is not supported"); case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return isInlinableLiteralV2BF16(Literal); @@ -3138,6 +3270,11 @@ bool isInlinableLiteralV2F16(uint32_t Literal) { return getInlineEncodingV2F16(Literal).has_value(); } +// Whether the given literal can be inlined for V_PK_FMAC_F16 instruction. +bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus) { + return getPKFMACF16InlineEncoding(Literal, IsGFX11Plus).has_value(); +} + bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { if (IsFP64) return !Lo_32(Val); @@ -3159,6 +3296,7 @@ int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit) { case OPERAND_REG_IMM_INT32: case OPERAND_REG_IMM_V2BF16: case OPERAND_REG_IMM_V2FP16: + case OPERAND_REG_IMM_V2FP16_SPLAT: case OPERAND_REG_IMM_V2FP32: case OPERAND_REG_IMM_V2INT16: case OPERAND_REG_IMM_V2INT32: @@ -3361,7 +3499,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI) { const unsigned VGPRClasses[] = { AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID, @@ -3382,22 +3520,22 @@ const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, return nullptr; } -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) { +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; return Idx >> 
8; } -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI) { +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx >= 0x100) - return AMDGPU::NoRegister; + return MCRegister(); const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); if (!RC) - return AMDGPU::NoRegister; + return MCRegister(); Idx |= MSBs << 8; if (RC->getID() == AMDGPU::VGPR_16RegClassID) { @@ -3438,17 +3576,42 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) { AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y, AMDGPU::OpName::vdstY}; + // VOP2 MADMK instructions use src0, imm, src1 scheme. + static const AMDGPU::OpName VOP2MADMKOps[4] = { + AMDGPU::OpName::src0, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::src1, AMDGPU::OpName::vdst}; + static const AMDGPU::OpName VOPDFMAMKOpsX[4] = { + AMDGPU::OpName::src0X, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vdstX}; + static const AMDGPU::OpName VOPDFMAMKOpsY[4] = { + AMDGPU::OpName::src0Y, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vdstY}; + unsigned TSFlags = Desc.TSFlags; if (TSFlags & (SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) { + switch (Desc.getOpcode()) { // LD_SCALE operands ignore MSB. 
- if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 || - Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 || - Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 || - Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250) + case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32: + case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250: + case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64: + case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250: return {}; + case AMDGPU::V_FMAMK_F16: + case AMDGPU::V_FMAMK_F16_t16: + case AMDGPU::V_FMAMK_F16_t16_gfx12: + case AMDGPU::V_FMAMK_F16_fake16: + case AMDGPU::V_FMAMK_F16_fake16_gfx12: + case AMDGPU::V_FMAMK_F32: + case AMDGPU::V_FMAMK_F32_gfx12: + case AMDGPU::V_FMAMK_F64: + case AMDGPU::V_FMAMK_F64_gfx1250: + return {VOP2MADMKOps, nullptr}; + default: + break; + } return {VOPOps, nullptr}; } @@ -3464,8 +3627,11 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) { if (TSFlags & SIInstrFlags::VIMAGE) return {VIMGOps, nullptr}; - if (AMDGPU::isVOPD(Desc.getOpcode())) - return {VOPDOpsX, VOPDOpsY}; + if (AMDGPU::isVOPD(Desc.getOpcode())) { + auto [OpX, OpY] = getVOPDComponents(Desc.getOpcode()); + return {(OpX == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsX : VOPDOpsX, + (OpY == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsY : VOPDOpsY}; + } assert(!(TSFlags & SIInstrFlags::MIMG)); @@ -3545,8 +3711,15 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 
256 - : 128; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize32768)) + return 64; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize65536)) + return 128; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) + return 320; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) + return 512; + return 64; // In sync with getAddressableLocalMemorySize } bool isPackedFP32Inst(unsigned Opc) { @@ -3599,9 +3772,9 @@ ClusterDimsAttr ClusterDimsAttr::get(const Function &F) { if (!Attr.has_value()) AttrKind = Kind::Unknown; - else if (all_of(*Attr, [](unsigned V) { return V == EncoNoCluster; })) + else if (all_of(*Attr, equal_to(EncoNoCluster))) AttrKind = Kind::NoCluster; - else if (all_of(*Attr, [](unsigned V) { return V == EncoVariableDims; })) + else if (all_of(*Attr, equal_to(EncoVariableDims))) AttrKind = Kind::VariableDims; ClusterDimsAttr A(AttrKind); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 5e3195b..7500c24 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -98,7 +98,7 @@ struct GcnBufferFormatInfo { }; struct MAIInstInfo { - uint16_t Opcode; + uint32_t Opcode; bool is_dgemm; bool is_gfx940_xdl; }; @@ -121,7 +121,7 @@ struct True16D16Info { }; struct WMMAInstInfo { - uint16_t Opcode; + uint32_t Opcode; bool is_wmma_xdl; }; @@ -416,7 +416,7 @@ inline bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx) { } LLVM_READONLY -int getSOPPWithRelaxation(uint16_t Opcode); +int64_t getSOPPWithRelaxation(uint32_t Opcode); struct MIMGBaseOpcodeInfo { MIMGBaseOpcode BaseOpcode; @@ -522,8 +522,8 @@ unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, bool IsG16Supported); struct MIMGInfo { - uint16_t Opcode; - uint16_t BaseOpcode; + uint32_t Opcode; + uint32_t BaseOpcode; uint8_t MIMGEncoding; uint8_t VDataDwords; uint8_t VAddrDwords; @@ -646,7 +646,7 @@ const 
GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, const MCSubtargetInfo &STI); LLVM_READONLY -int getMCOpcode(uint16_t Opcode, unsigned Gen); +int64_t getMCOpcode(uint32_t Opcode, unsigned Gen); LLVM_READONLY unsigned getVOPDOpcode(unsigned Opc, bool VOPD3); @@ -909,7 +909,7 @@ private: const ComponentInfo CompInfo[COMPONENTS_NUM]; public: - using RegIndices = std::array<unsigned, Component::MAX_OPR_NUM>; + using RegIndices = std::array<MCRegister, Component::MAX_OPR_NUM>; InstInfo(const MCInstrDesc &OpX, const MCInstrDesc &OpY) : CompInfo{OpX, OpY} {} @@ -932,9 +932,10 @@ public: // even though it violates requirement to be from different banks. // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. - bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx, - const MCRegisterInfo &MRI, bool SkipSrc = false, - bool AllowSameVGPR = false, bool VOPD3 = false) const { + bool + hasInvalidOperand(std::function<MCRegister(unsigned, unsigned)> GetRegIdx, + const MCRegisterInfo &MRI, bool SkipSrc = false, + bool AllowSameVGPR = false, bool VOPD3 = false) const { return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR, VOPD3) .has_value(); @@ -949,14 +950,14 @@ public: // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. 
std::optional<unsigned> getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc = false, bool AllowSameVGPR = false, bool VOPD3 = false) const; private: RegIndices getRegIndices(unsigned ComponentIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const; }; @@ -1075,6 +1076,37 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size); /// Checks if \p Val is inside \p MD, a !range-like metadata. bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val); +enum InstCounterType { + LOAD_CNT = 0, // VMcnt prior to gfx12. + DS_CNT, // LKGMcnt prior to gfx12. + EXP_CNT, // + STORE_CNT, // VScnt in gfx10/gfx11. + NUM_NORMAL_INST_CNTS, + SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. + BVH_CNT, // gfx12+ only. + KM_CNT, // gfx12+ only. + X_CNT, // gfx1250. + NUM_EXTENDED_INST_CNTS, + VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only. + VM_VSRC, // gfx12+ expert mode only. + NUM_EXPERT_INST_CNTS, + NUM_INST_CNTS = NUM_EXPERT_INST_CNTS +}; + +// Return an iterator over all counters between LOAD_CNT (the first counter) +// and \c MaxCounter (exclusive, default value yields an enumeration over +// all counters). +iota_range<InstCounterType> +inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS); + +} // namespace AMDGPU + +template <> struct enum_iteration_traits<AMDGPU::InstCounterType> { + static constexpr bool is_iterable = true; +}; + +namespace AMDGPU { + /// Represents the counter values to wait for in an s_waitcnt instruction. /// /// Large values (including the maximum possible integer) can be used to @@ -1088,6 +1120,71 @@ struct Waitcnt { unsigned BvhCnt = ~0u; // gfx12+ only. unsigned KmCnt = ~0u; // gfx12+ only. unsigned XCnt = ~0u; // gfx1250. + unsigned VaVdst = ~0u; // gfx12+ expert scheduling mode only. 
+ unsigned VmVsrc = ~0u; // gfx12+ expert scheduling mode only. + + unsigned get(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return LoadCnt; + case EXP_CNT: + return ExpCnt; + case DS_CNT: + return DsCnt; + case STORE_CNT: + return StoreCnt; + case SAMPLE_CNT: + return SampleCnt; + case BVH_CNT: + return BvhCnt; + case KM_CNT: + return KmCnt; + case X_CNT: + return XCnt; + case VA_VDST: + return VaVdst; + case VM_VSRC: + return VmVsrc; + default: + llvm_unreachable("bad InstCounterType"); + } + } + void set(InstCounterType T, unsigned Val) { + switch (T) { + case LOAD_CNT: + LoadCnt = Val; + break; + case EXP_CNT: + ExpCnt = Val; + break; + case DS_CNT: + DsCnt = Val; + break; + case STORE_CNT: + StoreCnt = Val; + break; + case SAMPLE_CNT: + SampleCnt = Val; + break; + case BVH_CNT: + BvhCnt = Val; + break; + case KM_CNT: + KmCnt = Val; + break; + case X_CNT: + XCnt = Val; + break; + case VA_VDST: + VaVdst = Val; + break; + case VM_VSRC: + VmVsrc = Val; + break; + default: + llvm_unreachable("bad InstCounterType"); + } + } Waitcnt() = default; // Pre-gfx12 constructor. @@ -1096,19 +1193,24 @@ struct Waitcnt { // gfx12+ constructor. 
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, - unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt) + unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt, + unsigned VaVdst, unsigned VmVsrc) : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), - SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {} + SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt), + VaVdst(VaVdst), VmVsrc(VmVsrc) {} bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } bool hasWaitExceptStoreCnt() const { return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u || - SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u; + SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u || + VaVdst != ~0u || VmVsrc != ~0u; } bool hasWaitStoreCnt() const { return StoreCnt != ~0u; } + bool hasWaitDepctr() const { return VaVdst != ~0u || VmVsrc != ~0u; } + Waitcnt combined(const Waitcnt &Other) const { // Does the right thing provided self and Other are either both pre-gfx12 // or both gfx12+. @@ -1116,8 +1218,30 @@ struct Waitcnt { std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt), std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt), std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt), - std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt)); + std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt), + std::min(VaVdst, Other.VaVdst), std::min(VmVsrc, Other.VmVsrc)); } + + friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait); +}; + +/// Represents the hardware counter limits for different wait count types. +struct HardwareLimits { + unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12. + unsigned ExpcntMax; + unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. + unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. + unsigned SamplecntMax; // gfx12+ only. 
+ unsigned BvhcntMax; // gfx12+ only. + unsigned KmcntMax; // gfx12+ only. + unsigned XcntMax; // gfx1250. + unsigned VaVdstMax; // gfx12+ expert mode only. + unsigned VmVsrcMax; // gfx12+ expert mode only. + + HardwareLimits() = default; + + /// Initializes hardware limits from ISA version. + HardwareLimits(const IsaVersion &IV); }; // The following methods are only meaningful on targets that support @@ -1278,6 +1402,27 @@ bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal, bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val, bool &IsDefault, const MCSubtargetInfo &STI); +/// \returns Maximum VaVdst value that can be encoded. +unsigned getVaVdstBitMask(); + +/// \returns Maximum VaSdst value that can be encoded. +unsigned getVaSdstBitMask(); + +/// \returns Maximum VaSsrc value that can be encoded. +unsigned getVaSsrcBitMask(); + +/// \returns Maximum HoldCnt value that can be encoded. +unsigned getHoldCntBitMask(const IsaVersion &Version); + +/// \returns Maximum VmVsrc value that can be encoded. +unsigned getVmVsrcBitMask(); + +/// \returns Maximum VaVcc value that can be encoded. +unsigned getVaVccBitMask(); + +/// \returns Maximum SaSdst value that can be encoded. +unsigned getSaSdstBitMask(); + /// \returns Decoded VaVdst from given immediate \p Encoded. unsigned decodeFieldVaVdst(unsigned Encoded); @@ -1297,46 +1442,47 @@ unsigned decodeFieldVaVcc(unsigned Encoded); unsigned decodeFieldVaSsrc(unsigned Encoded); /// \returns Decoded HoldCnt from given immediate \p Encoded. -unsigned decodeFieldHoldCnt(unsigned Encoded); +unsigned decodeFieldHoldCnt(unsigned Encoded, const IsaVersion &Version); /// \returns \p VmVsrc as an encoded Depctr immediate. -unsigned encodeFieldVmVsrc(unsigned VmVsrc); +unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VmVsrc. 
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc); /// \returns \p VaVdst as an encoded Depctr immediate. -unsigned encodeFieldVaVdst(unsigned VaVdst); +unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VaVdst. unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst); /// \returns \p SaSdst as an encoded Depctr immediate. -unsigned encodeFieldSaSdst(unsigned SaSdst); +unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p SaSdst. unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst); /// \returns \p VaSdst as an encoded Depctr immediate. -unsigned encodeFieldVaSdst(unsigned VaSdst); +unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VaSdst. unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst); /// \returns \p VaVcc as an encoded Depctr immediate. -unsigned encodeFieldVaVcc(unsigned VaVcc); +unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VaVcc. unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc); /// \returns \p HoldCnt as an encoded Depctr immediate. -unsigned encodeFieldHoldCnt(unsigned HoldCnt); +unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p HoldCnt. -unsigned encodeFieldHoldCnt(unsigned HoldCnt, unsigned Encoded); +unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt, + const IsaVersion &Version); /// \returns \p VaSsrc as an encoded Depctr immediate. -unsigned encodeFieldVaSsrc(unsigned VaSsrc); +unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VaSsrc. 
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc); @@ -1513,6 +1659,8 @@ constexpr inline bool isKernel(CallingConv::ID CC) { } } +inline bool isKernel(const Function &F) { return isKernel(F.getCallingConv()); } + LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC) { return CC == CallingConv::Fast; @@ -1561,6 +1709,9 @@ bool isGFX11Plus(const MCSubtargetInfo &STI); bool isGFX12(const MCSubtargetInfo &STI); bool isGFX12Plus(const MCSubtargetInfo &STI); bool isGFX1250(const MCSubtargetInfo &STI); +bool isGFX1250Plus(const MCSubtargetInfo &STI); +bool isGFX13(const MCSubtargetInfo &STI); +bool isGFX13Plus(const MCSubtargetInfo &STI); bool supportsWGP(const MCSubtargetInfo &STI); bool isNotGFX12Plus(const MCSubtargetInfo &STI); bool isNotGFX11Plus(const MCSubtargetInfo &STI); @@ -1599,7 +1750,7 @@ LLVM_READNONE MCRegister mc2PseudoReg(MCRegister Reg); LLVM_READNONE -bool isInlineValue(unsigned Reg); +bool isInlineValue(MCRegister Reg); /// Is this an AMDGPU specific source operand? These include registers, /// inline constants, literals and mandatory literals (KImm). 
@@ -1663,6 +1814,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: return 2; @@ -1709,6 +1861,10 @@ LLVM_READNONE std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal); LLVM_READNONE +std::optional<unsigned> getPKFMACF16InlineEncoding(uint32_t Literal, + bool IsGFX11Plus); + +LLVM_READNONE bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType); LLVM_READNONE @@ -1721,6 +1877,9 @@ LLVM_READNONE bool isInlinableLiteralV2F16(uint32_t Literal); LLVM_READNONE +bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus); + +LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); LLVM_READNONE @@ -1798,16 +1957,16 @@ bool isIntrinsicAlwaysUniform(unsigned IntrID); /// \returns a register class for the physical register \p Reg if it is a VGPR /// or nullptr otherwise. -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI); /// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the /// physical register \p Reg. -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI); +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI); /// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set. -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI); +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI); // Returns a table for the opcode with a given \p Desc to map the VGPR MSB // set by the S_SET_VGPR_MSB to one of 4 sources. 
In case of VOPD returns 2 @@ -1867,7 +2026,7 @@ private: Kind AttrKind = Kind::Unknown; }; -} // end namespace AMDGPU +} // namespace AMDGPU raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::IsaInfo::TargetIDSetting S); diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td index 5e89e34..75437cf 100644 --- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td @@ -10,7 +10,7 @@ // VINTERP encoding //===----------------------------------------------------------------------===// -class VINTERPe <VOPProfile P> : Enc64 { +class VINTERPe : Enc64 { bits<11> vdst; bits<4> src0_modifiers; bits<11> src0; @@ -27,10 +27,10 @@ class VINTERPe <VOPProfile P> : Enc64 { let Inst{7-0} = vdst{7-0}; let Inst{10-8} = waitexp; // Fields for hi/lo 16-bits of register selection - let Inst{11} = !if(P.HasSrc0, src0_modifiers{2}, 0); - let Inst{12} = !if(P.HasSrc1, src1_modifiers{2}, 0); - let Inst{13} = !if(P.HasSrc2, src2_modifiers{2}, 0); - let Inst{14} = !if(P.HasDst, src0_modifiers{3}, 0); + let Inst{11} = src0_modifiers{2}; + let Inst{12} = src1_modifiers{2}; + let Inst{13} = src2_modifiers{2}; + let Inst{14} = src0_modifiers{3}; let Inst{15} = clamp; let Inst{40-32} = src0{8-0}; let Inst{49-41} = src1{8-0}; @@ -40,11 +40,11 @@ class VINTERPe <VOPProfile P> : Enc64 { let Inst{63} = src2_modifiers{0}; // neg(2) } -class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : VINTERPe<P> { +class VINTERPe_gfx11 <bits<7> op> : VINTERPe { let Inst{22-16} = op; } -class VINTERPe_gfx12 <bits<7> op, VOPProfile P> : VINTERPe<P> { +class VINTERPe_gfx12 <bits<7> op> : VINTERPe { let Inst{20-16} = op{4-0}; } @@ -243,7 +243,7 @@ multiclass VINTERP_Real_gfx11 <bits<7> op, string asmName> { !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { def _gfx11 : VINTERP_Real<ps, SIEncodingFamily.GFX11, asmName>, - VINTERPe_gfx11<op, ps.Pfl>; + VINTERPe_gfx11<op>; } } @@ -253,7 +253,7 @@ multiclass VINTERP_Real_gfx12 
<bits<7> op, string asmName> { !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { def _gfx12 : VINTERP_Real<ps, SIEncodingFamily.GFX12, asmName>, - VINTERPe_gfx12<op, ps.Pfl>; + VINTERPe_gfx12<op>; } } diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 54f57e0..56e7623 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -263,16 +263,19 @@ let HasOMod = 0, HasClamp = 0 in { let isReMaterializable = 1 in { let SchedRW = [WriteDoubleCvt] in { // OMod clears exceptions when set in this instruction +let IsDPMACCInstruction = 1 in defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>; let mayRaiseFPException = 0 in { defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; } +let IsDPMACCInstruction = 1 in { defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; -defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>; // OMod clears exceptions when set in this instruction defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>; +} // IsDPMACCInstruction = 1 +defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>; let mayRaiseFPException = 0 in { defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; @@ -349,11 +352,11 @@ defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, int_amdgcn_sqrt>; } // End TRANS = 1, SchedRW = [WriteTrans32] -let TRANS = 1, SchedRW = [WriteTrans64] in { +let TRANS = 1, SchedRW = [WriteTrans64], IsDPMACCInstruction = 1 in { defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64_NO_DPP, AMDGPUrcp>; defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64_NO_DPP, AMDGPUrsq>; defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64_NO_DPP, int_amdgcn_sqrt>; -} // End TRANS = 1, SchedRW = [WriteTrans64] +} // End TRANS = 1, SchedRW = 
[WriteTrans64], IsDPMACCInstruction = 1 let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; @@ -369,18 +372,45 @@ defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; -let SchedRW = [WriteDoubleAdd] in { +let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in { defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>; let FPDPRounding = 1 in { defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; } // End FPDPRounding = 1 -} // End SchedRW = [WriteDoubleAdd] +} // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>; } // End isReMaterializable = 1 +// These i32 conversions naturally saturate. 
+def : GCNPat<(i32 (fp_to_uint_sat (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), i32)), + (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_sint_sat (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), i32)), + (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_uint_sat (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)), i32)), + (V_CVT_U32_F64_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_sint_sat (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)), i32)), + (V_CVT_I32_F64_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_uint_sat f32:$src0, i32)), (V_CVT_U32_F32_e32 (f32 $src0))>; +def : GCNPat<(i32 (fp_to_sint_sat f32:$src0, i32)), (V_CVT_I32_F32_e32 (f32 $src0))>; +def : GCNPat<(i32 (fp_to_uint_sat f64:$src0, i32)), (V_CVT_U32_F64_e32 (f64 $src0))>; +def : GCNPat<(i32 (fp_to_sint_sat f64:$src0, i32)), (V_CVT_I32_F64_e32 (f64 $src0))>; + +def : GCNPat<(i32 (fp_to_uint_sat_gi (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_sint_sat_gi (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_uint_sat_gi (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)))), + (V_CVT_U32_F64_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_sint_sat_gi (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)))), + (V_CVT_I32_F64_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_uint_sat_gi f32:$src0)), (V_CVT_U32_F32_e32 (f32 $src0))>; +def : GCNPat<(i32 (fp_to_sint_sat_gi f32:$src0)), (V_CVT_I32_F32_e32 (f32 $src0))>; +def : GCNPat<(i32 (fp_to_uint_sat_gi f64:$src0)), (V_CVT_U32_F64_e32 (f64 $src0))>; +def : GCNPat<(i32 (fp_to_sint_sat_gi f64:$src0)), (V_CVT_I32_F64_e32 (f64 $src0))>; + defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>; // Restrict src0 to be VGPR @@ -493,12 +523,12 @@ let SubtargetPredicate = isGFX7GFX8GFX9 in { } // End SubtargetPredicate = isGFX7GFX8GFX9 let 
SubtargetPredicate = isGFX7Plus in { - let SchedRW = [WriteDoubleAdd] in { + let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in { defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>; defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>; defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, froundeven>; defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>; - } // End SchedRW = [WriteDoubleAdd] + } // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 } // End SubtargetPredicate = isGFX7Plus } // End isReMaterializable = 1 @@ -513,6 +543,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16", defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>; + +let HasClamp = 0, HasOMod = 0 in { +def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>; +def V_TRANS_BF16_t16_Profile : VOPProfile_True16 <VOP_BF16_BF16>; +def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 <VOP_BF16_BF16>; +} + let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; @@ -527,14 +564,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; } let SubtargetPredicate = HasBF16TransInsts in { -defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; -defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; -defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; -defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; -defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; -defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; -defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; -defm V_COS_BF16 : VOP1Inst_t16 
<"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -593,15 +646,15 @@ let SubtargetPredicate = isGFX9Plus in { let isReMaterializable = 1 in defm V_SAT_PK_U8_I16 : VOP1Inst_t16<"v_sat_pk_u8_i16", VOP_I16_I32>; - - let mayRaiseFPException = 0 in { - defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16", - VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; - defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16", - VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; - } // End mayRaiseFPException 
= 0 } // End SubtargetPredicate = isGFX9Plus +let mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts in { +defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16", + VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; +defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16", + VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; +} // End mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts + let SubtargetPredicate = isGFX9Only in { defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; } // End SubtargetPredicate = isGFX9Only @@ -644,7 +697,7 @@ let OtherPredicates = [HasCvtFP8VOP1Bug] in { (V_CVT_F32_BF8_sdwa 0, $src, 0, 0, 0)>; } -let OtherPredicates = [HasNoCvtFP8VOP1Bug, HasSDWA] in { // FIXME: HasSDWA is a substitute for !gfx12 +let OtherPredicates = [NotHasCvtFP8VOP1Bug, HasSDWA] in { // FIXME: HasSDWA is a substitute for !gfx12 def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_e32 $src)>; def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)), @@ -707,9 +760,9 @@ def V_CVT_F16_F8_True16_Profile : VOP3_Profile_True16<V_CVT_F16_F8_Profile>; def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>; } -let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts], +let SubtargetPredicate = isGFX11Plus, OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { - let SubtargetPredicate = isGFX12PlusNot12_50 in + let SubtargetPredicate = isGFX11PlusNot12_50 in defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>; let SubtargetPredicate = isGFX125xOnly in defm V_CVT_F32_FP8_gfx1250 : VOP1Inst<"v_cvt_f32_fp8_gfx1250", VOPProfile_Base_CVT_F_F8_ByteSel<f32, 1>>; @@ -733,7 +786,7 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit 
HasOpSe >; let OtherPredicates = [HasFP8ConversionInsts] in { - let SubtargetPredicate = isGFX12PlusNot12_50 in + let SubtargetPredicate = isGFX11PlusNot12_50 in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>; let SubtargetPredicate = isGFX125xOnly in { def : GCNPat<(int_amdgcn_cvt_f32_fp8 i32:$src0, timm:$byte_sel), @@ -741,7 +794,7 @@ let OtherPredicates = [HasFP8ConversionInsts] in { def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel), (V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>; } - let SubtargetPredicate = isGFX12Plus in + let SubtargetPredicate = isGFX11Plus in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>; } @@ -753,7 +806,7 @@ class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index, (inst_e32 $src)) >; -let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in { +let SubtargetPredicate = isGFX11Plus, OtherPredicates = [HasFP8ConversionInsts] in { foreach Index = [0, -1] in { def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_fp8, Index, V_CVT_PK_F32_FP8_fake16_e32, V_CVT_PK_F32_FP8_fake16_e64>; @@ -839,7 +892,7 @@ let SubtargetPredicate = isGFX11Plus in { // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, [], /*VOP1Only=*/ 1>; - let isAsCheapAsAMove = 1 in + let isAsCheapAsAMove = 1, isMoveImm = 1 in defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; @@ -927,7 +980,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf } //===----------------------------------------------------------------------===// -// GFX11, GFX12 +// GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass VOP1Only_Real<GFXGen Gen, bits<9> op> { @@ -1001,10 +1054,19 @@ multiclass 
VOP1_Realtriple_e64_with_name<GFXGen Gen, bits<9> op, string opName, asmName>; } +multiclass VOP1_Realtriple_e64_with_name_gfx12_gfx13< + bits<9> op, string opName, string asmName> : + VOP1_Realtriple_e64_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Realtriple_e64_with_name<GFX13Gen, op, opName, asmName>; + multiclass VOP1_Real_FULL<GFXGen Gen, bits<9> op> : VOP1_Real_e32<Gen, op>, VOP1_Realtriple_e64<Gen, op>, VOP1_Real_dpp<Gen, op>, VOP1_Real_dpp8<Gen, op>; +multiclass VOP1_Real_FULL_gfx1250_gfx13<bits<9> op> : + VOP1_Real_FULL<GFX1250Gen, op>, + VOP1_Real_FULL<GFX13Gen, op>; + multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName, string asmName> { defm NAME : VOP1_Real_e32_with_name<GFX11Gen, op, opName, asmName>, @@ -1016,11 +1078,14 @@ multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName, } } -multiclass VOP1_Real_NO_VOP3_with_name_gfx12<bits<9> op, string opName, - string asmName> { +multiclass VOP1_Real_NO_VOP3_with_name_gfx12_gfx13< + bits<9> op, string opName, string asmName> { defm NAME : VOP1_Real_e32_with_name<GFX12Gen, op, opName, asmName>, VOP1_Real_dpp_with_name<GFX12Gen, op, opName, asmName>, VOP1_Real_dpp8_with_name<GFX12Gen, op, opName, asmName>; + defm NAME : VOP1_Real_e32_with_name<GFX13Gen, op, opName, asmName>, + VOP1_Real_dpp_with_name<GFX13Gen, op, opName, asmName>, + VOP1_Real_dpp8_with_name<GFX13Gen, op, opName, asmName>; } multiclass VOP1_Real_FULL_with_name<GFXGen Gen, bits<9> op, string opName, @@ -1030,6 +1095,11 @@ multiclass VOP1_Real_FULL_with_name<GFXGen Gen, bits<9> op, string opName, VOP1_Real_dpp8_with_name<Gen, op, opName, asmName>, VOP1_Realtriple_e64_with_name<Gen, op, opName, asmName>; +multiclass VOP1_Real_FULL_with_name_gfx1250_gfx13< + bits<9> op, string opName, string asmName> : + VOP1_Real_FULL_with_name<GFX1250Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; + multiclass VOP1_Real_NO_DPP<GFXGen Gen, bits<9> op> : VOP1_Real_e32<Gen, op>, 
VOP1_Real_e64<Gen, op>; @@ -1038,134 +1108,159 @@ multiclass VOP1_Real_with_DPP16<GFXGen Gen, bits<9> op> : VOP1_Real_dpp<Gen, op>, VOP3_Real_dpp_Base<Gen, {0, 1, 1, op{6-0}}>; -multiclass VOP1_Real_FULL_t16_gfx11_gfx12<bits<9> op, string asmName, - string opName = NAME> : +multiclass VOP1_Real_FULL_t16_gfx11_gfx12_gfx13< + bits<9> op, string asmName, string opName = NAME> : VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, - VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; + VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; + +multiclass VOP1_Real_FULL_with_name_gfx12_gfx13< + bits<9> op, string opName, string asmName> : + VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; -multiclass VOP1_Real_FULL_with_name_gfx11_gfx12<bits<9> op, string opName, - string asmName> : +multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13< + bits<9> op, string opName, string asmName> : VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, - VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; + VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; -multiclass VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12< +multiclass VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13< bits<9> op, string asmName = !tolower(NAME), string opName = NAME> { defm opName#"_t16" : - VOP1_Real_FULL_with_name_gfx11_gfx12<op, opName#"_t16", asmName>; + VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<op, opName#"_t16", asmName>; defm opName#"_fake16": - VOP1_Real_FULL_with_name_gfx11_gfx12<op, opName#"_fake16", asmName>; + VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<op, opName#"_fake16", asmName>; } -multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> : - VOP1Only_Real<GFX11Gen, op>, VOP1Only_Real<GFX12Gen, op>; +multiclass VOP1Only_Real_gfx11_gfx12_gfx13<bits<9> op> : + VOP1Only_Real<GFX11Gen, op>, 
VOP1Only_Real<GFX12Gen, op>, + VOP1Only_Real<GFX13Gen, op>; multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< +multiclass VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<bits<9> op, string opName, + string asmName> : + VOP1_Real_e32_with_name<GFX11Gen, op, opName, asmName>, + VOP1_Real_e32_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_e32_with_name<GFX13Gen, op, opName, asmName>; + +multiclass VOP1_Real_FULL_t16<GFXGen Gen, bits<9> op> : + VOP1_Real_FULL_with_name<Gen, op, NAME, + !cast<VOP1_Pseudo>(!subst("_fake16", "", NAME)#"_e32").Mnemonic>; + +multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13< bits<9> op, string asmName = !tolower(NAME), string opName = NAME> { defm opName#"_t16" : - VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_t16", asmName>; + VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_t16", asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName#"_t16", asmName>; defm opName#"_fake16": - VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>; + VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName#"_fake16", asmName>; } -multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_not_gfx1250<bits<9> op, string opName, - string asmName> : +multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13_not_gfx1250<bits<9> op, string opName, + string asmName> : VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, - VOP1_Real_FULL_with_name<GFX12Not12_50Gen, op, opName, asmName>; + VOP1_Real_FULL_with_name<GFX12Not12_50Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; -multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> { +multiclass VOP1_Real_OpSelIsDPP<GFXGen Gen, bits<9> op> : VOP1_Real_e32<Gen, op> { defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); - def _e64_gfx1250 : - VOP3_Real_Gen<ps, GFX1250Gen>, + 
def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>; } -defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name_gfx11_gfx12_not_gfx1250<0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">; +multiclass VOP1_Real_OpSelIsDPP_gfx1250_gfx13<bits<9> op> : + VOP1_Real_OpSelIsDPP<GFX1250Gen, op>, + VOP1_Real_OpSelIsDPP<GFX13Gen, op>; + +defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13_not_gfx1250<0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">; defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">; -defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">; +defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">; -defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">; -defm V_CVT_PK_F32_FP8_t16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">; -defm V_CVT_PK_F32_FP8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">; -defm V_CVT_PK_F32_FP8_t16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">; -defm V_CVT_PK_F32_BF8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">; -defm V_CVT_PK_F32_BF8_t16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">; -defm V_CVT_PK_F32_BF8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">; -defm V_CVT_PK_F32_BF8_t16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">; +defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">; +defm V_CVT_PK_F32_FP8_t16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06e, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">; 
+defm V_CVT_PK_F32_FP8_fake16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ee, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">; +defm V_CVT_PK_F32_FP8_t16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ee, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">; +defm V_CVT_PK_F32_BF8_fake16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06f, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">; +defm V_CVT_PK_F32_BF8_t16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06f, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">; +defm V_CVT_PK_F32_BF8_fake16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ef, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">; +defm V_CVT_PK_F32_BF8_t16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ef, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">; -defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c, +defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x00c, "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; -defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d, +defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x00d, "V_CVT_FLR_I32_F32", "v_cvt_floor_i32_f32">; -defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x039, +defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x039, "V_FFBH_U32", "v_clz_i32_u32">; -defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03a, +defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x03a, "V_FFBL_B32", "v_ctz_i32_b32">; -defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b, +defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x03b, "V_FFBH_I32", "v_cls_i32">; -defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>; -defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>; -defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">; -defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069>; -defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06a>; -defm V_CVT_U32_U16 : 
VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06b>; - -defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x050>; -defm V_CVT_F16_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x051>; -defm V_CVT_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x052>; -defm V_CVT_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x053>; -defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">; -defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">; -defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">; -defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">; -defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">; -defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">; -defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">; -defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">; -defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; -defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; -defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x059>; -defm V_FREXP_EXP_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05a>; -defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; -defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; -defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; -defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; -defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d>; -defm V_RNDNE_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05e>; -defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f>; -defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060>; -defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061>; -defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062>; -defm 
V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063>; -defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064>; - -defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00a>; -defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; +defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12_gfx13<0x066>; +defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12_gfx13<0x067>; +defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x01c, "v_mov_b16">; +defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x069>; +defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x06a>; +defm V_CVT_U32_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x06b>; + +defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x050>; +defm V_CVT_F16_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x051>; +defm V_CVT_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x052>; +defm V_CVT_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x053>; +defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x054, "v_rcp_f16">; +defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x054, "v_rcp_f16">; +defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x055, "v_sqrt_f16">; +defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x055, "v_sqrt_f16">; +defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x056, "v_rsq_f16">; +defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x056, "v_rsq_f16">; +defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x057, "v_log_f16">; +defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x057, "v_log_f16">; +defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x058, "v_exp_f16">; +defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x058, "v_exp_f16">; +defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x059>; +defm V_FREXP_EXP_I16_F16 : 
VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05a>; +defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05b, "v_floor_f16">; +defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05b, "v_floor_f16">; +defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05c, "v_ceil_f16">; +defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05c, "v_ceil_f16">; +defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05d>; +defm V_RNDNE_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05e>; +defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05f>; +defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x060>; +defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x061>; +defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x062>; +defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x063>; +defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x064>; + +defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x00a>; +defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>; -defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>; -defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; -defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; -defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; -defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>; -defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">; -defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; -defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; -defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; -defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>; -defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; -defm V_CVT_F16_BF8 : 
VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; -defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; -defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; -defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; -defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; -defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; -defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; -defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>; +defm V_TANH_F32 : VOP1_Real_FULL_gfx1250_gfx13<0x01e>; +defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x01f>; +defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250_gfx13<0x049>; +defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x04a>; +defm V_PRNG_B32 : VOP1_Real_FULL_gfx1250_gfx13<0x04b>; +defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x072, "v_cvt_f32_bf16">; +defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x073>; +defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x074>; +defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x075>; +defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x076>; +defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x077>; +defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x078>; +defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x079>; +defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07a>; +defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07b>; +defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07c>; +defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07d>; +defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07e>; +defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07f>; //===----------------------------------------------------------------------===// // GFX10. 
@@ -1213,17 +1308,22 @@ multiclass VOP1_Real_gfx10_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> : +multiclass VOP1_Real_gfx10_NO_DPP_gfx11_gfx12_gfx13<bits<9> op> : VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, - VOP1_Real_NO_DPP<GFX12Gen, op>; + VOP1_Real_NO_DPP<GFX12Gen, op>, + VOP1_Real_NO_DPP<GFX13Gen, op>; multiclass VOP1Only_Real_gfx10_gfx11_gfx12<bits<9> op> : VOP1Only_Real_gfx10<op>, VOP1Only_Real<GFX11Gen, op>, VOP1Only_Real<GFX12Gen, op>; -defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<0x01b>; +multiclass VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<bits<9> op> : + VOP1Only_Real_gfx10_gfx11_gfx12<op>, + VOP1Only_Real<GFX13Gen, op>; + +defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x01b>; defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11_gfx12<0x048>; defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>; defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>; @@ -1247,7 +1347,7 @@ defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>; defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>; defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>; -defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x065>; +defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<0x065>; defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x068>; //===----------------------------------------------------------------------===// @@ -1270,20 +1370,20 @@ let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { multiclass VOP1_Real_gfx7<bits<9> op> : VOP1_Real_e32_gfx7<op>, VOP1_Real_e64_gfx7<op>; -multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<bits<9> op> : +multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<bits<9> op> : VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, - VOP1_Real_with_DPP16<GFX12Gen, op>; + VOP1_Real_with_DPP16<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, op>; defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>; defm 
V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>; -defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x017>; -defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x018>; -defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x019>; -defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x01a>; +defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x017>; +defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x018>; +defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x019>; +defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x01a>; //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10, GFX11, GFX12 +// GFX6, GFX7, GFX10, GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { @@ -1314,16 +1414,20 @@ multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> : +multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<bits<9> op> : + VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<op>, + VOP1_Real_FULL<GFX13Gen, op>; + +multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<bits<9> op> : VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, - VOP1_Real_NO_DPP<GFX12Gen, op>; + VOP1_Real_NO_DPP<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, op>; -multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<bits<9> op> : +multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<bits<9> op> : VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, - VOP1_Real_with_DPP16<GFX12Gen, op>; + VOP1_Real_with_DPP16<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, 
op>; -multiclass VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<9> op> : - VOP1Only_Real_gfx6_gfx7<op>, VOP1Only_Real_gfx10_gfx11_gfx12<op>; +multiclass VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<9> op> : + VOP1Only_Real_gfx6_gfx7<op>, VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<op>; defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; @@ -1333,59 +1437,63 @@ defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; -defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x000>; -defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x001>; -defm V_READFIRSTLANE_B32 : VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>; -defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x003>; -defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x004>; -defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x005>; -defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x006>; -defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x007>; -defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x008>; +defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x000>; +defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x001>; +defm V_READFIRSTLANE_B32 : VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x002>; +defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x003>; +defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x004>; +defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x005>; +defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x006>; +defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x007>; +defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x008>; defm V_CVT_F16_F32 : 
VOP1_Real_gfx6_gfx7_gfx10<0x00a>; defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>; defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>; defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>; -defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x00e>; -defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x00f>; -defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x010>; -defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x011>; -defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x012>; -defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x013>; -defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x014>; -defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x015>; -defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x016>; -defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x020>; -defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x021>; -defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x022>; -defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x023>; -defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x024>; -defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x025>; -defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x027>; -defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02a>; -defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02b>; -defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02e>; -defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x02f>; -defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x031>; -defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x033>; -defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x034>; -defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x035>; -defm 
V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x036>; -defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x037>; -defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x038>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x00e>; +defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x00f>; +defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x010>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x011>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x012>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x013>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x014>; +defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x015>; +defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x016>; +defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x020>; +defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x021>; +defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x022>; +defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x023>; +defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x024>; +defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x025>; +defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x027>; +defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02a>; +defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02b>; +defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02e>; +defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x02f>; +defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x031>; +defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x033>; +defm V_SQRT_F64 : 
VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x034>; +defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x035>; +defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x036>; +defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x037>; +defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x038>; defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>; defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>; defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>; -defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03c>; -defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03d>; -defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03e>; -defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x03f>; -defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x040>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03c>; +defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03d>; +defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03e>; +defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x03f>; +defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x040>; defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>; defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x042>; defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x043>; defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x044>; +def : AMDGPUMnemonicAlias<"v_brev_b32", "v_bfrev_b32"> { + let AssemblerPredicate = isGFX13Plus; +} + //===----------------------------------------------------------------------===// // GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index d87d250..2ccf392 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -15,8 +15,8 @@ class VOP2e <bits<6> op, VOPProfile P> : Enc32 { bits<9> src0; bits<8> src1; - let Inst{8-0} = !if(P.HasSrc0, src0, 0); - let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{8-0} = !if(P.HasSrc0, src0, ?); + let Inst{16-9} = !if(P.HasSrc1, src1, ?); let Inst{24-17} = !if(P.EmitDst, vdst, 0); let Inst{30-25} = op; let Inst{31} = 0x0; //encoding @@ -28,8 +28,8 @@ class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 { bits<8> src1; bits<32> imm; - let Inst{8-0} = !if(P.HasSrc0, src0, 0); - let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{8-0} = !if(P.HasSrc0, src0, ?); + let Inst{16-9} = !if(P.HasSrc1, src1, ?); let Inst{24-17} = !if(P.EmitDst, vdst, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding @@ -42,8 +42,8 @@ class VOP2_MADK64e <bits<6> op, VOPProfile P> : Enc96 { bits<8> src1; bits<64> imm; - let Inst{8-0} = !if(P.HasSrc0, src0, 0); - let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{8-0} = !if(P.HasSrc0, src0, ?); + let Inst{16-9} = !if(P.HasSrc1, src1, ?); let Inst{24-17} = !if(P.EmitDst, vdst, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding @@ -55,7 +55,7 @@ class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> { bits<8> src1; let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding @@ -66,11 +66,11 @@ class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> { bits<9> src1; let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = 
!if(P.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding - let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr + let Inst{63} = !if(P.HasSrc1, src1{8}, ?); // src1_sgpr } class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> : @@ -971,7 +971,7 @@ defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_a } // End IsNeverUniform = 1 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>; -let ReadsModeReg = 0, mayRaiseFPException = 0 in { +let ReadsModeReg = 0, mayRaiseFPException = 0, SubtargetPredicate = HasCvtPkNormVOP2Insts in { defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>; defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>; } @@ -1266,14 +1266,14 @@ let Constraints = "$vdst = $src2", defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">; } // End SubtargetPredicate = HasDLInsts -let SubtargetPredicate = HasFmaLegacy32 in { +let SubtargetPredicate = HasFmacLegacy32 in { let Constraints = "$vdst = $src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>; -} // End SubtargetPredicate = HasFmaLegacy32 +} // End SubtargetPredicate = HasFmacLegacy32 let SubtargetPredicate = HasFmacF64Inst, Constraints = "$vdst = $src2", @@ -1348,10 +1348,15 @@ let isCommutable = 1 in def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">; } // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit] +// A dedicated profile for V_PK_FMAC_F16. +def VOP_V2F16_V2F16_V2F16_SPLAT : VOPProfile <[v2f16, v2f16, v2f16, untyped]> { + let Src0RC32 = VSrc_v2f16_splat; +} + let SubtargetPredicate = HasPkFmacF16Inst in { // FIXME: V_PK_FMAC_F16 is currently not used in instruction selection. 
// If this changes, ensure the DPP variant is not used for GFX11+. -defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; +defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16_SPLAT>; } // End SubtargetPredicate = HasPkFmacF16Inst // Note: 16-bit instructions produce a 0 result in the high 16-bits @@ -1481,7 +1486,7 @@ let SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 in { } // End SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 let SubtargetPredicate = HasIEEEMinimumMaximumInsts, isReMaterializable = 1, - SchedRW = [WriteDoubleAdd], isCommutable = 1 in { + SchedRW = [WriteDoubleAdd], isCommutable = 1, IsDPMACCInstruction = 1 in { defm V_MIN_NUM_F64 : VOP2Inst_VOPD <"v_min_num_f64", VOP_F64_F64_F64, 0x24, "v_min_num_f64", fminnum_like>; defm V_MAX_NUM_F64 : VOP2Inst_VOPD <"v_max_num_f64", VOP_F64_F64_F64, 0x23, "v_max_num_f64", fmaxnum_like>; } @@ -1502,7 +1507,7 @@ class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps, bits<8> vdst; bits<8> src1; let Inst{8-0} = 0xfa; - let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, ?); let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; @@ -1544,7 +1549,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, bits<8> src1; let Inst{8-0} = fi; - let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, ?); let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; @@ -2346,7 +2351,7 @@ class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : bits<8> vdst; bits<8> src1; let Inst{8-0} = 0xfa; //dpp - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; //encoding diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 42ec8ba..bdcf04f 100644 --- 
a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -151,7 +151,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod, class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> { let IsSingle = 1; - let HasOMod = !ne(DstVT.Value, f16.Value); + let HasOMod = !ne(DstVT, f16); let HasHigh = 1; let HasOpSel = OpSel; @@ -185,7 +185,8 @@ defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">; -defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; +let SubtargetPredicate = HasLerpInst in + defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; let SchedRW = [WriteIntMul] in { let SubtargetPredicate = HasMadU32Inst in @@ -198,9 +199,11 @@ let SchedRW = [WriteIntMul] in { let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { +let IsDPMACCInstruction = 1 in defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; let SubtargetPredicate = isNotGFX12Plus in { defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd>; +let IsDPMACCInstruction = 1 in defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fmul>; } // End SubtargetPredicate = isNotGFX12Plus } // End FPDPRounding = 1 @@ -223,10 +226,10 @@ defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, f defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, fminimum>; defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, fmaximum>; -let SchedRW = [WriteDoubleAdd] in { +let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 
1 in { defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>; defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaximum>; -} // End SchedRW = [WriteDoubleAdd] +} // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1 let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in { @@ -251,19 +254,19 @@ defm V_DIV_FMAS_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f32", VOP_F32_F32_F32 // if (vcc) // result *= 2^64 // -let SchedRW = [WriteDouble], FPDPRounding = 1 in +let SchedRW = [WriteDouble], FPDPRounding = 1, IsDPMACCInstruction = 1 in defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>; } // End Uses = [MODE, VCC, EXEC] } // End isCommutable = 1 let isReMaterializable = 1 in { -let mayRaiseFPException = 0 in { +let mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts in { defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>; defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>; defm V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>; defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>; -} // End mayRaiseFPException +} // mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>; defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>; @@ -306,20 +309,20 @@ let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in { defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 -let isCommutable = 1 in { +let isCommutable = 1, SubtargetPredicate = 
HasSadInsts in { defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; -} // End isCommutable = 1 +} // End isCommutable = 1, SubtargetPredicate = HasSadInsts defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>; defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>; -let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { +let SchedRW = [WriteDoubleAdd], FPDPRounding = 1, IsDPMACCInstruction = 1 in { defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP_F64_F64_F64_F64_DPP_PROF, AMDGPUdiv_fixup>; defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, any_fldexp>; -} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 +} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1, IsDPMACCInstruction = 1 } // End isReMaterializable = 1 let SubtargetPredicate = isGFX9GFX10 in @@ -357,7 +360,7 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32> ; // Double precision division pre-scale. 
- let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in + let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1, IsDPMACCInstruction = 1 in defm V_DIV_SCALE_F64 : VOP3Inst <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>; } // End mayRaiseFPException = 0 @@ -370,12 +373,12 @@ defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64 let isReMaterializable = 1 in { -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDouble], IsDPMACCInstruction = 1 in { defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop>; -} // End SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble], IsDPMACCInstruction = 1 let SchedRW = [Write64Bit] in { - let SubtargetPredicate = isGFX6GFX7 in { + let SubtargetPredicate = isGFX6GFX7, IsDPMACCInstruction = 1 in { defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, cshl_64>; defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, csrl_64>; defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>; @@ -424,15 +427,16 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> { let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { -defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; +let SubtargetPredicate = HasQsadInsts in + defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] } // End SubtargetPredicate = isGFX7Plus let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { let SubtargetPredicate = isGFX7Plus in { - defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [HasNotMADIntraFwdBug]>; - defm V_MAD_I64_I32 : VOP3Inst 
<"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [HasNotMADIntraFwdBug]>; + defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [NotHasMADIntraFwdBug]>; + defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [NotHasMADIntraFwdBug]>; } let SubtargetPredicate = isGFX11Only, OtherPredicates = [HasMADIntraFwdBug], Constraints = "@earlyclobber $vdst" in { @@ -634,19 +638,13 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDA } def shl_0_to_4 : PatFrag< - (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), - [{ - if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - return C->getZExtValue() <= 4; - } - return false; - }]> { + (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), [{ + KnownBits KB = CurDAG->computeKnownBits(N->getOperand(1)); + return KB.getMaxValue().getZExtValue() <= 4; + }]> { let GISelPredicateCode = [{ - int64_t Imm = 0; - if (!mi_match(MI.getOperand(2).getReg(), MRI, m_ICst(Imm)) && - !mi_match(MI.getOperand(2).getReg(), MRI, m_Copy(m_ICst(Imm)))) - return false; - return (uint64_t)Imm <= 4; + KnownBits KB = VT->getKnownBits(MI.getOperand(2).getReg()); + return KB.getMaxValue().getZExtValue() <= 4; }]; } @@ -775,10 +773,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { - defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", 
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>; } defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; @@ -789,9 +787,6 @@ let isCommutable = 1 in { defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>; } // End isCommutable = 1 -defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; -defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; - defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>; let isReMaterializable = 1 in { @@ -820,13 +815,13 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, VOP3_CVT_PK_F8_F32_Profile_t16<>, VOP3_CVT_PK_F8_F32_Profile_fake16<>>; - let SubtargetPredicate = isGFX12Plus in { + let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>; let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx1250", VOP3_CVT_SR_F8_ByteSel_Profile<f32, true>>; defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>; - } + } // End SubtargetPredicate = isGFX11Plus } // These instructions have non-standard use of op_sel. 
In particular they are @@ -930,7 +925,7 @@ let SubtargetPredicate = isGFX940Plus in { } } -let SubtargetPredicate = isGFX12Plus in { +let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>; let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in { @@ -938,7 +933,7 @@ let SubtargetPredicate = isGFX12Plus in { def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32_e5m3, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.ENABLE>; } def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>; -} +} // End SubtargetPredicate = isGFX11Plus } class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat < @@ -976,10 +971,10 @@ def : GCNPat < } // End SubtargetPredicate = HasLshlAddU64Inst let SubtargetPredicate = HasAddMinMaxInsts in { -def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>; -def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>; -def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>; -def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>; +def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>; +def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>; +def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>; +def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>; } def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; @@ -996,6 +991,11 @@ def : GCNPat<(DivergentBinFrag<or> (or_oneuse i64:$src0, i64:$src1), i64:$src2), } // End SubtargetPredicate = isGFX9Plus +let SubtargetPredicate = HasCvtPkNormVOP3Insts in { + defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; + defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; +} // end SubtargetPredicate = HasCvtPkNormVOP3Insts + // FIXME: Probably should hardcode clamp bit in pseudo and avoid this. 
class OpSelBinOpClampPat<SDPatternOperator node, Instruction inst> : GCNPat< @@ -1061,7 +1061,7 @@ multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> { } // exclude pre-GFX9 where it was slow -let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { +let OtherPredicates = [NotHasMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { defm : IMAD32_Pats<V_MAD_U64_U32_e64>; defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>; } @@ -1717,6 +1717,28 @@ let SubtargetPredicate = isGFX11Plus in { defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>; defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>; defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>; + + def : GCNPat<(v2i16 (build_vector (i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (i16 (fp_to_sint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))), + (V_CVT_PK_I16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>; + def : GCNPat<(v2i16 (build_vector (i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (i16 (fp_to_uint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))), + (V_CVT_PK_U16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>; + + // Fallback patterns for f32->i16 conversion. These are only required because + // f32->i16 has to be legal so that we can select V_CVT_PK_[IU]16_F32 above. 
+ let True16Predicate = UseRealTrue16Insts in { + def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (EXTRACT_SUBREG (V_CVT_I32_F32_e64 $src0_modifiers, $src0), lo16)>; + def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (EXTRACT_SUBREG (V_CVT_U32_F32_e64 $src0_modifiers, $src0), lo16)>; + } + let True16Predicate = NotUseRealTrue16Insts in { + def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>; + def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>; + } } // End SubtargetPredicate = isGFX11Plus class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 6500fce..9a4054b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit IsDOT = 0> { def NAME : VOP3P_Pseudo<OpName, P, !if (P.HasModifiers, - getVOP3PModPat<P, node, IsDOT, IsDOT>.ret, + getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret, getVOP3Pat<P, node>.ret)>; let SubtargetPredicate = isGFX11Plus in { if P.HasExtVOP3DPP then @@ -182,6 +182,8 @@ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like, ValueType VT = f16> { defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); + defvar OneImm = !if (!eq(VT, bf16), CONST.BF16_ONE, CONST.FP16_ONE); + defvar NegOneImm = !if (!eq(VT, bf16), CONST.BF16_NEG_ONE, CONST.FP16_NEG_ONE); // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. 
// TODO: Could we use a predicate to inspect src1/2/3 instead? @@ -203,6 +205,34 @@ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like, (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; + + // (fadd x, y) -> (fma x, 1.0, y) + def : GCNPat < + (f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))), + (mix_inst $src0_mods, $src0, (i32 8), (i32 OneImm), $src1_mods, $src1, + DSTCLAMP.NONE)>; + + // (fmul x, y) -> (fma x, y, -0.0) + def : GCNPat < + (f32 (fmul (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))), + (mix_inst $src0_mods, $src0, $src1_mods, $src1, (i32 SRCMODS.NEG), (i32 0), + DSTCLAMP.NONE)>; + + // (fsub x, y) -> (fma y, -1.0, x) + def : GCNPat < + (f32 (fsub (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))), + (mix_inst $src1_mods, $src1, (i32 8), (i32 NegOneImm), $src0_mods, $src0, + DSTCLAMP.NONE)>; + + // (fsub x, y) -> (fma y, -1.0, x) + def : GCNPat < + (f32 (fsub (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)))), + (mix_inst $src1_mods, $src1, (i32 8), (i32 NegOneImm), $src0_mods, $src0, + DSTCLAMP.NONE)>; } multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like, @@ -235,7 +265,7 @@ multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like, (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, - (i32 0), (i32 0), + (i32 SRCMODS.NONE), (i32 0), DSTCLAMP.NONE, (i32 (IMPLICIT_DEF))) >; @@ -245,7 +275,7 @@ multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like, (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))), (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, - (i32 0), (i32 0), + (i32 SRCMODS.NONE), (i32 0), 
DSTCLAMP.NONE, VGPR_32:$elt0)) >; @@ -299,7 +329,7 @@ multiclass MadFmaMixFP16Pats_t16<SDPatternOperator fma_like, (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mix_inst_16 $src0_modifiers, $src0, $src1_modifiers, $src1, - (i32 0), (i32 0), + (i32 SRCMODS.NONE), (i32 0), DSTCLAMP.NONE) >; @@ -434,15 +464,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>; } // End SubtargetPredicate = HasFmaMixBF16Insts def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> { - let HasModifiers = 0; + let HasNeg = 0; + let EnableClamp = 1; } let isCommutable = 1, isReMaterializable = 1 in { let SubtargetPredicate = HasPkAddMinMaxInsts in { -defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>; +defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>; +defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>; +defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>; } let SubtargetPredicate = HasPkMinMax3Insts in { defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>; @@ -463,10 +494,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2, >; let SubtargetPredicate = HasPkAddMinMaxInsts in { -def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>; -def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>; -def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>; -def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>; +def : ThreeOp_OpSelClampPats<saddsat, smax, 
V_PK_ADD_MAX_I16>; +def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>; +def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>; +def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>; } let SubtargetPredicate = HasPkMinMax3Insts in { @@ -662,7 +693,6 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>, multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> { defm NAME : VOP3PInst<OpName, VOP3P_DOTF8_Profile, null_frag, 1>; - let SubtargetPredicate = isGFX12Plus in def : GCNPat <(intrinsic_node i32:$src0, i32:$src1, (VOP3Mods f32:$src2, i32:$src2_modifiers)), (!cast<Instruction>(NAME) i32:$src0, i32:$src1, @@ -995,6 +1025,7 @@ class MAIInst<string OpName, VOPProfile P, SDPatternOperator node, bit Scaled = Instruction Opcode = !cast<Instruction>(NAME); bit is_dgemm = 0; bit is_gfx940_xdl = 0; + let isConvergent = 1; let PseudoInstr = NAME; // FIXME: Why is this not the default } @@ -1032,7 +1063,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"); - let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { + let mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. 
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { def _e64 : MAIInst<OpName, ProfileAGPR, @@ -1059,7 +1090,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">; } } - } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 + } // mayRaiseFPException = 0, ReadsModeReg = 1 } // Provide a wrapper around MAIInst that provides the appended operands from V_MFMA_LD_SCALE_B32 @@ -1363,16 +1394,10 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> { field bit is_wmma_xdl; } -def WMMAOpcode : GenericEnum { - let FilterClass = "VOP3P_Pseudo"; -} - class WMMAMappingTable : GenericTable { let FilterClass = "WMMAOpcodeMapping"; let CppTypeName = "WMMAOpcodeMappingInfo"; let Fields = ["Opcode2Addr", "Opcode3Addr"]; - string TypeOf_Opcode2Addr = "WMMAOpcode"; - string TypeOf_Opcode3Addr = "WMMAOpcode"; } def WMMAOpcode2AddrMappingTable : WMMAMappingTable { @@ -1401,13 +1426,13 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in { def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; } } if convertibleTo3Addr then { - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; } @@ -1453,13 +1478,12 @@ let WaveSizePredicate = isWave64 in { } class 
VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, - bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, + bit _IsIU, bit _IsFP8BF8, bit _Has_ImodOp = 0, bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0, bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0> : VOP3P_Profile<VOPProfile<ArgTy>> { bit IsIU = _IsIU; - bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B - bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32)); + bit NoABMods = !or(_IsFP8BF8, _IsF4); // No IMOD support for A and B int IndexType = _IndexType; let HasMatrixFMT = _HasMatrixFMT; @@ -1468,7 +1492,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, let HasMatrixReuse = _HasMatrixReuse; bit HasIModOp = _Has_ImodOp; - let HasClamp = !and(IsIU, !not(HasIModOp)); + let HasClamp = IsIU; let IsPacked = 1; let IsWMMA = !not(_IsSWMMAC); let IsSWMMAC = _IsSWMMAC; @@ -1487,9 +1511,9 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); bit NegLo01 = !not(NoABMods); - bit NegLo2 = !and(!not(IsIU), !not(IsXF32), IsWMMA); + bit NegLo2 = !and(!not(IsIU), IsWMMA); bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1] - bit NegHi2 = !and(!not(IsIU), !not(IsXF32), IsWMMA); + bit NegHi2 = !and(!not(IsIU), IsWMMA); bit NegLoAny = !or(NegLo01, NegLo2); bit NegHiAny = !or(NegHi01, NegHi2); @@ -1520,8 +1544,6 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, // wmma f32_f32 | neg_lo for neg A/B | neg_lo = 1 neg C(f32) // | neg_hi ignored | neg_hi = 1 abs C(f32) // --------------------------------------------------------------------------- - // wmma f32_xf32 | not allowed for xf32 | not allowed - // --------------------------------------------------------------------------- // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) // 
--------------------------------------------------------------------------- @@ -1552,13 +1574,13 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, // pseudo - // fp8bf8 and xf32 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers)); dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers)); - dag Src2Mods = !if(!or(IsIU, IsXF32, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag Src2Mods = !if(!or(IsIU, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers)); dag IndexKey = !cond(!eq(IndexType, 0) : (ins), !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit), @@ -1573,7 +1595,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt), (ins)); dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); - dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); + dag ClampOp = !if(HasClamp, (ins Clamp:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); @@ -1585,7 +1607,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, (ins VRegSrc_64:$src2), (ins VRegSrc_32:$src2)), IndexKey)), - MatrixScaleSrc, MatrixFMT, MatrixScale, MatrixReuse, Clamp, Neg); + MatrixScaleSrc, ClampOp, MatrixFMT, MatrixScale, MatrixReuse, Neg); // asm @@ 
-1635,22 +1657,21 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1), IsIU : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1), NoABMods : (ins Src1VT:$src1)); - bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32)); + bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU)); bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp)); bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp)); bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp)); - bit IsIUXF32 = !or(IsIU, IsXF32); dag Src2InPatWmma = !cond(IsC_IMod1 : (ins timm:$src2_modifiers, Src2VT:$src2), IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_BF16_IMod0 : (ins Src2VT:$src2), - IsIUXF32 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), IsSWMMAC : (ins)); dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs $src2_modifiers), Src2VT:$src2), IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2), - IsIUXF32 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), IsSWMMAC : (ins)); dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins)); dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), @@ -1663,7 +1684,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit)); dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins)); dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins timm:$src2_modifiers), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); - dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins (i32 8)))), (ins Src2VT:$src2)); + dag Src2InlineOutPat = !con(!if(IsIU, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins 
(i32 8)))), (ins Src2VT:$src2)); dag MatrixScaleInPat = !if(HasMatrixScale, (ins timm:$matrix_a_scale, timm:$matrix_a_scale_fmt, ScaleTy:$scale_src0, timm:$matrix_b_scale, timm:$matrix_b_scale_fmt, ScaleTy:$scale_src1), (ins)); @@ -1674,17 +1695,17 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixScaleInPat, MatrixReuseInPat, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, MatrixFMTOutPat, - MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, ClampPat, MatrixFMTOutPat, + MatrixScaleOutModPat, MatrixReuseOutModPat); dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); - dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat, MatrixReuseOutModPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. 
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixScaleInPat, MatrixReuseInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat, - MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat, ClampPat, + MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat); } def WMMAInstInfoTable : GenericTable { @@ -1706,7 +1727,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1733,7 +1754,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1756,84 +1777,126 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P // Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s. // Original type for them is in comment on the right and refers to A and B. 
-def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>; -def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>; -def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>; -def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>; -def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8 -def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4 -def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8 -def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4 - -def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>; -def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>; -def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>; -def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>; -def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8 -def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 * -def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8 -def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 - -def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>; -def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>; -def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>; -def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>; -def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8 -def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 
16xi4 -def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 ** -def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8 - -def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>; -def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>; -def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>; -def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>; -def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8 -def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 *** -def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4 -def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8 +def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi8 +def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4 +def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 8xf8 +def 
I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 16xi4 + +def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 4xi8 +def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4 * +def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 4xf8 +def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4 + +def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 
8xi8, 16xi8 +def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 16xi4 +def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 16xi4, 32xi4 ** +def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 8xf8, 16xf8 + +def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 4xi8, 8xi8 +def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 8xi4 *** +def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 16xi4 +def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 4xf8, 8xf8 // * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored // ** IU4X64_SWMMAC_w32 index is i32, index_key is not used // *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes -def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, 
v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; -def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; -def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; -def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; -def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>; -def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>; -def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>; -def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>; -def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; -def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; -def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; -def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; -def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 
1, 1, 0, 0, 0, 1>; -def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 0, 0, 1>; -def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 0, 0, 1>; - -multiclass WMMA_F8F6F4_Profiles<bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> { - def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; -} - -defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0, 0, 0>; -defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<1, 0, 1>; -defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<1, 1, 1>; +def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, 
v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, 
v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/0, /*_IsF4=*/1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/1, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/1, /*_Scale16=*/1, /*_HasMatrixReuse=*/1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def BF16_BF16X64_SWMMAC_w32 : 
VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/1, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; + +// Helper class to compute the destination vector type of WMMA_F8F6F4 instructions based on element type and dimensions. +class getWMMAF8F6F4DstVTy<ValueType DstEltTy, int M, int N> { + // Size in bits = (M * N / 32) * element_size_in_bits + defvar Size = !mul(!div(!mul(M, N), 32), DstEltTy.Size); + ValueType ret = !cond(!eq(Size, 256) : v8f32, + !eq(Size, 1024) : v64f16); +} + +// Helper class to compute the type of matrix A and B of WMMA_F8F6F4 instructions based on format and dimensions. 
+class getWMMAF8F6F4ABVTy<string Fmt, int D1, int D2> { + defvar FmtBits = !cond(!eq(Fmt, "f8") : 8, + !eq(Fmt, "f6") : 6, + !eq(Fmt, "f4") : 4); + // TypeSize in bits = (D1 * D2 / 32) * format_bits + defvar TypeSize = !mul(!div(!mul(D1, D2), 32), FmtBits); + ValueType ret = !cond(!eq(TypeSize, 256) : v8i32, + !eq(TypeSize, 384) : v12i32, + !eq(TypeSize, 512) : v16i32, + !eq(TypeSize, 1024) : v32i32); +} + +multiclass WMMA_F8F6F4_Profiles<ValueType DstEltTy, int M, int N, int K, + bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> { + defvar DstTy = getWMMAF8F6F4DstVTy<DstEltTy, M, N>.ret; + foreach ATy = ["f8", "f6", "f4"] in { + foreach BTy = ["f8", "f6", "f4"] in { + def _#ATy#_#BTy#_w32 : VOP3PWMMA_Profile< + [DstTy, getWMMAF8F6F4ABVTy<ATy, M, K>.ret, getWMMAF8F6F4ABVTy<BTy, K, N>.ret, DstTy], + 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + } + } +} + +defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/0, /*Scale16=*/0, /*HasMatrixReuse=*/0>; +defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/1, /*Scale16=*/0, /*HasMatrixReuse=*/1>; +defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/1, /*Scale16=*/1, /*HasMatrixReuse=*/1>; class VOP_WMMA_LD_SCALE<ValueType vt, RegisterOperand RC> : VOP3P_Profile<VOPProfile<[untyped, vt, vt, untyped]>> { let HasMatrixScale = 1; @@ -1905,8 +1968,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. 
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 @@ -2182,20 +2247,23 @@ multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> { } } -multiclass VOP3P_Real_WMMA_F8F6F4_gfx1250<bits<8> op, VOP3PWMMA_Profile WMMAP> { +multiclass VOP3P_Real_WMMA_F8F6F4<string Gen, bits<8> op, VOP3PWMMA_Profile WMMAP> { defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); - let AsmString = asmName # PS.AsmOperands in - defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>, - MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_gfx1250">; + let AsmString = asmName # PS.AsmOperands in { + if !eq(Gen, "gfx1250") then { + defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>, + MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_" # Gen>; + } + } } -multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> { - defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; +multiclass VOP3P_Real_WMMA_SrcFormats<string Gen, bits<8> op, string WMMAP> { + defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4<Gen, op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { let isAsmParserOnly = true in { // Disable ambiguous disassembly. 
- defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; + defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4<Gen, op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; } } } @@ -2215,7 +2283,7 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{23-16} = LdScaleOp; let Inst{40-32} = scale_src0; let Inst{49-41} = scale_src1; - let Inst{58-50} = 0; // scale src2 + let Inst{58-50} = ?; // scale src2 let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0) let Inst{60} = 0; // scale_op_sel_hi(1) let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo) @@ -2234,9 +2302,9 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{87-80} = op; let Inst{95-88} = 0xcc; //encoding - let Inst{104-96} = !if(P.HasSrc0, src0, 0); - let Inst{113-105} = !if(P.HasSrc1, src1, 0); - let Inst{122-114} = !if(P.HasSrc2, src2, 0); + let Inst{104-96} = !if(P.HasSrc0, src0, ?); + let Inst{113-105} = !if(P.HasSrc1, src1, ?); + let Inst{122-114} = !if(P.HasSrc2, src2, ?); // neg_lo let Inst{125} = !if(P.NegLo01, src0_modifiers{0}, 0); @@ -2244,34 +2312,35 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0); } -multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { - defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); - let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32, - DecoderNamespace = "GFX1250" in { +multiclass VOP3PX2_Real_ScaledWMMA_F4<string Gen, bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { + defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); + if !eq(Gen, "gfx1250") then { def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>, - VOP3PX2e <op, LdScaleOp, WMMAP>; + VOP3PX2e <op, LdScaleOp, WMMAP> { + let PostEncoderMethod = "postEncodeVOP3<true, true, false>"; + } } } -multiclass 
VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { +multiclass VOP3PX2_Real_ScaledWMMA<string Gen, bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); - let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32, - DecoderNamespace = "GFX1250" in { + if !eq(Gen, "gfx1250") then { def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, asmName>, VOP3PX2e <op, LdScaleOp, WMMAP>, - MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_gfx1250"> { + MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_" # Gen> { let AsmString = asmName # PS.AsmOperands; + let PostEncoderMethod = "postEncodeVOP3<true, true, false>"; } } } -multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<bits<8> op, bits<8> LdScaleOp, string WMMAP> { - defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; +multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<string Gen, bits<8> op, bits<8> LdScaleOp, string WMMAP> { + defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<Gen, op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { let isAsmParserOnly = true in { // Disable ambiguous disassembly. 
- defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; + defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<Gen, op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; } } } @@ -2350,12 +2419,14 @@ defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8B defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>; defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>; -defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">; -defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">; -defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">; +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in { +defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_SrcFormats <"gfx1250", 0x033, "F32_16X16X128_F8F6F4">; +defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats <"gfx1250", 0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">; +defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats <"gfx1250", 0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">; -defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>; -defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>; +defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4 <"gfx1250", 0x088, 0x35, F32_32X16X128_F4_SCALE_w32>; +defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4 <"gfx1250", 0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>; +} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" defm V_SWMMAC_F32_16X16X64_F16_w32 : 
VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; @@ -2417,6 +2488,11 @@ multiclass VOP3P_Realtriple<GFXGen Gen, bits<8> op, string backing_ps_name = NAM multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op> : VOP3P_Realtriple<GFX11Gen, op>, VOP3P_Realtriple<GFX12Gen, op>; +defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple_gfx11_gfx12<0x24>; +defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple_gfx11_gfx12<0x25>; +defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple_gfx11_gfx12<0x26>; +defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple_gfx11_gfx12<0x27>; + //===----------------------------------------------------------------------===// // GFX12 //===----------------------------------------------------------------------===// @@ -2459,8 +2535,10 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; +let PostEncoderMethod = "postEncodeVOP3<true, true, false>" in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; +} let AssemblerPredicate = isGFX1250Plus in def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; @@ -2468,10 +2546,6 @@ def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; -defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x24>; -defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x25>; -defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x26>; -defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x27>; //===----------------------------------------------------------------------===// // GFX11 diff --git 
a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 2730ec5..989181b 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -24,7 +24,7 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { bits<8> src1; let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = op; let Inst{31-25} = 0x3e; // encoding } @@ -33,10 +33,10 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> { bits<9> src1; let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = op; let Inst{31-25} = 0x3e; // encoding - let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr + let Inst{63} = !if(P.HasSrc1, src1{8}, ?); // src1_sgpr } @@ -422,7 +422,6 @@ multiclass VOPC_Pseudos <string opName, } -let SubtargetPredicate = HasSdstCMPX in { multiclass VOPCX_Pseudos <string opName, VOPC_Profile P, VOPC_Profile P_NoSDst, SDPatternOperator cond = COND_NULL, @@ -486,7 +485,6 @@ multiclass VOPCX_Pseudos <string opName, } } // end SubtargetPredicate = isGFX11Plus } -} // End SubtargetPredicate = HasSdstCMPX defm VOPC_I1_F16_F16 : VOPC_Profile_t16<[Write32Bit], f16>; def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>; @@ -518,8 +516,10 @@ multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL, multiclass VOPC_F32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>; -multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : - VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>; +let IsDPMACCInstruction = 1 in { + multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>; +} multiclass 
VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> { @@ -537,9 +537,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>; -multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : - VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; - +let IsDPMACCInstruction = 1 in { + multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; +} multiclass VOPCX_F16<string opName, string revOp = opName> { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -556,8 +557,10 @@ multiclass VOPCX_F16<string opName, string revOp = opName> { multiclass VOPCX_F32 <string opName, string revOp = opName> : VOPCX_Pseudos <opName, VOPC_I1_F32_F32, VOPC_F32_F32, COND_NULL, revOp>; -multiclass VOPCX_F64 <string opName, string revOp = opName> : - VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>; +let IsDPMACCInstruction = 1 in { + multiclass VOPCX_F64 <string opName, string revOp = opName> : + VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>; +} multiclass VOPCX_I16<string opName, string revOp = opName> { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -574,8 +577,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> { multiclass VOPCX_I32 <string opName, string revOp = opName> : VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>; -multiclass VOPCX_I64 <string opName, string revOp = opName> : - VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>; +let IsDPMACCInstruction = 1 in { + multiclass VOPCX_I64 <string opName, string revOp = opName> : + VOPCX_Pseudos <opName, VOPC_I1_I64_I64, 
VOPC_I64_I64, COND_NULL, revOp>; +} //===----------------------------------------------------------------------===// @@ -1114,7 +1119,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec, } // end SubtargetPredicate = isGFX11Plus } -let SubtargetPredicate = HasSdstCMPX in { multiclass VOPCX_Class_Pseudos <string opName, VOPC_Profile P, VOPC_Profile P_NoSDst> : @@ -1164,7 +1168,6 @@ multiclass VOPCX_Class_Pseudos <string opName, } } // end SubtargetPredicate = isGFX11Plus } -} // End SubtargetPredicate = HasSdstCMPX } // End ReadsModeReg = 0, mayRaiseFPException = 0 defm VOPC_I1_F16_I16 : VOPC_Class_Profile_t16<[Write32Bit]>; @@ -1210,11 +1213,13 @@ multiclass VOPC_CLASS_F32 <string opName> { multiclass VOPCX_CLASS_F32 <string opName> : VOPCX_Class_Pseudos <opName, VOPC_I1_F32_I32, VOPC_F32_I32>; +// FIXME: let IsDPMACCInstruction = 1 in multiclass VOPC_CLASS_F64 <string opName> { defm NAME : VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>; defm : VOPCClassPat64<NAME>; } +// FIXME: let IsDPMACCInstruction = 1 in multiclass VOPCX_CLASS_F64 <string opName> : VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>; @@ -1233,18 +1238,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt, dag dstInst = (inst $src0, $src1)> { - let WaveSizePredicate = isWave64 in def : GCNPat < - (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (COPY_TO_REGCLASS dstInst, SReg_64)) + (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + dstInst >; let WaveSizePredicate = isWave32 in { - def : GCNPat < - (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS dstInst, SReg_32)) - >; - // Support codegen of i64 setcc in wave32 mode. 
def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), @@ -1459,7 +1458,7 @@ class VOPC_DPP_Base<bits<8> op, string OpName, VOPProfile P> let Inst{8-0} = 0xfa; - let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, ?); let Inst{48-40} = dpp_ctrl; let Inst{50} = fi; let Inst{51} = bound_ctrl; @@ -1485,7 +1484,7 @@ class VOPC_DPP8_Base<bits<8> op, string OpName, VOPProfile P> let Inst{8-0} = fi; - let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, ?); let Inst{63-40} = dpp8{23-0}; let AsmMatchConverter = "cvtDPP8"; @@ -1535,6 +1534,8 @@ class VOPC64_DPP<VOP_DPP_Pseudo ps, string opName = ps.OpName> let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; let Constraints = ps.Constraints; + + let PostEncoderMethod = !if(!empty(ps.Defs), "", "postEncodeVOPCX"); } class VOPC64_DPP16_Dst<bits<10> op, VOP_DPP_Pseudo ps, @@ -1575,6 +1576,8 @@ class VOPC64_DPP8<VOP_Pseudo ps, string opName = ps.OpName> let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; let True16Predicate = ps.True16Predicate; + + let PostEncoderMethod = !if(!empty(ps.Defs), "", "postEncodeVOPCX"); } class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> @@ -1777,6 +1780,7 @@ multiclass VOPCX_Real<GFXGen Gen, bits<9> op> { let Inst{7-0} = ?; // sdst let AsmString = !subst("_nosdst", "", ps64.Mnemonic) # "{_e64} " # ps64.AsmOperands; + let PostEncoderMethod = "postEncodeVOPCX"; } defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>; @@ -1838,6 +1842,7 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName, let Inst{7-0} = ?; // sdst let Inst{14} = 0; let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + let PostEncoderMethod = "postEncodeVOPCX"; } } else { def _e64#Gen.Suffix @@ -1845,6 +1850,7 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName, VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { let Inst{7-0} = ?; // sdst let AsmString = 
asm_name # "{_e64} " # ps64.AsmOperands; + let PostEncoderMethod = "postEncodeVOPCX"; } } @@ -2186,6 +2192,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { let Inst{7-0} = ?; // sdst let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic) # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands; + let PostEncoderMethod = "postEncodeVOPCX"; } if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8325c62..09fdb00 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -18,6 +18,7 @@ class LetDummies { bit isConvergent; bit isAsCheapAsAMove; bit FPDPRounding; + bit IsDPMACCInstruction; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -71,6 +72,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins, string Mnemonic = opName; Instruction Opcode = !cast<Instruction>(NAME); bit IsTrue16 = P.IsTrue16; + bit IsDPMACCInstruction = 0; VOPProfile Pfl = P; string AsmOperands; @@ -166,6 +168,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> : class VOP_Real<VOP_Pseudo ps> { Instruction Opcode = !cast<Instruction>(NAME); bit IsSingle = ps.Pfl.IsSingle; + bit IsDPMACCInstruction = ps.IsDPMACCInstruction; } class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> : @@ -198,6 +201,8 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni let isConvergent = ps.isConvergent; VOPProfile Pfl = ps.Pfl; + + let PostEncoderMethod = !if(!and(Pfl.HasSrc0, Pfl.HasSrc1, Pfl.HasSrc2), "", "postEncodeVOP3<"#Pfl.HasSrc0#","#Pfl.HasSrc1#","#Pfl.HasSrc2#">"); } class VOP3_Real_Gen <VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> : @@ -238,9 +243,9 @@ class VOP3a<VOPProfile P> : Enc64 { let Inst{10} = !if(P.HasSrc2Mods, 
src2_modifiers{1}, 0); let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = !if(P.HasSrc0, src0, 0); - let Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{40-32} = !if(P.HasSrc0, src0, ?); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); let Inst{60-59} = !if(P.HasOMod, omod, 0); let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); @@ -273,9 +278,9 @@ class VOP3a_t16<VOPProfile P> : Enc64 { let Inst{15} = !if(P.HasClamp, clamp{0}, 0); let Inst{31-26} = 0x35; - let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0); - let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); - let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); + let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, ?); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?); let Inst{60-59} = !if(P.HasOMod, omod, 0); let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); @@ -457,9 +462,9 @@ class VOP3be <VOPProfile P> : Enc64 { let Inst{7-0} = vdst; let Inst{14-8} = sdst; let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = !if(P.HasSrc0, src0, 0); - let Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{40-32} = !if(P.HasSrc0, src0, ?); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); let Inst{60-59} = !if(P.HasOMod, omod, 0); let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); @@ -509,9 +514,9 @@ class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base { let Inst{15} = !if(P.HasClamp, clamp{0}, 0); - let Inst{40-32} = !if(P.HasSrc0, src0, 0); - let Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{40-32} = !if(P.HasSrc0, src0, ?); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = 
!if(P.HasSrc2, src2, ?); let Inst{59} = !cond(!and(P.HasSrc0, P.HasOpSel) : src0_modifiers{3}, P.IsDOT : 1, P.HasMatrixScale : matrix_b_scale{0}, @@ -546,12 +551,12 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64, VOP3Pe_MAI_ let Inst{22-16} = op; let Inst{31-23} = 0x1a7; //encoding - let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0); - let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, ?); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); - let Inst{59} = !if(P.HasSrc0, src0{9}, 0); // acc(0) - let Inst{60} = !if(P.HasSrc1, src1{9}, 0); // acc(1) + let Inst{59} = !if(P.HasSrc0, src0{9}, ?); // acc(0) + let Inst{60} = !if(P.HasSrc1, src1{9}, ?); // acc(1) let Inst{63-61} = !if(P.HasSrc1, blgp, 0); } @@ -631,12 +636,12 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ let Inst{86-80} = op; let Inst{95-87} = 0x1a7; //encoding - let Inst{104-96} = !if(MFMAPfl.HasSrc0, src0{8-0}, 0); - let Inst{113-105} = !if(MFMAPfl.HasSrc1, src1{8-0}, 0); - let Inst{122-114} = !if(MFMAPfl.HasSrc2, src2, 0); + let Inst{104-96} = !if(MFMAPfl.HasSrc0, src0{8-0}, ?); + let Inst{113-105} = !if(MFMAPfl.HasSrc1, src1{8-0}, ?); + let Inst{122-114} = !if(MFMAPfl.HasSrc2, src2, ?); - let Inst{123} = !if(MFMAPfl.HasSrc0, src0{9}, 0); // acc(0) - let Inst{124} = !if(MFMAPfl.HasSrc1, src1{9}, 0); // acc(1) + let Inst{123} = !if(MFMAPfl.HasSrc0, src0{9}, ?); // acc(0) + let Inst{124} = !if(MFMAPfl.HasSrc1, src1{9}, ?); // acc(1) let Inst{127-125} = !if(MFMAPfl.HasSrc1, blgp, 0); } @@ -698,7 +703,7 @@ class VOP_SDWAe<VOPProfile P> : Enc64 { bits<2> dst_unused; bits<1> clamp; - let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?); let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?); let Inst{45} = 
!if(P.HasSDWAClamp, clamp{0}, 0); @@ -732,11 +737,11 @@ class VOP_SDWA9e<VOPProfile P> : Enc64 { bits<5> src1_modifiers; bits<1> src1_sgpr; - let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0); let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{4}, 0); let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); - let Inst{55} = !if(P.HasSrc0, src0{8}, 0); + let Inst{55} = !if(P.HasSrc0, src0{8}, ?); let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0); let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{4}, 0); let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); @@ -765,16 +770,9 @@ class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> { } class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : - InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>, - VOP <opName>, - SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE> { - - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; + VOP_Pseudo <opName, "_sdwa", P, P.OutsSDWA, P.InsSDWA, "", pattern> { - string Mnemonic = opName; - string AsmOperands = P.AsmSDWA; + let AsmOperands = P.AsmSDWA; string AsmOperands9 = P.AsmSDWA9; let Size = 8; @@ -794,8 +792,6 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA, AMDGPUAsmVariants.Disable); let DecoderNamespace = "GFX8"; - - VOPProfile Pfl = P; } class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> : @@ -889,7 +885,7 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 { bits<4> row_mask; bit fi; - let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{48-40} = dpp_ctrl; let Inst{50} = !if(IsDPP16, fi, ?); let Inst{51} = bound_ctrl; @@ -954,8 +950,8 @@ class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> bits<9> src2; let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); - let 
Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); } class VOP3_DPPe_Common_t16<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> { @@ -964,8 +960,8 @@ class VOP3_DPPe_Common_t16<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op bits<11> src2; let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); - let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); - let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?); } class VOP3P_DPPe_Common_Base<bits<8> op, VOPProfile P> : Enc96 { @@ -998,8 +994,8 @@ class VOP3P_DPPe_Common<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P bits<9> src2; let Inst{7-0} = vdst; - let Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); } class VOP3P_DPPe_Common_t16<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> { @@ -1008,8 +1004,8 @@ class VOP3P_DPPe_Common_t16<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<o bits<11> src2; let Inst{7-0} = vdst{7-0}; - let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); - let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?); } class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], @@ -1134,7 +1130,7 @@ class VOP3_DPP_Enc <bits<10> op, VOPProfile P, bit IsDPP16> : VOP3_DPPe_Fields { let Inst{40-32} = 0xfa; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{80-72} = dpp_ctrl; let Inst{82} = !if(IsDPP16, fi, ?); let Inst{83} = bound_ctrl; @@ -1154,7 +1150,7 @@ class VOP3_DPP_Enc_t16<bits<10> op, VOPProfile P, bit IsDPP16 > VOP3_DPPe_Fields_t16 { let Inst{40-32} = 0xfa; - let Inst{71-64} = 
!if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{80-72} = dpp_ctrl; let Inst{82} = !if(IsDPP16, fi, ?); let Inst{83} = bound_ctrl; @@ -1180,7 +1176,7 @@ class VOP3P_DPP <bits<8> op, string OpName, VOPProfile P, bit IsDPP16, let VOP3P = 1; let Inst{40-32} = 0xfa; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{80-72} = dpp_ctrl; let Inst{82} = !if(IsDPP16, fi, ?); let Inst{83} = bound_ctrl; @@ -1195,7 +1191,7 @@ class VOP_DPP8e<VOPProfile P> : Enc64 { bits<24> dpp8; bits<9> fi; - let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{63-40} = dpp8{23-0}; } @@ -1246,7 +1242,7 @@ class VOP3_DPP8_Enc <bits<10> op, VOPProfile P> : VOP3_DPPe_Common<op, P>, VOP3_DPP8e_Fields { let Inst{40-32} = fi; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{95-72} = dpp8{23-0}; } @@ -1257,7 +1253,7 @@ class VOP3_DPP8_Enc_t16 <bits<10> op, VOPProfile P> : VOP3_DPPe_Common_t16<op, P>, VOP3_DPP8e_Fields_t16 { let Inst{40-32} = fi; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{95-72} = dpp8{23-0}; } @@ -1270,7 +1266,7 @@ class VOP3P_DPP8<bits<8> op, string OpName, VOPProfile P> : let VOP3P = 1; let Inst{40-32} = fi; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{95-72} = dpp8{23-0}; } @@ -1357,8 +1353,12 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> : class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)), 
+ !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers))); list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), @@ -1873,6 +1873,12 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName, } } +multiclass VOP3_Real_with_name_gfx11_gfx12_gfx13< + bits<10> op, string opName, string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Real_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>, + VOP3_Real_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>, + VOP3_Real_with_name<GFX13Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; + // for READLANE/WRITELANE multiclass VOP3_Real_No_Suffix<GFXGen Gen, bits<10> op, string opName = NAME> { defvar ps = !cast<VOP_Pseudo>(opName); @@ -2204,12 +2210,12 @@ include "VOP3PInstructions.td" include "VOPDInstructions.td" class ClassPat<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask)) >; class ClassPat_t16<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask)) >; @@ -2274,3 +2280,12 @@ def VOPTrue16Table : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getTrue16OpcodeHelper"; } + +def DPMACCInstructionTable : GenericTable { + let FilterClass = "VOP_Pseudo"; + let CppTypeName = "DPMACCInstructionInfo"; + let Fields = ["Opcode", "IsDPMACCInstruction"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getDPMACCInstructionHelper"; +} |
