Diffstat (limited to 'llvm/lib/Target/AMDGPU')
133 files changed, 4082 insertions, 3295 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index cd8b249..5df11a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { - AMDGPUSimplifyLibCallsPass() {} + AMDGPUSimplifyLibCallsPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -298,6 +298,15 @@ private: bool GlobalOpt; }; +void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &); +extern char &AMDGPULowerExecSyncLegacyPassID; +ModulePass *createAMDGPULowerExecSyncLegacyPass(); + +struct AMDGPULowerExecSyncPass : PassInfoMixin<AMDGPULowerExecSyncPass> { + AMDGPULowerExecSyncPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &); extern char &AMDGPUSwLowerLDSLegacyPassID; ModulePass * @@ -371,13 +380,13 @@ public: class AMDGPUAnnotateUniformValuesPass : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> { public: - AMDGPUAnnotateUniformValuesPass() {} + AMDGPUAnnotateUniformValuesPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { public: - SIModeRegisterPass() {} + SIModeRegisterPass() = default; PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; @@ -527,7 +536,7 @@ void initializeAMDGPUAAWrapperPassPass(PassRegistry&); ImmutablePass *createAMDGPUExternalAAWrapperPass(); void initializeAMDGPUExternalAAWrapperPass(PassRegistry&); -void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); +void initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(PassRegistry &); ModulePass *createAMDGPUExportKernelRuntimeHandlesLegacyPass(); void initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 54d94b1..3b14a82 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -901,6 +901,48 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; +def FeatureCubeInsts : SubtargetFeature<"cube-insts", + "HasCubeInsts", + "true", + "Has v_cube* instructions" +>; + +def FeatureLerpInst : SubtargetFeature<"lerp-inst", + "HasLerpInst", + "true", + "Has v_lerp_u8 instruction" +>; + +def FeatureSadInsts : SubtargetFeature<"sad-insts", + "HasSadInsts", + "true", + "Has v_sad* instructions" +>; + +def FeatureQsadInsts : SubtargetFeature<"qsad-insts", + "HasQsadInsts", + "true", + "Has v_qsad* instructions" +>; + +def FeatureCvtNormInsts : SubtargetFeature<"cvt-norm-insts", + "HasCvtNormInsts", + "true", + "Has v_cvt_norm* instructions" +>; + +def FeatureCvtPkNormVOP2Insts : SubtargetFeature<"cvt-pknorm-vop2-insts", + "HasCvtPkNormVOP2Insts", + "true", + "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" +>; + +def FeatureCvtPkNormVOP3Insts : SubtargetFeature<"cvt-pknorm-vop3-insts", + "HasCvtPkNormVOP3Insts", + "true", + "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" +>; + def FeatureAtomicDsPkAdd16Insts : SubtargetFeature<"atomic-ds-pk-add-16-insts", "HasAtomicDsPkAdd16Insts", "true", @@ -1494,7 +1536,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", 
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureCvtPkNormVOP2Insts ] >; @@ -1508,7 +1551,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts ] >; @@ -1524,7 +1568,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, - FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts, + FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtPkNormVOP2Insts ] >; @@ -1543,7 +1589,10 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, - FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, + FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ] >; @@ -1567,7 +1616,10 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeatureCubeInsts, + FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ] >; @@ -1590,7 +1642,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts ] >; @@ -2069,13 +2123,20 @@ def FeatureISAVersion12 : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureBVHDualAndBVH8Insts, FeatureWaitsBeforeSystemScopeStores, + FeatureD16Writes32BitVgpr, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ]>; -def FeatureISAVersion12_50 : FeatureSet< +def FeatureISAVersion12_50_Common : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureRequiresAlignedVGPRs, - FeatureAddressableLocalMemorySize327680, FeatureCuMode, 
Feature1024AddressableVGPRs, Feature64BitLiterals, @@ -2143,8 +2204,20 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureSupportsXNACK, FeatureXNACK, FeatureClusters, + FeatureD16Writes32BitVgpr, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ]>; +def FeatureISAVersion12_50 : FeatureSet< + !listconcat(FeatureISAVersion12_50_Common.Features, + [FeatureAddressableLocalMemorySize327680])>; + def FeatureISAVersion12_51 : FeatureSet< !listconcat(FeatureISAVersion12_50.Features, [FeatureDPALU_DPP])>; @@ -2523,6 +2596,10 @@ def HasAtomicFMinFMaxF64FlatInsts : Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">, AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>; +def HasAtomicCondSubClampFlatInsts : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX12Insts)>; + def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>; @@ -2814,6 +2891,27 @@ def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">, def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">, AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>; +def HasCubeInsts : Predicate<"Subtarget->hasCubeInsts()">, + AssemblerPredicate<(all_of FeatureCubeInsts)>; + +def HasLerpInst : Predicate<"Subtarget->hasLerpInst()">, + AssemblerPredicate<(all_of FeatureLerpInst)>; + +def HasSadInsts : Predicate<"Subtarget->hasSadInsts()">, + AssemblerPredicate<(all_of FeatureSadInsts)>; + +def HasQsadInsts : Predicate<"Subtarget->hasQsadInsts()">, + AssemblerPredicate<(all_of FeatureQsadInsts)>; + +def HasCvtNormInsts : Predicate<"Subtarget->hasCvtNormInsts()">, + AssemblerPredicate<(all_of FeatureCvtNormInsts)>; + +def HasCvtPkNormVOP2Insts : Predicate<"Subtarget->hasCvtPkNormVOP2Insts()">, + AssemblerPredicate<(all_of FeatureCvtPkNormVOP2Insts)>; + +def HasCvtPkNormVOP3Insts : Predicate<"Subtarget->hasCvtPkNormVOP3Insts()">, + AssemblerPredicate<(all_of FeatureCvtPkNormVOP3Insts)>; + def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">, AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>; @@ -2829,9 +2927,16 @@ def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">, def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; +def HasFmacLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts() && Subtarget->getGeneration() < AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX10_3Insts, (not FeatureGFX12Insts))>; + def HasAtomicDsPkAdd16Insts : Predicate<"Subtarget->hasAtomicDsPkAdd16Insts()">, AssemblerPredicate<(any_of FeatureAtomicDsPkAdd16Insts)>; +def HasAtomicDsCondSubClampInsts : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX12Insts)>; + def HasAtomicFlatPkAdd16Insts : Predicate<"Subtarget->hasAtomicFlatPkAdd16Insts()">, AssemblerPredicate<(any_of FeatureAtomicFlatPkAdd16Insts)>; @@ -2974,15 +3079,46 @@ def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; +def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>; + +def isWave32 : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(any_of 
FeatureWavefrontSize32, + FeatureAssemblerPermissiveWavesize)>; +def isWave64 : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(any_of FeatureWavefrontSize64, + FeatureAssemblerPermissiveWavesize)>; + +def isWave32Strict : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(all_of FeatureWavefrontSize32)>; +def isWave64Strict : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(all_of FeatureWavefrontSize64)>; + //===----------------------------------------------------------------------===// // HwModes //===----------------------------------------------------------------------===// -// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +defvar DefaultMode_Wave64 = DefaultMode; +defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>; + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Implied +// wave64. def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; // gfx1250, has alignment requirement but no AGPRs. -def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; +def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>; +def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>; + +// FIXME: This should be able to only define a separate hwmode that +// only depends on wavesize for just ValueTypes. These use different +// HwMode namespaces. If we don't define the full set of modes used +// for RegClassByHwMode, tablegen crashes for some reason +def WaveSizeVT : ValueTypeByHwMode<[ + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>; // Include AMDGPU TD files diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index c28c25f..2bdadda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -65,7 +65,7 @@ recursivelyVisitUsers(GlobalValue &GV, continue; if (Instruction *I = dyn_cast<Instruction>(U)) { - Function *F = I->getParent()->getParent(); + Function *F = I->getFunction(); if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) { // FIXME: This is a horrible hack. We should always respect noinline, // and just let us hit the error when we can't handle this. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index dda8033..346e257 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -19,7 +19,7 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-argument-reg-usage-info" -INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE, +INITIALIZE_PASS(AMDGPUArgumentUsageInfoWrapperLegacy, DEBUG_TYPE, "Argument Register Usage Information Storage", false, true) void ArgDescriptor::print(raw_ostream &OS, @@ -42,7 +42,7 @@ void ArgDescriptor::print(raw_ostream &OS, OS << '\n'; } -char AMDGPUArgumentUsageInfo::ID = 0; +char AMDGPUArgumentUsageInfoWrapperLegacy::ID = 0; const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; @@ -50,15 +50,6 @@ const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo = AMDGPUFunctionArgInfo::fixedABILayout(); -bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) { - return false; -} - -bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) { - ArgInfoMap.clear(); - return false; -} - // TODO: Print preload kernargs? void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { for (const auto &FI : ArgInfoMap) { @@ -86,6 +77,12 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { } } +bool AMDGPUArgumentUsageInfo::invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &) { + auto PAC = PA.getChecker<AMDGPUArgumentUsageAnalysis>(); + return !PAC.preservedWhenStateless(); +} + std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> AMDGPUFunctionArgInfo::getPreloadedValue( AMDGPUFunctionArgInfo::PreloadedValue Value) const { @@ -191,3 +188,10 @@ AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { return FixedABIFunctionInfo; return I->second; } + +AnalysisKey AMDGPUArgumentUsageAnalysis::Key; + +AMDGPUArgumentUsageInfo +AMDGPUArgumentUsageAnalysis::run(Module &M, ModuleAnalysisManager &) { + return AMDGPUArgumentUsageInfo(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 1064e57..f41739a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -12,10 +12,15 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/Register.h" +#include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include <variant> namespace llvm { +void initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(PassRegistry &); + class Function; class LLT; class raw_ostream; @@ -27,55 +32,44 @@ private: friend struct AMDGPUFunctionArgInfo; friend class AMDGPUArgumentUsageInfo; - union { - MCRegister Reg; - unsigned StackOffset; - }; + std::variant<std::monostate, MCRegister, unsigned> Val; // Bitmask to locate argument within the register. 
unsigned Mask; - bool IsStack : 1; - bool IsSet : 1; - public: - ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false, - bool IsSet = false) - : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + ArgDescriptor(unsigned Mask = ~0u) : Mask(Mask) {} static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { - return ArgDescriptor(Reg, Mask, false, true); + ArgDescriptor Ret(Mask); + Ret.Val = Reg.asMCReg(); + return Ret; } static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { - return ArgDescriptor(Offset, Mask, true, true); + ArgDescriptor Ret(Mask); + Ret.Val = Offset; + return Ret; } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { - return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); + // Copy the descriptor, then change the mask. + ArgDescriptor Ret(Arg); + Ret.Mask = Mask; + return Ret; } - bool isSet() const { - return IsSet; - } + bool isSet() const { return !std::holds_alternative<std::monostate>(Val); } explicit operator bool() const { return isSet(); } - bool isRegister() const { - return !IsStack; - } + bool isRegister() const { return std::holds_alternative<MCRegister>(Val); } - MCRegister getRegister() const { - assert(!IsStack); - return Reg; - } + MCRegister getRegister() const { return std::get<MCRegister>(Val); } - unsigned getStackOffset() const { - assert(IsStack); - return StackOffset; - } + unsigned getStackOffset() const { return std::get<unsigned>(Val); } unsigned getMask() const { // None of the target SGPRs or VGPRs are expected to have a 'zero' mask. @@ -96,7 +90,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { } struct KernArgPreloadDescriptor : public ArgDescriptor { - KernArgPreloadDescriptor() {} + KernArgPreloadDescriptor() = default; SmallVector<MCRegister> Regs; }; @@ -178,32 +172,70 @@ struct AMDGPUFunctionArgInfo { static AMDGPUFunctionArgInfo fixedABILayout(); }; -class AMDGPUArgumentUsageInfo : public ImmutablePass { +class AMDGPUArgumentUsageInfo { private: DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; public: - static char ID; - static const AMDGPUFunctionArgInfo ExternFunctionInfo; static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; - AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } + void print(raw_ostream &OS, const Module *M = nullptr) const; + + void clear() { ArgInfoMap.clear(); } + + void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { + ArgInfoMap[&F] = ArgInfo; + } + + const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; + + bool invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv); +}; + +class AMDGPUArgumentUsageInfoWrapperLegacy : public ImmutablePass { + std::unique_ptr<AMDGPUArgumentUsageInfo> AUIP; + +public: + static char ID; + + AMDGPUArgumentUsageInfoWrapperLegacy() : ImmutablePass(ID) { + initializeAMDGPUArgumentUsageInfoWrapperLegacyPass( + *PassRegistry::getPassRegistry()); + } + + AMDGPUArgumentUsageInfo &getArgUsageInfo() { return *AUIP; } + const AMDGPUArgumentUsageInfo &getArgUsageInfo() const { return *AUIP; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } - bool doInitialization(Module &M) override; - bool doFinalization(Module &M) override; + bool doInitialization(Module &M) override { + AUIP = std::make_unique<AMDGPUArgumentUsageInfo>(); + return false; + } - void print(raw_ostream &OS, const Module *M = nullptr) const override; + bool doFinalization(Module 
&M) override { + AUIP->clear(); + return false; + } - void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { - ArgInfoMap[&F] = ArgInfo; + void print(raw_ostream &OS, const Module *M = nullptr) const override { + AUIP->print(OS, M); } +}; - const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; +class AMDGPUArgumentUsageAnalysis + : public AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis> { + friend AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis>; + static AnalysisKey Key; + +public: + using Result = AMDGPUArgumentUsageInfo; + + AMDGPUArgumentUsageInfo run(Module &M, ModuleAnalysisManager &); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp index 93732a7..9af3b05 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp @@ -208,7 +208,8 @@ void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Value *NumBits = IRB.CreateTypeSize(IntptrTy, TypeStoreSize); Value *Size = IRB.CreateLShr(NumBits, ConstantInt::get(IntptrTy, 3)); Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy); - Value *SizeMinusOne = IRB.CreateAdd(Size, ConstantInt::get(IntptrTy, -1)); + Value *SizeMinusOne = + IRB.CreateAdd(Size, ConstantInt::getAllOnesValue(IntptrTy)); Value *LastByte = IRB.CreateIntToPtr(IRB.CreateAdd(AddrLong, SizeMinusOne), AddrTy); instrumentAddressImpl(M, IRB, OrigIns, InsertBefore, Addr, {}, 8, IsWrite, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 29f8f9b..d0c86a7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -358,6 +358,32 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) { return AsmPrinter::doInitialization(M); } +/// Mimics GCNSubtarget::computeOccupancy for MCExpr. +/// +/// Remove dependency on GCNSubtarget and depend only only the necessary values +/// for said occupancy computation. Should match computeOccupancy implementation +/// without passing \p STM on. 
+const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + unsigned DynamicVGPRBlockSize, + const GCNSubtarget &STM, MCContext &Ctx) { + unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); + unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); + unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); + unsigned Generation = STM.getGeneration(); + + auto CreateExpr = [&Ctx](unsigned Value) { + return MCConstantExpr::create(Value, Ctx); + }; + + return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy, + {CreateExpr(MaxWaves), CreateExpr(Granule), + CreateExpr(TargetTotalNumVGPRs), + CreateExpr(Generation), CreateExpr(InitOcc), + NumSGPRs, NumVGPRs}, + Ctx); +} + void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())) return; @@ -459,7 +485,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { MaxWaves, MFI.getDynamicVGPRBlockSize())}); uint64_t NumSGPRsForWavesPerEU = std::max( {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); - const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( + const MCExpr *OccupancyExpr = createOccupancy( STM.getOccupancyWithWorkGroupSizes(*MF).second, MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), @@ -508,9 +534,9 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { MCSectionELF *MaxGPRSection = OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0); OutStreamer->switchSection(MaxGPRSection); - getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext), - RI.getMaxAGPRSymbol(OutContext), - RI.getMaxSGPRSymbol(OutContext)); + getTargetStreamer()->EmitMCResourceMaximums( + RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext), + RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext)); OutStreamer->popSection(); for (Function &F : M.functions()) @@ -1160,21 +1186,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Make clamp modifier on NaN input returns 0. ProgInfo.DX10Clamp = Mode.DX10Clamp; - unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { - // LDS is allocated in 256 dword blocks. - LDSAlignShift = 10; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize163840)) { - // LDS is allocated in 320 dword blocks. + unsigned LDSAlignShift = 8; + switch (getLdsDwGranularity(STM)) { + case 512: + case 320: LDSAlignShift = 11; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize65536)) { - // LDS is allocated in 128 dword blocks. + break; + case 128: LDSAlignShift = 9; - } else { - // LDS is allocated in 64 dword blocks. 
+ break; + case 64: LDSAlignShift = 8; + break; + default: + llvm_unreachable("invald LDS block size"); } ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); @@ -1270,7 +1295,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); - ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( + ProgInfo.Occupancy = createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, MFI->getDynamicVGPRBlockSize(), STM, Ctx); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 0a163f8..784ee36 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -589,7 +589,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively( // return the next active lane auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1); - auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1)); + auto *InverseMask = B.CreateXor(Mask, ConstantInt::getAllOnesValue(WaveTy)); auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask); ActiveBits->addIncoming(NewActiveBits, ComputeLoop); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9907c88f..d3505cf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -38,9 +38,10 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - NOT_IMPLICIT_INPUT = 0, + UNKNOWN_INTRINSIC = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, + NOT_IMPLICIT_INPUT }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return NOT_IMPLICIT_INPUT; + return UNKNOWN_INTRINSIC; } } @@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); + + if (AttrMask == UNKNOWN_INTRINSIC) { + // Assume not-nocallback intrinsics may invoke a function which accesses + // implicit arguments. + // + // FIXME: This isn't really the correct check. We want to ensure it + // isn't calling any function that may use implicit arguments regardless + // of whether it's internal to the module or not. + // + // TODO: Ignoring callsite attributes. + if (!Callee->hasFnAttribute(Attribute::NoCallback)) + return indicatePessimisticFixpoint(); + continue; + } + if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - return true; + + // Assume !nocallback intrinsics may call a function which requires + // AGPRs. 
+ return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes @@ -1555,7 +1574,7 @@ private: AMDGPU::ClusterDimsAttr Attr; - static constexpr const char AttrName[] = "amdgpu-cluster-dims"; + static constexpr char AttrName[] = "amdgpu-cluster-dims"; }; AAAMDGPUClusterDims & @@ -1584,7 +1603,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID, - &AAAMDGPUClusterDims::ID}); + &AAAMDGPUClusterDims::ID, &AAAlign::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; @@ -1642,6 +1661,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, if (Ptr) { A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr)); A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr)); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) { + if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc) + A.getOrCreateAAFor<AAAlign>(IRPosition::value(*Ptr)); + } } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp index 30a1f05..2e586ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -27,8 +27,17 @@ using namespace llvm; namespace { class BarrierLatency : public ScheduleDAGMutation { +private: + SmallSet<SyncScope::ID, 4> IgnoredScopes; + public: - BarrierLatency() = default; + BarrierLatency(MachineFunction *MF) { + LLVMContext &Context = MF->getFunction().getContext(); + IgnoredScopes.insert(SyncScope::SingleThread); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as")); + } void apply(ScheduleDAGInstrs *DAG) override; }; @@ -40,8 +49,11 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { continue; // Update latency on barrier edges of ATOMIC_FENCE. - // We don't consider the scope of the fence or type of instruction - // involved in the barrier edge. + // Ignore scopes not expected to have any latency. 
+ SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); + if (IgnoredScopes.contains(SSID)) + continue; + for (SDep &PredDep : SU.Preds) { if (!PredDep.isBarrier()) continue; @@ -68,6 +80,6 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { } // end namespace std::unique_ptr<ScheduleDAGMutation> -llvm::createAMDGPUBarrierLatencyDAGMutation() { - return std::make_unique<BarrierLatency>(); +llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { + return std::make_unique<BarrierLatency>(MF); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h index c23f0b9..547cd2a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h @@ -14,7 +14,10 @@ namespace llvm { -std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation(); +class MachineFunction; + +std::unique_ptr<ScheduleDAGMutation> +createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 7afadde..682f1aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #define DEBUG_TYPE "amdgpu-call-lowering" @@ -414,7 +415,8 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg, MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getDataLayout(); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + MachinePointerInfo PtrInfo = TLI.getKernargSegmentPtrInfo(MF); LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8e35ba7..71ea9ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -143,14 +143,6 @@ public: bool canBreakPHINode(const PHINode &I); - /// \returns True if binary operation \p I is a signed binary operation, false - /// otherwise. - bool isSigned(const BinaryOperator &I) const; - - /// \returns True if the condition of 'select' operation \p I comes from a - /// signed 'icmp' operation, false otherwise. - bool isSigned(const SelectInst &I) const; - /// Return true if \p T is a legal scalar floating point type. 
bool isLegalFloatingTy(const Type *T) const; @@ -304,16 +296,6 @@ bool AMDGPUCodeGenPrepareImpl::run() { return MadeChange; } -bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { - return I.getOpcode() == Instruction::AShr || - I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; -} - -bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { - return isa<ICmpInst>(I.getOperand(0)) && - cast<ICmpInst>(I.getOperand(0))->isSigned(); -} - bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { return Ty->isFloatTy() || Ty->isDoubleTy() || (Ty->isHalfTy() && ST.has16BitInsts()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index bb4bf74..55ce4f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -288,6 +288,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>; // FIXME: Check MMO is atomic def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>; def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>; +def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>; +def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>; def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>; def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>; @@ -308,6 +310,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32, SIbuffer_atomic_csub>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 5700468..85addb13a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -1968,7 +1968,7 @@ private: int NumBits = 0; auto TRI = TII->getRegisterInfo(); - auto &MRI = MI->getParent()->getParent()->getRegInfo(); + auto &MRI = MI->getMF()->getRegInfo(); for (auto &Elt : Collection) { auto Op = Elt->getInstr()->getOperand(0); auto Size = @@ -2183,7 +2183,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); // Interleave MFMA with DS_READ prefetch - for (unsigned I = 0; I < DSRCount - 4; ++I) { + for (unsigned I = 4; I < DSRCount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); @@ -2196,7 +2196,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( // Phase 2a: Loop carried dependency with V_PERM // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they // depend on. Interleave MFMA to keep XDL unit busy throughout. 
- for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { + for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); @@ -2233,7 +2233,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( // Phase 2b: Loop carried dependency without V_PERM // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. // Interleave MFMA to keep XDL unit busy throughout. - for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { + for (unsigned I = DSWWithPermCount; I < DSWCount; I++) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index aff7096..0688f07 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -11,7 +11,6 @@ #include "llvm/CodeGen/ScheduleDAGMutation.h" #include <memory> -#include <vector> namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d..d0835a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -134,7 +134,7 @@ static SDValue stripExtractLoElt(SDValue In) { INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) -INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) +INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfoWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) #ifdef EXPENSIVE_CHECKS @@ -238,7 +238,7 @@ bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) { } void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AMDGPUArgumentUsageInfo>(); + AU.addRequired<AMDGPUArgumentUsageInfoWrapperLegacy>(); AU.addRequired<UniformityInfoWrapperPass>(); #ifdef EXPENSIVE_CHECKS AU.addRequired<DominatorTreeWrapperPass>(); @@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); } +SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N, + SelectionDAG &DAG) const { + // TODO: Handle undef as zero + + assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + SDLoc SL(N); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); + return DAG.getMachineNode( + isVGPRImm(N) ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL, + N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); + } + + return nullptr; +} + void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); @@ -708,10 +726,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); assert(VT.getVectorElementType().bitsEq(MVT::i32)); - unsigned RegClassID = - SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID(); - SelectBuildVector(N, RegClassID); + const TargetRegisterClass *RegClass = + N->isDivergent() + ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32) + : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32); + + SelectBuildVector(N, RegClass->getID()); return; } case ISD::VECTOR_SHUFFLE: @@ -1828,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. - // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0. - - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector<SDValue, 3> Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. + bool IsInBounds = + Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? 
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. + + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector<SDValue, 3> Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarry()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable? 
+ SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } @@ -3047,9 +3080,38 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); const unsigned Opc = gwsIntrinToOpcode(IntrID); + + const MCInstrDesc &InstrDesc = TII->get(Opc); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + + const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx); + SmallVector<SDValue, 5> Ops; - if (HasVSrc) - Ops.push_back(N->getOperand(2)); + if (HasVSrc) { + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + + SDValue Data = N->getOperand(2); + MVT DataVT = Data.getValueType().getSimpleVT(); + if (TRI->isTypeLegalForClass(*DataRC, DataVT)) { + // Normal 32-bit case. + Ops.push_back(N->getOperand(2)); + } else { + // Operand is really 32-bits, but requires 64-bit alignment, so use the + // even aligned 64-bit register class. + const SDValue RegSeqOps[] = { + CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data, + CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32), + 0), + CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)}; + + Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + SL, MVT::v2i32, RegSeqOps), + 0)); + } + } + Ops.push_back(OffsetField); Ops.push_back(Chain); @@ -4387,16 +4449,23 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const { const auto *Ld = cast<LoadSDNode>(N); - const MachineMemOperand *MMO = Ld->getMemOperand(); - if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO)) + + // FIXME: We ought to able able to take the direct isDivergent result. We + // cannot rely on the MMO for a uniformity check, and should stop using + // it. This is a hack for 2 ways that the IR divergence analysis is superior + // to the DAG divergence: Recognizing shift-of-workitem-id as always + // uniform, and isSingleLaneExecution. These should be handled in the DAG + // version, and then this can be dropped. 
+ if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO)) return false; return MMO->getSize().hasValue() && Ld->getAlign() >= Align(std::min(MMO->getSize().getValue().getKnownMinValue(), uint64_t(4))) && - ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + (MMO->isInvariant() || + (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 4fa0d3f..a86b754 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H +#include "AMDGPUSelectionDAGInfo.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIModeRegisterDefaults.h" @@ -45,21 +46,6 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { return false; } -// TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); - uint32_t LHSVal, RHSVal; - if (getConstantValue(N->getOperand(0), LHSVal) && - getConstantValue(N->getOperand(1), RHSVal)) { - SDLoc SL(N); - uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); - return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), - DAG.getTargetConstant(K, SL, MVT::i32)); - } - - return nullptr; -} - /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -115,6 +101,8 @@ private: MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; + SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; SDNode *glueCopyToM0LDSInit(SDNode *N) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a6..ff17833 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -17,6 +17,7 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUMachineFunction.h" #include "AMDGPUMemoryUtils.h" +#include "AMDGPUSelectionDAGInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" @@ -59,8 +60,9 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { } AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, - const AMDGPUSubtarget &STI) - : TargetLowering(TM), Subtarget(&STI) { + const TargetSubtargetInfo &STI, + const AMDGPUSubtarget &AMDGPUSTI) + : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) { // Always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather then generating calls to memset, mempcy or memmove. 
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U; @@ -336,6 +338,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand); @@ -502,9 +505,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // The hardware supports 32-bit FSHR, but not FSHL. setOperationAction(ISD::FSHR, MVT::i32, Legal); - // The hardware supports 32-bit ROTR, but not ROTL. - setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); + setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand); setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand); @@ -1216,7 +1217,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( const SmallVectorImpl<ISD::InputArg> &Ins) const { const MachineFunction &MF = State.getMachineFunction(); const Function &Fn = MF.getFunction(); - LLVMContext &Ctx = Fn.getParent()->getContext(); + LLVMContext &Ctx = Fn.getContext(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(); CallingConv::ID CC = Fn.getCallingConv(); @@ -1248,7 +1249,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( SmallVector<EVT, 16> ValueVTs; SmallVector<uint64_t, 16> Offsets; - ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr, + &Offsets, ArgOffset); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { @@ -1409,7 +1411,12 @@ SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, InVals.push_back(DAG.getPOISON(Arg.VT)); } - return DAG.getEntryNode(); + // FIXME: Hack because R600 doesn't handle callseq pseudos yet. 
+ if (getTargetMachine().getTargetTriple().getArch() == Triple::r600) + return CLI.Chain; + + SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL); + return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL); } SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, @@ -1885,14 +1892,14 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, Align BaseAlign = Load->getAlign(); Align HiAlign = commonAlignment(BaseAlign, Size); - SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, - Load->getChain(), BasePtr, SrcValue, LoMemVT, - BaseAlign, Load->getMemOperand()->getFlags()); + SDValue LoLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, + LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo()); SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size)); - SDValue HiLoad = - DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), - HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), - HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); + SDValue HiLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign, + Load->getMemOperand()->getFlags(), Load->getAAInfo()); SDValue Join; if (LoVT == HiVT) { @@ -1980,10 +1987,10 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, - Store->getMemOperand()->getFlags()); - SDValue HiStore = - DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), - HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); + Store->getMemOperand()->getFlags(), Store->getAAInfo()); + SDValue HiStore = DAG.getTruncStore( + Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign, + Store->getMemOperand()->getFlags(), Store->getAAInfo()); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } @@ -2764,7 +2771,6 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, EVT VT = Op.getValueType(); SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; assert(IsLog10 || Op.getOpcode() == ISD::FLOG); @@ -2803,7 +2809,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); - + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. + Flags.setAllowContract(false); R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); @@ -2826,7 +2834,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); - + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. 
+ Flags.setAllowContract(false); SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags); @@ -2950,19 +2960,28 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); } +SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL, + SelectionDAG &DAG, + SDNodeFlags Flags, + bool IsExp10) const { + // exp(x) -> exp2(M_LOG2E_F * x); + // exp10(x) -> exp2(log2(10) * x); + EVT VT = X.getValueType(); + SDValue Const = + DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags); + return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP + : (unsigned)ISD::FEXP2, + SL, VT, Mul, Flags); +} + SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const { EVT VT = X.getValueType(); - const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); - - if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { - // exp2(M_LOG2E_F * f); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags); - return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP - : (unsigned)ISD::FEXP2, - SL, VT, Mul, Flags); - } + if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) + return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false); EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); @@ -2976,6 +2995,7 @@ SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, SDValue AdjustedX = DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); + const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags); SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags); @@ -2994,6 +3014,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const { const EVT VT = X.getValueType(); + const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP) : static_cast<unsigned>(ISD::FEXP2); @@ -3050,33 +3071,32 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { SDNodeFlags Flags = Op->getFlags(); const bool IsExp10 = Op.getOpcode() == ISD::FEXP10; - if (VT.getScalarType() == MVT::f16) { - // v_exp_f16 (fmul x, log2e) - if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast? - return lowerFEXPUnsafe(X, SL, DAG, Flags); + // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying + // library behavior. Also, is known-not-daz source sufficient? + if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast? + return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags) + : lowerFEXPUnsafe(X, SL, DAG, Flags); + } + if (VT.getScalarType() == MVT::f16) { if (VT.isVector()) return SDValue(); + // Nothing in half is a denormal when promoted to f32. + // // exp(f16 x) -> // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) - - // Nothing in half is a denormal when promoted to f32. 
+ // + // exp10(f16 x) -> + // fptrunc (v_exp_f32 (fmul (fpext x), log2(10))) SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags); - SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags); + SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10); return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered, DAG.getTargetConstant(0, SL, MVT::i32), Flags); } assert(VT == MVT::f32); - // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying - // library behavior. Also, is known-not-daz source sufficient? - if (allowApproxFunc(DAG, Flags)) { - return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags) - : lowerFEXPUnsafe(X, SL, DAG, Flags); - } - // Algorithm: // // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) @@ -5649,169 +5669,6 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); } -#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; - -const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((AMDGPUISD::NodeType)Opcode) { - case AMDGPUISD::FIRST_NUMBER: break; - // AMDIL DAG nodes - NODE_NAME_CASE(BRANCH_COND); - - // AMDGPU DAG nodes - NODE_NAME_CASE(IF) - NODE_NAME_CASE(ELSE) - NODE_NAME_CASE(LOOP) - NODE_NAME_CASE(CALL) - NODE_NAME_CASE(TC_RETURN) - NODE_NAME_CASE(TC_RETURN_GFX) - NODE_NAME_CASE(TC_RETURN_GFX_WholeWave) - NODE_NAME_CASE(TC_RETURN_CHAIN) - NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR) - NODE_NAME_CASE(TRAP) - NODE_NAME_CASE(RET_GLUE) - NODE_NAME_CASE(WAVE_ADDRESS) - NODE_NAME_CASE(RETURN_TO_EPILOG) - NODE_NAME_CASE(ENDPGM) - NODE_NAME_CASE(ENDPGM_TRAP) - NODE_NAME_CASE(SIMULATED_TRAP) - NODE_NAME_CASE(DWORDADDR) - NODE_NAME_CASE(FRACT) - NODE_NAME_CASE(SETCC) - NODE_NAME_CASE(DENORM_MODE) - NODE_NAME_CASE(FMA_W_CHAIN) - NODE_NAME_CASE(FMUL_W_CHAIN) - NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(COS_HW) - NODE_NAME_CASE(SIN_HW) - NODE_NAME_CASE(FMAX_LEGACY) - NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(FMAX3) - NODE_NAME_CASE(SMAX3) - NODE_NAME_CASE(UMAX3) - NODE_NAME_CASE(FMIN3) - NODE_NAME_CASE(SMIN3) - NODE_NAME_CASE(UMIN3) - NODE_NAME_CASE(FMED3) - NODE_NAME_CASE(SMED3) - NODE_NAME_CASE(UMED3) - NODE_NAME_CASE(FMAXIMUM3) - NODE_NAME_CASE(FMINIMUM3) - NODE_NAME_CASE(FDOT2) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DIV_SCALE) - NODE_NAME_CASE(DIV_FMAS) - NODE_NAME_CASE(DIV_FIXUP) - NODE_NAME_CASE(FMAD_FTZ) - NODE_NAME_CASE(RCP) - NODE_NAME_CASE(RSQ) - NODE_NAME_CASE(RCP_LEGACY) - NODE_NAME_CASE(RCP_IFLAG) - NODE_NAME_CASE(LOG) - NODE_NAME_CASE(EXP) - NODE_NAME_CASE(FMUL_LEGACY) - NODE_NAME_CASE(RSQ_CLAMP) - NODE_NAME_CASE(FP_CLASS) - NODE_NAME_CASE(DOT4) - NODE_NAME_CASE(CARRY) - NODE_NAME_CASE(BORROW) - NODE_NAME_CASE(BFE_U32) - NODE_NAME_CASE(BFE_I32) - NODE_NAME_CASE(BFI) - NODE_NAME_CASE(BFM) - NODE_NAME_CASE(FFBH_U32) - NODE_NAME_CASE(FFBH_I32) - NODE_NAME_CASE(FFBL_B32) - NODE_NAME_CASE(MUL_U24) - NODE_NAME_CASE(MUL_I24) - NODE_NAME_CASE(MULHI_U24) - NODE_NAME_CASE(MULHI_I24) - NODE_NAME_CASE(MAD_U24) - NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(MAD_I64_I32) - NODE_NAME_CASE(MAD_U64_U32) - NODE_NAME_CASE(PERM) - NODE_NAME_CASE(TEXTURE_FETCH) - NODE_NAME_CASE(R600_EXPORT) - NODE_NAME_CASE(CONST_ADDRESS) - NODE_NAME_CASE(REGISTER_LOAD) - NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(CVT_F32_UBYTE0) - NODE_NAME_CASE(CVT_F32_UBYTE1) - NODE_NAME_CASE(CVT_F32_UBYTE2) - NODE_NAME_CASE(CVT_F32_UBYTE3) - NODE_NAME_CASE(CVT_PKRTZ_F16_F32) - NODE_NAME_CASE(CVT_PKNORM_I16_F32) - NODE_NAME_CASE(CVT_PKNORM_U16_F32) - 
NODE_NAME_CASE(CVT_PK_I16_I32) - NODE_NAME_CASE(CVT_PK_U16_U32) - NODE_NAME_CASE(FP_TO_FP16) - NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) - NODE_NAME_CASE(CONST_DATA_PTR) - NODE_NAME_CASE(PC_ADD_REL_OFFSET) - NODE_NAME_CASE(PC_ADD_REL_OFFSET64) - NODE_NAME_CASE(LDS) - NODE_NAME_CASE(DUMMY_CHAIN) - NODE_NAME_CASE(LOAD_D16_HI) - NODE_NAME_CASE(LOAD_D16_LO) - NODE_NAME_CASE(LOAD_D16_HI_I8) - NODE_NAME_CASE(LOAD_D16_HI_U8) - NODE_NAME_CASE(LOAD_D16_LO_I8) - NODE_NAME_CASE(LOAD_D16_LO_U8) - NODE_NAME_CASE(STORE_MSKOR) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) - NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) - NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) - NODE_NAME_CASE(DS_ORDERED_COUNT) - NODE_NAME_CASE(ATOMIC_CMP_SWAP) - NODE_NAME_CASE(BUFFER_LOAD) - NODE_NAME_CASE(BUFFER_LOAD_UBYTE) - NODE_NAME_CASE(BUFFER_LOAD_USHORT) - NODE_NAME_CASE(BUFFER_LOAD_BYTE) - NODE_NAME_CASE(BUFFER_LOAD_SHORT) - NODE_NAME_CASE(BUFFER_LOAD_TFE) - NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE) - NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE) - NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) - NODE_NAME_CASE(SBUFFER_LOAD) - NODE_NAME_CASE(SBUFFER_LOAD_BYTE) - NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) - NODE_NAME_CASE(SBUFFER_LOAD_SHORT) - NODE_NAME_CASE(SBUFFER_LOAD_USHORT) - NODE_NAME_CASE(SBUFFER_PREFETCH_DATA) - NODE_NAME_CASE(BUFFER_STORE) - NODE_NAME_CASE(BUFFER_STORE_BYTE) - NODE_NAME_CASE(BUFFER_STORE_SHORT) - NODE_NAME_CASE(BUFFER_STORE_FORMAT) - NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) - NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) - NODE_NAME_CASE(BUFFER_ATOMIC_ADD) - NODE_NAME_CASE(BUFFER_ATOMIC_SUB) - NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_AND) - NODE_NAME_CASE(BUFFER_ATOMIC_OR) - NODE_NAME_CASE(BUFFER_ATOMIC_XOR) - NODE_NAME_CASE(BUFFER_ATOMIC_INC) - NODE_NAME_CASE(BUFFER_ATOMIC_DEC) - NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) - NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) - NODE_NAME_CASE(BUFFER_ATOMIC_FADD) - NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) - NODE_NAME_CASE(WHOLE_WAVE_SETUP) - NODE_NAME_CASE(WHOLE_WAVE_RETURN) - } - return nullptr; -} - SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index bdaf486..10ae816 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -78,6 +78,9 @@ protected: bool IsLog10, SDNodeFlags Flags) const; SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, + SDNodeFlags Flags, bool IsExp10) const; + SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const; SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, @@ -180,7 +183,8 @@ protected: const SmallVectorImpl<ISD::InputArg> &Ins) const; public: - AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); + AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, + const AMDGPUSubtarget &AMDGPUSTI); bool mayIgnoreSignedZero(SDValue Op) const; @@ -280,8 +284,6 @@ public: SDValue RHS, SDValue True, 
SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - const char* getTargetNodeName(unsigned Opcode) const override; - // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for // AMDGPU. Commit r319036, // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6) @@ -406,235 +408,6 @@ public: } }; -namespace AMDGPUISD { - -enum NodeType : unsigned { - // AMDIL ISD Opcodes - FIRST_NUMBER = ISD::BUILTIN_OP_END, - BRANCH_COND, - // End AMDIL ISD Opcodes - - // Function call. - CALL, - TC_RETURN, - TC_RETURN_GFX, - TC_RETURN_GFX_WholeWave, - TC_RETURN_CHAIN, - TC_RETURN_CHAIN_DVGPR, - TRAP, - - // Masked control flow nodes. - IF, - ELSE, - LOOP, - - // A uniform kernel return that terminates the wavefront. - ENDPGM, - - // s_endpgm, but we may want to insert it in the middle of the block. - ENDPGM_TRAP, - - // "s_trap 2" equivalent on hardware that does not support it. - SIMULATED_TRAP, - - // Return to a shader part's epilog code. - RETURN_TO_EPILOG, - - // Return with values from a non-entry function. - RET_GLUE, - - // Convert a unswizzled wave uniform stack address to an address compatible - // with a vector offset for use in stack access. - WAVE_ADDRESS, - - DWORDADDR, - FRACT, - - /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output - /// modifier behavior with dx10_enable. - CLAMP, - - // This is SETCC with the full mask result which is used for a compare with a - // result bit per item in the wavefront. - SETCC, - - DENORM_MODE, - - // FP ops with input and output chain. - FMA_W_CHAIN, - FMUL_W_CHAIN, - - // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. - // Denormals handled on some parts. - COS_HW, - SIN_HW, - FMAX_LEGACY, - FMIN_LEGACY, - - FMAX3, - SMAX3, - UMAX3, - FMIN3, - SMIN3, - UMIN3, - FMED3, - SMED3, - UMED3, - FMAXIMUM3, - FMINIMUM3, - FDOT2, - URECIP, - DIV_SCALE, - DIV_FMAS, - DIV_FIXUP, - // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is - // treated as an illegal operation. - FMAD_FTZ, - - // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. - // For f64, max error 2^29 ULP, handles denormals. - RCP, - RSQ, - RCP_LEGACY, - RCP_IFLAG, - - // log2, no denormal handling for f32. - LOG, - - // exp2, no denormal handling for f32. - EXP, - - FMUL_LEGACY, - RSQ_CLAMP, - FP_CLASS, - DOT4, - CARRY, - BORROW, - BFE_U32, // Extract range of bits with zero extension to 32-bits. - BFE_I32, // Extract range of bits with sign extension to 32-bits. - BFI, // (src0 & src1) | (~src0 & src2) - BFM, // Insert a range of bits into a 32-bit word. - FFBH_U32, // ctlz with -1 if input is zero. - FFBH_I32, - FFBL_B32, // cttz with -1 if input is zero. - MUL_U24, - MUL_I24, - MULHI_U24, - MULHI_I24, - MAD_U24, - MAD_I24, - MAD_U64_U32, - MAD_I64_I32, - PERM, - TEXTURE_FETCH, - R600_EXPORT, - CONST_ADDRESS, - REGISTER_LOAD, - REGISTER_STORE, - - // These cvt_f32_ubyte* nodes need to remain consecutive and in order. - CVT_F32_UBYTE0, - CVT_F32_UBYTE1, - CVT_F32_UBYTE2, - CVT_F32_UBYTE3, - - // Convert two float 32 numbers into a single register holding two packed f16 - // with round to zero. - CVT_PKRTZ_F16_F32, - CVT_PKNORM_I16_F32, - CVT_PKNORM_U16_F32, - CVT_PK_I16_I32, - CVT_PK_U16_U32, - - // Same as the standard node, except the high bits of the resulting integer - // are known 0. - FP_TO_FP16, - - /// This node is for VLIW targets and it is used to represent a vector - /// that is stored in consecutive registers with the same channel. 
- /// For example: - /// |X |Y|Z|W| - /// T0|v.x| | | | - /// T1|v.y| | | | - /// T2|v.z| | | | - /// T3|v.w| | | | - BUILD_VERTICAL_VECTOR, - /// Pointer to the start of the shader's constant data. - CONST_DATA_PTR, - PC_ADD_REL_OFFSET, - PC_ADD_REL_OFFSET64, - LDS, - - DUMMY_CHAIN, - - FIRST_MEMORY_OPCODE, - LOAD_D16_HI = FIRST_MEMORY_OPCODE, - LOAD_D16_LO, - LOAD_D16_HI_I8, - LOAD_D16_HI_U8, - LOAD_D16_LO_I8, - LOAD_D16_LO_U8, - - STORE_MSKOR, - TBUFFER_STORE_FORMAT, - TBUFFER_STORE_FORMAT_D16, - TBUFFER_LOAD_FORMAT, - TBUFFER_LOAD_FORMAT_D16, - DS_ORDERED_COUNT, - ATOMIC_CMP_SWAP, - BUFFER_LOAD, - BUFFER_LOAD_UBYTE, - BUFFER_LOAD_USHORT, - BUFFER_LOAD_BYTE, - BUFFER_LOAD_SHORT, - BUFFER_LOAD_TFE, - BUFFER_LOAD_UBYTE_TFE, - BUFFER_LOAD_USHORT_TFE, - BUFFER_LOAD_BYTE_TFE, - BUFFER_LOAD_SHORT_TFE, - BUFFER_LOAD_FORMAT, - BUFFER_LOAD_FORMAT_TFE, - BUFFER_LOAD_FORMAT_D16, - SBUFFER_LOAD, - SBUFFER_LOAD_BYTE, - SBUFFER_LOAD_UBYTE, - SBUFFER_LOAD_SHORT, - SBUFFER_LOAD_USHORT, - SBUFFER_PREFETCH_DATA, - BUFFER_STORE, - BUFFER_STORE_BYTE, - BUFFER_STORE_SHORT, - BUFFER_STORE_FORMAT, - BUFFER_STORE_FORMAT_D16, - BUFFER_ATOMIC_SWAP, - BUFFER_ATOMIC_ADD, - BUFFER_ATOMIC_SUB, - BUFFER_ATOMIC_SMIN, - BUFFER_ATOMIC_UMIN, - BUFFER_ATOMIC_SMAX, - BUFFER_ATOMIC_UMAX, - BUFFER_ATOMIC_AND, - BUFFER_ATOMIC_OR, - BUFFER_ATOMIC_XOR, - BUFFER_ATOMIC_INC, - BUFFER_ATOMIC_DEC, - BUFFER_ATOMIC_CMPSWAP, - BUFFER_ATOMIC_CSUB, - BUFFER_ATOMIC_FADD, - BUFFER_ATOMIC_FMIN, - BUFFER_ATOMIC_FMAX, - BUFFER_ATOMIC_COND_SUB_U32, - LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32, - - // Set up a whole wave function. - WHOLE_WAVE_SETUP, - - // Return from a whole wave function. - WHOLE_WAVE_RETURN, -}; - -} // End namespace AMDGPUISD - } // End namespace llvm #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 9a90787..5f4ca82 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -221,7 +221,7 @@ public: }; // A map from regunits to the delay info for that regunit. - struct DelayState : DenseMap<unsigned, DelayInfo> { + struct DelayState : DenseMap<MCRegUnit, DelayInfo> { // Merge another DelayState into this one by merging the delay info for each // regunit. void merge(const DelayState &RHS) { @@ -359,7 +359,8 @@ public: bool Changed = false; MachineInstr *LastDelayAlu = nullptr; - MCRegUnit LastSGPRFromVALU = 0; + // FIXME: 0 is a valid register unit. + MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0); // Iterate over the contents of bundles, but don't emit any instructions // inside a bundle. for (auto &MI : MBB.instrs()) { @@ -379,7 +380,8 @@ public: if (It != State.end()) { DelayInfo Info = It->getSecond(); State.advanceByVALUNum(Info.VALUNum); - LastSGPRFromVALU = 0; + // FIXME: 0 is a valid register unit. 
+ LastSGPRFromVALU = static_cast<MCRegUnit>(0); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 4fe5d00..4792673 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -35,7 +35,7 @@ struct AMDGPUImageDMaskIntrinsic { }; #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL -#include "InstCombineTables.inc" +#include "AMDGPUGenSearchableTables.inc" } // end anonymous namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 5085e86..2b1f404 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -29,11 +29,19 @@ Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) { // TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) { const Value *Ptr = MMO->getValue(); + if (!Ptr) { + if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) { + return PSV->isConstantPool() || PSV->isStack() || PSV->isGOT() || + PSV->isJumpTable(); + } + + // Unknown value. + return false; + } + // UndefValue means this is a load of a kernel input. These are uniform. // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. - if (!Ptr || isa<UndefValue, Constant, GlobalValue>(Ptr)) + if (isa<UndefValue, Constant, GlobalValue>(Ptr)) return true; if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b8fa6f3..8a43c2d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -62,6 +62,7 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2, // AMDGPU DAG Nodes // +// Masked control flow nodes. def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; @@ -114,6 +115,7 @@ def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPOptInGlue] >; +// Pointer to the start of the shader's constant data. def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> @@ -122,18 +124,21 @@ def AMDGPUconstdata_ptr : SDNode< // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; +// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. +// Denormals handled on some parts. def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + // out = a - floor(a) def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; // out = 1.0 / a def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; -// v_log_f32, which is log2 +// v_log_f32, which is log2, no denormal handling for f32. def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>; -// v_exp_f32, which is exp2 +// v_exp_f32, which is exp2, no denormal handling for f32. def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) @@ -146,11 +151,16 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. 
def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; +// Convert two float 32 numbers into a single register holding two packed f16 +// with round to zero. def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; + +// Same as the standard node, except the high bits of the resulting integer +// are known 0. def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; @@ -225,14 +235,18 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; +// This is SETCC with the full mask result which is used for a compare with a +// result bit per item in the wavefront. def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; +// FP ops with input and output chain. def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +// These cvt_f32_ubyte* nodes need to remain consecutive and in order. def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", @@ -264,6 +278,8 @@ def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp, // Denominator, src2 = Numerator). def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; +// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is +// treated as an illegal operation. def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>; def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", @@ -290,14 +306,23 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP", [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// Extract range of bits with zero extension to 32-bits. def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; + +// Extract range of bits with sign extension to 32-bits. def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; + +// (src0 & src1) | (~src0 & src2) def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; + +// Insert a range of bits into a 32-bit word. def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +// ctlz with -1 if input is zero. def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>; def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>; +// cttz with -1 if input is zero. def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>; // Signed and unsigned 24-bit multiply. The highest 8-bits are ignore @@ -394,16 +419,24 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai //===----------------------------------------------------------------------===// // Call/Return DAG Nodes //===----------------------------------------------------------------------===// + +// A uniform kernel return that terminates the wavefront. def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; + +// s_endpgm, but we may want to insert it in the middle of the block. 
def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone, [SDNPHasChain]>; + +// "s_trap 2" equivalent on hardware that does not support it. def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone, [SDNPHasChain]>; +// Return to a shader part's epilog code. def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// Return with values from a non-entry function. def AMDGPUret_glue : SDNode<"AMDGPUISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 9ce1224..1549214 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -221,12 +227,21 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); + Register VCCReg = I.getOperand(1).getReg(); + MachineInstr *Cmp; + + // Set SCC as a side effect with S_CMP or S_OR. + if (STI.hasScalarCompareEq64()) { + unsigned CmpOpc = + STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; + Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); + } else { + Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) + .addReg(VCCReg) + .addReg(VCCReg); + } - unsigned CmpOpc = - STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; - MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) - .addReg(I.getOperand(1).getReg()) - .addImm(0); if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI)) return false; @@ -593,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); + I.getOperand(0).setIsEarlyClobber(true); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -1930,20 +1946,52 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, // The resource id offset is computed as (<isa opaque base> + M0[21:16] + // offset field) % 64. Some versions of the programming guide omit the m0 // part, or claim it's from offset 0. 
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); + + unsigned Opc = gwsIntrinToOpcode(IID); + const MCInstrDesc &InstrDesc = TII.get(Opc); if (HasVSrc) { Register VSrc = MI.getOperand(1).getReg(); - MIB.addReg(VSrc); - if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) - return false; - } + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx); + const TargetRegisterClass *SubRC = + TRI.getSubRegisterClass(DataRC, AMDGPU::sub0); - MIB.addImm(ImmOffset) - .cloneMemRefs(MI); + if (!SubRC) { + // 32-bit normal case. + if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI)) + return false; - TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); + BuildMI(*MBB, &MI, DL, InstrDesc) + .addReg(VSrc) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } else { + // Requires even register alignment, so create 64-bit value and pad the + // top half with undef. + Register DataReg = MRI->createVirtualRegister(DataRC); + if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI)) + return false; + + Register UndefReg = MRI->createVirtualRegister(SubRC); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg) + .addReg(VSrc) + .addImm(AMDGPU::sub0) + .addReg(UndefReg) + .addImm(AMDGPU::sub1); + + BuildMI(*MBB, &MI, DL, InstrDesc) + .addReg(DataReg) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } + } else { + BuildMI(*MBB, &MI, DL, InstrDesc) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } MI.eraseFromParent(); return true; @@ -1982,7 +2030,7 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, } bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const { - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>(); MFInfo->setInitWholeWave(); @@ -3674,7 +3722,7 @@ bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic( MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 
1 : 3; MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm())); MI.removeOperand(OpcodeOpIdx); - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + MI.addImplicitDefUseOperands(*MI.getMF()); return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); } @@ -3777,7 +3825,11 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(4); // VDst_In MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + MI.addImplicitDefUseOperands(*MI.getMF()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } @@ -4149,6 +4201,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_UMAX: case TargetOpcode::G_ATOMICRMW_UINC_WRAP: case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: + case TargetOpcode::G_ATOMICRMW_USUB_COND: + case TargetOpcode::G_ATOMICRMW_USUB_SAT: case TargetOpcode::G_ATOMICRMW_FADD: case TargetOpcode::G_ATOMICRMW_FMIN: case TargetOpcode::G_ATOMICRMW_FMAX: @@ -6744,7 +6798,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6797,8 +6851,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index bd443b5..2a99dac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op { } } -defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; defm int_amdgcn_global_atomic_fmax_num : noret_op; -defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op; multiclass noret_binary_atomic_op<SDNode atomic_op> { let HasNoUse = true in @@ -695,6 +691,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>; defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>; defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>; defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>; +defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>; +defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>; defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>; def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, @@ -806,12 +804,6 @@ 
class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat < (vt rc:$addr) >; -// rotr pattern -class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat < - (rotr i32:$src0, i32:$src1), - (BIT_ALIGN $src0, $src0, $src1) ->; - // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1a13b22..cb1a4ee 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -1721,6 +1722,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } + auto &Atomics32 = + getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT}) + .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}}); + if (ST.hasFlatAddressSpace()) { + Atomics32.legalFor({{S32, FlatPtr}}); + } + // TODO: v2bf16 operations, and fat buffer pointer support. auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); if (ST.hasLDSFPAtomicAddF32()) { @@ -2321,14 +2329,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return B.buildUnmerge(S32, Dst).getReg(1); } - // TODO: can we be smarter about machine pointer info? - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); Register LoadAddr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); // For code object version 5, private_base and shared_base are passed through // implicit kernargs. if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= AMDGPU::AMDHSA_COV5) { + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF()); + AMDGPUTargetLowering::ImplicitParameter Param = AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE : AMDGPUTargetLowering::PRIVATE_BASE; @@ -2343,7 +2351,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return Register(); MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, + PtrInfo.getWithOffset(Offset), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), Offset)); @@ -2361,6 +2369,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return Register(); + // TODO: Use custom PseudoSourceValue + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 
0x40 : 0x44; @@ -2560,8 +2571,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); uint32_t AddrHiVal = Info->get32BitAddressHighBits(); auto PtrLo = B.buildPtrToInt(S32, Src); - auto HighAddr = B.buildConstant(S32, AddrHiVal); - B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); + if (AddrHiVal == 0) { + auto Zext = B.buildZExt(LLT::scalar(64), PtrLo); + B.buildIntToPtr(Dst, Zext); + } else { + auto HighAddr = B.buildConstant(S32, AddrHiVal); + B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); + } + MI.eraseFromParent(); return true; } @@ -3551,12 +3568,14 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); - - R = B.buildFMul(Ty, Y, C, Flags).getReg(0); - auto NegR = B.buildFNeg(Ty, R, Flags); - auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); - auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); - R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. + auto NewFlags = Flags & ~(MachineInstr::FmContract); + R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0); + auto NegR = B.buildFNeg(Ty, R, NewFlags); + auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags); + auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags); + R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0); } else { // ch+ct is ln(2)/ln(10) to more than 36 bits const float ch_log10 = 0x1.344000p-2f; @@ -3572,12 +3591,15 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto MaskConst = B.buildConstant(Ty, 0xfffff000); auto YH = B.buildAnd(Ty, Y, MaskConst); auto YT = B.buildFSub(Ty, Y, YH, Flags); - auto YTCT = B.buildFMul(Ty, YT, CT, Flags); + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. + auto NewFlags = Flags & ~(MachineInstr::FmContract); + auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags); Register Mad0 = - getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); - Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); - R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); + getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags); + Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags); + R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags); } const bool IsFiniteOnly = @@ -3706,24 +3728,39 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, return true; } +static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, + const SrcOp &Src, unsigned Flags) { + LLT Ty = Dst.getLLTTy(*B.getMRI()); + + if (Ty == LLT::scalar(32)) { + return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst}) + .addUse(Src.getReg()) + .setMIFlags(Flags); + } + return B.buildFExp2(Dst, Src, Flags); +} + +bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B, + Register Dst, Register X, + unsigned Flags, + bool IsExp10) const { + LLT Ty = B.getMRI()->getType(X); + + // exp(x) -> exp2(M_LOG2E_F * x); + // exp10(x) -> exp2(log2(10) * x); + auto Const = B.buildFConstant(Ty, IsExp10 ? 
0x1.a934f0p+1f : numbers::log2e); + auto Mul = B.buildFMul(Ty, X, Const, Flags); + buildExp(B, Dst, Mul, Flags); + return true; +} + bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register X, unsigned Flags) const { LLT Ty = B.getMRI()->getType(Dst); LLT F32 = LLT::scalar(32); if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { - auto Log2E = B.buildFConstant(Ty, numbers::log2e); - auto Mul = B.buildFMul(Ty, X, Log2E, Flags); - - if (Ty == F32) { - B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) - .addUse(Mul.getReg(0)) - .setMIFlags(Flags); - } else { - B.buildFExp2(Dst, Mul.getReg(0), Flags); - } - - return true; + return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false); } auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); @@ -3746,6 +3783,55 @@ bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, return true; } +bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B, + Register Dst, Register X, + unsigned Flags) const { + LLT Ty = B.getMRI()->getType(Dst); + LLT F32 = LLT::scalar(32); + + if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { + // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f); + auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f); + auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f); + + auto Mul1 = B.buildFMul(Ty, X, K1, Flags); + auto Exp2_1 = buildExp(B, Ty, Mul1, Flags); + auto Mul0 = B.buildFMul(Ty, X, K0, Flags); + auto Exp2_0 = buildExp(B, Ty, Mul0, Flags); + B.buildFMul(Dst, Exp2_0, Exp2_1, Flags); + return true; + } + + // bool s = x < -0x1.2f7030p+5f; + // x += s ? 0x1.0p+5f : 0.0f; + // exp10 = exp2(x * 0x1.a92000p+1f) * + // exp2(x * 0x1.4f0978p-11f) * + // (s ? 0x1.9f623ep-107f : 1.0f); + + auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f); + auto NeedsScaling = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold); + + auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f); + auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); + auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X); + + auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f); + auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f); + + auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags); + auto Exp2_1 = buildExp(B, Ty, Mul1, Flags); + auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags); + auto Exp2_0 = buildExp(B, Ty, Mul0, Flags); + + auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags); + auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f); + auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags); + + B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps); + return true; +} + bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const { Register Dst = MI.getOperand(0).getReg(); @@ -3762,18 +3848,22 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, // v_exp_f16 (fmul x, log2e) if (allowApproxFunc(MF, Flags)) { // TODO: Does this really require fast? - legalizeFExpUnsafe(B, Dst, X, Flags); + IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags) + : legalizeFExpUnsafe(B, Dst, X, Flags); MI.eraseFromParent(); return true; } + // Nothing in half is a denormal when promoted to f32. + // // exp(f16 x) -> // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) - - // Nothing in half is a denormal when promoted to f32. 
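// Illustrative sketch, not from the patch: a scalar reference for the f32
// exp10 expansion built in legalizeFExp10Unsafe above. Assumption: std::exp2f
// stands in for the hardware v_exp_f32 (amdgcn_exp2) the lowering emits.
// log2(10) is split into a high part K0 and a low part K1 so x*log2(10) keeps
// more precision than a single f32 multiply; inputs below the threshold are
// shifted up by 32 so the intermediate exp2 stays out of the denormal range,
// and the final 0x1.9f623ep-107f factor (about 1e-32) undoes that shift.
#include <cmath>
float exp10Reference(float X) {
  const float K0 = 0x1.a92000p+1f;  // high bits of log2(10)
  const float K1 = 0x1.4f0978p-11f; // low bits of log2(10)
  bool NeedsScaling = X < -0x1.2f7030p+5f;
  float AdjustedX = NeedsScaling ? X + 0x1.0p+5f : X;
  float R = std::exp2f(AdjustedX * K0) * std::exp2f(AdjustedX * K1);
  return NeedsScaling ? R * 0x1.9f623ep-107f : R;
}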
+ // + // exp10(f16 x) -> + // fptrunc (v_exp_f32 (fmul (fpext x), log2(10))) auto Ext = B.buildFPExt(F32, X, Flags); Register Lowered = MRI.createGenericVirtualRegister(F32); - legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); + legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10); B.buildFPTrunc(Dst, Lowered, Flags); MI.eraseFromParent(); return true; @@ -3784,7 +3874,8 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying // library behavior. Also, is known-not-daz source sufficient? if (allowApproxFunc(MF, Flags)) { - legalizeFExpUnsafe(B, Dst, X, Flags); + IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags) + : legalizeFExpUnsafe(B, Dst, X, Flags); MI.eraseFromParent(); return true; } @@ -4709,6 +4800,14 @@ bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( return true; } +MachinePointerInfo +AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const { + // This isn't really a constant pool but close enough. + MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool()); + PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS; + return PtrInfo; +} + Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const { LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); @@ -4736,8 +4835,8 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, "unexpected kernarg parameter type"); Register Ptr = getKernargParameterPtr(B, Offset); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); - B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF()); + B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); MI.eraseFromParent(); @@ -6538,8 +6637,15 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32; case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32; default: llvm_unreachable("unhandled atomic opcode"); @@ -7260,9 +7366,9 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( return false; // TODO: can we be smarter about machine pointer info? 
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF); MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, + PtrInfo.getWithOffset(Offset), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, LLT::scalar(64), commonAlignment(Align(64), Offset)); @@ -7724,7 +7830,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_make_buffer_rsrc: return legalizePointerAsRsrcIntrin(MI, MRI, B); case Intrinsic::amdgcn_kernarg_segment_ptr: - if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { + if (!AMDGPU::isKernel(B.getMF().getFunction())) { // This only makes sense to call in a kernel, so just lower to null. B.buildConstant(MI.getOperand(0).getReg(), 0); MI.eraseFromParent(); @@ -7947,6 +8053,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index cd44a9b..1224ee7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -91,8 +91,12 @@ public: bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const; bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, + unsigned Flags, bool IsExp10) const; bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const; + bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, + unsigned Flags) const; bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -132,6 +136,7 @@ public: MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const; Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const; bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index aa75534..821d7f38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -845,7 +845,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { return false; } } - LLVMContext &context = CI->getParent()->getParent()->getContext(); + LLVMContext &context = CI->getContext(); Constant *nval; if 
(getArgType(FInfo) == AMDGPULibFunc::F32) { SmallVector<float, 0> FVal; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 0a59132..97e7a23 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() { } else if (isa<SelectInst>(I)) { if (MaybeRsrc) { if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) { - ConditionalTemps.push_back(RsrcInst); - RsrcInst->replaceAllUsesWith(*MaybeRsrc); + // Guard against conditionals that were already folded away. + if (RsrcInst != *MaybeRsrc) { + ConditionalTemps.push_back(RsrcInst); + RsrcInst->replaceAllUsesWith(*MaybeRsrc); + } } for (Value *V : Seen) FoundRsrcs[V] = *MaybeRsrc; @@ -1745,6 +1748,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, case AtomicRMWInst::FMin: IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin; break; + case AtomicRMWInst::USubCond: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32; + break; + case AtomicRMWInst::USubSat: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32; + break; case AtomicRMWInst::FSub: { reportFatalUsageError( "atomic floating point subtraction not supported for " @@ -1770,14 +1779,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, break; case AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: - reportFatalUsageError("wrapping increment/decrement not supported for " - "buffer resources and should've ben expanded away"); + reportFatalUsageError( + "wrapping increment/decrement not supported for " + "buffer resources and should've been expanded away"); break; case AtomicRMWInst::BAD_BINOP: llvm_unreachable("Not sure how we got a bad binop"); - case AtomicRMWInst::USubCond: - case AtomicRMWInst::USubSat: - break; } } @@ -2059,17 +2066,7 @@ PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) { "Pointer comparison is only equal or unequal"); auto [LhsRsrc, LhsOff] = getPtrParts(Lhs); auto [RhsRsrc, RhsOff] = getPtrParts(Rhs); - Value *RsrcCmp = - IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc"); - copyMetadata(RsrcCmp, &Cmp); - Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off"); - copyMetadata(OffCmp, &Cmp); - - Value *Res = nullptr; - if (Pred == ICmpInst::ICMP_EQ) - Res = IRB.CreateAnd(RsrcCmp, OffCmp); - else if (Pred == ICmpInst::ICMP_NE) - Res = IRB.CreateOr(RsrcCmp, OffCmp); + Value *Res = IRB.CreateICmp(Pred, LhsOff, RhsOff); copyMetadata(Res, &Cmp); Res->takeName(&Cmp); SplitUsers.insert(&Cmp); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp new file mode 100644 index 0000000..38b01dc --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -0,0 +1,240 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower LDS global variables with target extension type "amdgpu.named.barrier" +// that require specialized address assignment. 
It assigns a unique +// barrier identifier to each named-barrier LDS variable and encodes +// this identifier within the !absolute_symbol metadata of that global. +// This encoding ensures that subsequent LDS lowering passes can process these +// barriers correctly without conflicts. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMemoryUtils.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +#include <algorithm> + +#define DEBUG_TYPE "amdgpu-lower-exec-sync" + +using namespace llvm; +using namespace AMDGPU; + +namespace { + +// If GV is also used directly by other kernels, create a new GV +// used only by this kernel and its function. +static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, + Function *KF) { + bool NeedsReplacement = false; + for (Use &U : GV->uses()) { + if (auto *I = dyn_cast<Instruction>(U.getUser())) { + Function *F = I->getFunction(); + if (isKernel(*F) && F != KF) { + NeedsReplacement = true; + break; + } + } + } + if (!NeedsReplacement) + return GV; + // Create a new GV used only by this kernel and its function + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + for (Use &U : make_early_inc_range(GV->uses())) { + if (auto *I = dyn_cast<Instruction>(U.getUser())) { + Function *F = I->getFunction(); + if (!isKernel(*F) || F == KF) { + U.getUser()->replaceUsesOfWith(GV, NewGV); + } + } + } + return NewGV; +} + +// Write the specified address into metadata where it can be retrieved by +// the assembler. Format is a half open range, [Address Address+1) +static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, + uint32_t Address) { + LLVMContext &Ctx = M->getContext(); + auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(Ctx, {MinC, MaxC})); +} + +template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) { + sort(V, [](const auto *L, const auto *R) { + return L->getName() < R->getName(); + }); + return {std::move(V)}; +} + +// Main utility function for special LDS variables lowering. +static bool lowerExecSyncGlobalVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + // The 1st round: give module-absolute assignments + int NumAbsolutes = 0; + SmallVector<GlobalVariable *> OrderedGVs; + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + GlobalVariable *GV = K.first; + if (!isNamedBarrier(*GV)) + continue; + // give a module-absolute assignment if it is indirectly accessed by + // multiple kernels. This is not precise, but we don't want to duplicate + // a function when it is called by multiple kernels. 
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { + OrderedGVs.push_back(GV); + } else { + // leave it to the 2nd round, which will give a kernel-relative + // assignment if it is only indirectly accessed by one kernel + LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); + } + LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + NumAbsolutes += BarCnt; + + // 4 bits for alignment, 5 bits for the barrier num, + // 3 bits for the barrier scope + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, GV, Offset); + } + OrderedGVs.clear(); + + // The 2nd round: give a kernel-relative assignment for GV that + // either only indirectly accessed by single kernel or only directly + // accessed by multiple kernels. + SmallVector<Function *> OrderedKernels; + for (auto &K : LDSUsesInfo.direct_access) { + Function *F = K.first; + assert(isKernel(*F)); + OrderedKernels.push_back(F); + } + OrderedKernels = sortByName(std::move(OrderedKernels)); + + DenseMap<Function *, uint32_t> Kernel2BarId; + for (Function *F : OrderedKernels) { + for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { + if (!isNamedBarrier(*GV)) + continue; + + LDSUsesInfo.direct_access[F].erase(GV); + if (GV->isAbsoluteSymbolRef()) { + // already assigned + continue; + } + OrderedGVs.push_back(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + // GV could also be used directly by other kernels. If so, we need to + // create a new GV used only by this kernel and its function. + auto NewGV = uniquifyGVPerKernel(M, GV, F); + Changed |= (NewGV != GV); + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = Kernel2BarId[F]; + BarId += NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + Kernel2BarId[F] += BarCnt; + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, NewGV, Offset); + } + OrderedGVs.clear(); + } + // Also erase those special LDS variables from indirect_access. 
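// Illustrative sketch, not from the patch: how the !absolute_symbol value for
// a named-barrier LDS variable is packed, per the comment above (low 4 bits
// for alignment, 5 bits for the barrier id, 3 bits for the barrier scope).
// Assumption: Scope carries the numeric value of
// AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP, which is defined elsewhere in the
// backend; the 0x802000 base is copied verbatim from the pass.
#include <cstdint>
constexpr uint32_t encodeNamedBarrierOffset(uint32_t BarId, uint32_t Scope) {
  return 0x802000u | (Scope << 9) | (BarId << 4);
}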
+ for (auto &K : LDSUsesInfo.indirect_access) { + assert(isKernel(*K.first)); + for (GlobalVariable *GV : K.second) { + if (isNamedBarrier(*GV)) + K.second.erase(GV); + } + } + return Changed; +} + +static bool runLowerExecSyncGlobals(Module &M) { + CallGraph CG = CallGraph(M); + bool Changed = false; + Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); + + // For each kernel, what variables does it access directly or through + // callees + LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); + + // For each variable accessed through callees, which kernels access it + VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; + for (auto &K : LDSUsesInfo.indirect_access) { + Function *F = K.first; + assert(isKernel(*F)); + for (GlobalVariable *GV : K.second) { + LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); + } + } + + if (LDSUsesInfo.HasSpecialGVs) { + // Special LDS variables need special address assignment + Changed |= lowerExecSyncGlobalVariables( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); + } + return Changed; +} + +class AMDGPULowerExecSyncLegacy : public ModulePass { +public: + static char ID; + AMDGPULowerExecSyncLegacy() : ModulePass(ID) {} + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPULowerExecSyncLegacy::ID = 0; +char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; + +INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, + "AMDGPU lowering of execution synchronization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, + "AMDGPU lowering of execution synchronization", false, + false) + +bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { + return runLowerExecSyncGlobals(M); +} + +ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() { + return new AMDGPULowerExecSyncLegacy(); +} + +PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M, + ModuleAnalysisManager &AM) { + return runLowerExecSyncGlobals(M) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index dec781d..755b44c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -64,7 +64,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { return false; const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); const DataLayout &DL = F.getDataLayout(); BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index fed7a13..248d7dc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -98,7 +98,7 @@ static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, } static bool processUse(CallInst *CI, bool IsV5OrAbove) { - Function *F = CI->getParent()->getParent(); + Function *F = CI->getFunction(); auto *MD = F->getMetadata("reqd_work_group_size"); const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a4ef524..be30128 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -441,7 +441,7 @@ public: return KernelSet; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) { if (VariableSet.contains(GV)) { @@ -555,7 +555,7 @@ public: for (Function &Func : M->functions()) { if (Func.isDeclaration()) continue; - if (!isKernelLDS(&Func)) + if (!isKernel(Func)) continue; if (KernelsThatAllocateTableLDS.contains(&Func) || @@ -703,7 +703,7 @@ public: return false; } Function *F = I->getFunction(); - return !isKernelLDS(F); + return !isKernel(*F); }); // Replace uses of module scope variable from kernel functions that @@ -711,7 +711,7 @@ public: // Record on each kernel whether the module scope global is used by it for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; if (KernelsThatAllocateModuleLDS.contains(&Func)) { @@ -743,7 +743,7 @@ public: DenseMap<Function *, LDSVariableReplacement> KernelToReplacement; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; DenseSet<GlobalVariable *> KernelUsedVariables; @@ -828,7 +828,7 @@ public: // semantics. Setting the alignment here allows this IR pass to accurately // predict the exact constant at which it will be allocated. 
- assert(isKernelLDS(func)); + assert(isKernel(*func)); LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -878,7 +878,7 @@ public: for (auto &func : OrderedKernels) { if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) { - assert(isKernelLDS(func)); + assert(isKernel(*func)); if (!func->hasName()) { reportFatalUsageError("anonymous kernels cannot use LDS variables"); } @@ -912,7 +912,7 @@ public: auto *I = dyn_cast<Instruction>(U.getUser()); if (!I) continue; - if (isKernelLDS(I->getFunction())) + if (isKernel(*I->getFunction())) continue; replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr); @@ -922,126 +922,6 @@ public: return KernelToCreatedDynamicLDS; } - static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, - Function *KF) { - bool NeedsReplacement = false; - for (Use &U : GV->uses()) { - if (auto *I = dyn_cast<Instruction>(U.getUser())) { - Function *F = I->getFunction(); - if (isKernelLDS(F) && F != KF) { - NeedsReplacement = true; - break; - } - } - } - if (!NeedsReplacement) - return GV; - // Create a new GV used only by this kernel and its function - GlobalVariable *NewGV = new GlobalVariable( - M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), - GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, - GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - NewGV->copyAttributesFrom(GV); - for (Use &U : make_early_inc_range(GV->uses())) { - if (auto *I = dyn_cast<Instruction>(U.getUser())) { - Function *F = I->getFunction(); - if (!isKernelLDS(F) || F == KF) { - U.getUser()->replaceUsesOfWith(GV, NewGV); - } - } - } - return NewGV; - } - - bool lowerSpecialLDSVariables( - Module &M, LDSUsesInfoTy &LDSUsesInfo, - VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { - bool Changed = false; - const DataLayout &DL = M.getDataLayout(); - // The 1st round: give module-absolute assignments - int NumAbsolutes = 0; - std::vector<GlobalVariable *> OrderedGVs; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - GlobalVariable *GV = K.first; - if (!isNamedBarrier(*GV)) - continue; - // give a module-absolute assignment if it is indirectly accessed by - // multiple kernels. This is not precise, but we don't want to duplicate - // a function when it is called by multiple kernels. - if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { - OrderedGVs.push_back(GV); - } else { - // leave it to the 2nd round, which will give a kernel-relative - // assignment if it is only indirectly accessed by one kernel - LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); - } - LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - NumAbsolutes += BarCnt; - - // 4 bits for alignment, 5 bits for the barrier num, - // 3 bits for the barrier scope - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, GV, Offset); - } - OrderedGVs.clear(); - - // The 2nd round: give a kernel-relative assignment for GV that - // either only indirectly accessed by single kernel or only directly - // accessed by multiple kernels. 
- std::vector<Function *> OrderedKernels; - for (auto &K : LDSUsesInfo.direct_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - OrderedKernels.push_back(F); - } - OrderedKernels = sortByName(std::move(OrderedKernels)); - - llvm::DenseMap<Function *, uint32_t> Kernel2BarId; - for (Function *F : OrderedKernels) { - for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { - if (!isNamedBarrier(*GV)) - continue; - - LDSUsesInfo.direct_access[F].erase(GV); - if (GV->isAbsoluteSymbolRef()) { - // already assigned - continue; - } - OrderedGVs.push_back(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - // GV could also be used directly by other kernels. If so, we need to - // create a new GV used only by this kernel and its function. - auto NewGV = uniquifyGVPerKernel(M, GV, F); - Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = Kernel2BarId[F]; - BarId += NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - Kernel2BarId[F] += BarCnt; - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, NewGV, Offset); - } - OrderedGVs.clear(); - } - // Also erase those special LDS variables from indirect_access. - for (auto &K : LDSUsesInfo.indirect_access) { - assert(isKernelLDS(K.first)); - for (GlobalVariable *GV : K.second) { - if (isNamedBarrier(*GV)) - K.second.erase(GV); - } - } - return Changed; - } - bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1058,18 +938,12 @@ public: VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; for (auto &K : LDSUsesInfo.indirect_access) { Function *F = K.first; - assert(isKernelLDS(F)); + assert(isKernel(*F)); for (GlobalVariable *GV : K.second) { LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); } } - if (LDSUsesInfo.HasSpecialGVs) { - // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( - M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); - } - // Partition variables accessed indirectly into the different strategies DenseSet<GlobalVariable *> ModuleScopeVariables; DenseSet<GlobalVariable *> TableLookupVariables; @@ -1157,7 +1031,7 @@ public: const DataLayout &DL = M.getDataLayout(); for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; // All three of these are optional. The first variable is allocated at diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 1e6589e..d7d0292 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding { static constexpr unsigned BitsPerField = 2; static constexpr unsigned NumFields = 4; static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; + static constexpr unsigned ModeWidth = NumFields * BitsPerField; + static constexpr unsigned ModeMask = (1 << ModeWidth) - 1; using ModeType = PackedVector<unsigned, BitsPerField, std::bitset<BitsPerField * NumFields>>; @@ -82,12 +84,12 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; + // Current basic block. + MachineBasicBlock *MBB; + /// Most recent s_set_* instruction. MachineInstr *MostRecentModeSet; - /// Whether the current mode is known. 
- bool CurrentModeKnown; - /// Current mode bits. ModeTy CurrentMode; @@ -108,10 +110,13 @@ private: MachineInstr *Clause; /// Insert mode change before \p I. \returns true if mode was changed. - bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + bool setMode(ModeTy NewMode, ModeTy Mask, + MachineBasicBlock::instr_iterator I); /// Reset mode to default. - void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + void resetMode(MachineBasicBlock::instr_iterator I) { + setMode(ModeTy(), ModeTy::fullMask(), I); + } /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. std::optional<unsigned> getMSBs(const MachineOperand &MO) const; @@ -130,38 +135,43 @@ private: /// Check if an instruction \p I is within a clause and returns a suitable /// iterator to insert mode change. It may also modify the S_CLAUSE /// instruction to extend it or drop the clause if it cannot be adjusted. - MachineInstr *handleClause(MachineInstr *I); + MachineBasicBlock::instr_iterator + handleClause(MachineBasicBlock::instr_iterator I); }; bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, - MachineInstr *I) { + MachineBasicBlock::instr_iterator I) { assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); - if (CurrentModeKnown) { - auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); - if ((Delta & Mask.raw_bits()).none()) { - CurrentMask |= Mask; - return false; - } + if ((Delta & Mask.raw_bits()).none()) { + CurrentMask |= Mask; + return false; + } - if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { - CurrentMode |= NewMode; - CurrentMask |= Mask; + if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { + CurrentMode |= NewMode; + CurrentMask |= Mask; - MostRecentModeSet->getOperand(0).setImm(CurrentMode); - return true; - } + MachineOperand &Op = MostRecentModeSet->getOperand(0); + + // Carry old mode bits from the existing instruction. + int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth); + + Op.setImm(CurrentMode | OldModeBits); + return true; } + // Record previous mode into high 8 bits of the immediate. + int64_t OldModeBits = CurrentMode << ModeWidth; + I = handleClause(I); - MostRecentModeSet = - BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) - .addImm(NewMode); + MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode | OldModeBits); CurrentMode = NewMode; CurrentMask = Mask; - CurrentModeKnown = true; return true; } @@ -233,21 +243,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { if (Ops.first) { ModeTy NewMode, Mask; computeMode(NewMode, Mask, MI, Ops.first, Ops.second); - return setMode(NewMode, Mask, &MI); + return setMode(NewMode, Mask, MI.getIterator()); } assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); return false; } -MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) { if (!ClauseRemaining) return I; // A clause cannot start with a special instruction, place it right before // the clause. 
if (ClauseRemaining == ClauseLen) { - I = Clause->getPrevNode(); + I = Clause->getPrevNode()->getIterator(); assert(I->isBundle()); return I; } @@ -284,9 +295,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { ClauseLen = ClauseRemaining = 0; CurrentMode.reset(); CurrentMask.reset(); - CurrentModeKnown = true; for (auto &MBB : MF) { MostRecentModeSet = nullptr; + this->MBB = &MBB; for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { if (MI.isMetaInstruction()) @@ -294,17 +305,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { if (MI.isTerminator() || MI.isCall()) { if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) CurrentMode.reset(); - CurrentModeKnown = true; - } else - resetMode(&MI); + else + resetMode(MI.getIterator()); continue; } if (MI.isInlineAsm()) { if (TII->hasVGPRUses(MI)) - resetMode(&MI); + resetMode(MI.getIterator()); continue; } @@ -323,14 +333,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { --ClauseRemaining; } - // If we're falling through to a block that has at least one other - // predecessor, we no longer know the mode. - MachineBasicBlock *Next = MBB.getNextNode(); - if (Next && Next->pred_size() >= 2 && - llvm::is_contained(Next->predecessors(), &MBB)) { - if (CurrentMode.raw_bits().any()) - CurrentModeKnown = false; - } + // Reset the mode if we are falling through. + resetMode(MBB.instr_end()); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 680e7eb..bf9b429 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -229,7 +229,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(Src); return; } else if (Opcode == AMDGPU::SI_TCRETURN || - Opcode == AMDGPU::SI_TCRETURN_GFX) { + Opcode == AMDGPU::SI_TCRETURN_GFX || + Opcode == AMDGPU::SI_TCRETURN_CHAIN) { // TODO: How to use branch immediate and avoid register+add? 
Opcode = AMDGPU::S_SETPC_B64; } else if (AMDGPU::getT16D16Helper(Opcode)) { @@ -243,7 +244,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { int MCOpcode = TII->pseudoToMCOpcode(Opcode); if (MCOpcode == -1) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " "a target-specific version: " + Twine(MI->getOpcode())); } @@ -332,7 +333,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { StringRef Err; if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("Illegal instruction detected: " + Err); MI->print(errs()); } @@ -412,7 +413,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { *OutStreamer); if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { - unsigned V = MI->getOperand(0).getImm(); + unsigned V = MI->getOperand(0).getImm() & 0xff; OutStreamer->AddComment( " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index e17c211..8145816 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -126,7 +126,7 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, for (User *V : GV.users()) { if (auto *I = dyn_cast<Instruction>(V)) { Function *F = I->getFunction(); - if (isKernelLDS(F)) + if (isKernel(*F)) kernels[F].insert(&GV); else Functions[F].insert(&GV); @@ -135,10 +135,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, } } -bool isKernelLDS(const Function *F) { - return AMDGPU::isKernel(F->getCallingConv()); -} - LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { FunctionVariableMap DirectMapKernel; @@ -148,7 +144,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // Collect functions whose address has escaped DenseSet<Function *> AddressTakenFuncs; for (Function &F : M.functions()) { - if (!isKernelLDS(&F)) + if (!isKernel(F)) if (F.hasAddressTaken(nullptr, /* IgnoreCallbackUses */ false, /* IgnoreAssumeLikeCalls */ false, @@ -180,7 +176,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // access all variables accessed by functions whose address escaped for (Function &F : M.functions()) { if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) { - if (!isKernelLDS(&F)) { + if (!isKernel(F)) { set_union(TransitiveMapFunction[&F], VariablesReachableThroughFunctionPointer); } @@ -190,7 +186,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // Direct implementation of collecting all variables reachable from each // function for (Function &Func : M.functions()) { - if (Func.isDeclaration() || isKernelLDS(&Func)) + if (Func.isDeclaration() || isKernel(Func)) continue; DenseSet<Function *> seen; // catches cycles @@ -227,7 +223,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { FunctionVariableMap IndirectMapKernel; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; for (const CallGraphNode::CallRecord &R : *CG[&Func]) { @@ -273,6 +269,8 @@ 
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // this is a re-run of the pass // so we don't have anything to do. // - No variables are absolute. + // Named-barriers which are absolute symbols are removed + // from the maps. std::optional<bool> HasAbsoluteGVs; bool HasSpecialGVs = false; for (auto &Map : {DirectMapKernel, IndirectMapKernel}) { @@ -284,6 +282,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { if (IsDirectMapDynLDSGV) continue; if (isNamedBarrier(*GV)) { + if (IsAbsolute) { + DirectMapKernel[Fn].erase(GV); + IndirectMapKernel[Fn].erase(GV); + } HasSpecialGVs = true; continue; } @@ -335,7 +337,7 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, Function *PotentialCallee = ExternalCallRecord.second->getFunction(); assert(PotentialCallee); - if (!isKernelLDS(PotentialCallee)) { + if (!isKernel(*PotentialCallee)) { for (StringRef Attr : FnAttrs) PotentialCallee->removeFnAttr(Attr); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h index 058e744..8868b93 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h @@ -53,8 +53,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, FunctionVariableMap &kernels, FunctionVariableMap &functions); -bool isKernelLDS(const Function *F); - LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M); /// Strip FnAttr attribute from any functions where we may have diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index bf6f1a9..f464fbf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -13,6 +13,12 @@ // NOTE: NO INCLUDE GUARD DESIRED! 
+#ifndef MODULE_ANALYSIS +#define MODULE_ANALYSIS(NAME, CREATE_PASS) +#endif +MODULE_ANALYSIS("amdgpu-argument-usage", AMDGPUArgumentUsageAnalysis()) +#undef MODULE_ANALYSIS + #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif @@ -29,6 +35,7 @@ MODULE_PASS("amdgpu-perf-hint", MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this)) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) +MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) #undef MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index cf2ab825..a3be0f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -48,7 +48,7 @@ private: FuncInfoMap FIM; public: - AMDGPUPerfHintAnalysis() {} + AMDGPUPerfHintAnalysis() = default; // OldPM bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 6e54737..4a70c5d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -177,8 +177,7 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16( MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const { Register Src = MatchInfo.Origin; - assert(MI.getParent()->getParent()->getRegInfo().getType(Src) == - LLT::scalar(64)); + assert(MI.getMF()->getRegInfo().getType(Src) == LLT::scalar(64)); const LLT S32 = LLT::scalar(32); auto Unmerge = B.buildUnmerge(S32, Src); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp index ffbbf63..7d6e3ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp @@ -127,7 +127,7 @@ private: // will also be preloaded even if that data is unused. Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { FunctionType *FT = F.getFunctionType(); - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end()); for (unsigned I = 0; I <= LastPreloadIndex; ++I) FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); @@ -196,7 +196,7 @@ public: SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads; for (auto *U : ImplicitArgPtr->users()) { Instruction *CI = dyn_cast<Instruction>(U); - if (!CI || CI->getParent()->getParent() != &F) + if (!CI || CI->getFunction() != &F) continue; for (auto *U : CI->users()) { @@ -213,7 +213,7 @@ public: continue; // FIXME: Expand handle merged loads. 
- LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); Type *LoadTy = Load->getType(); HiddenArg HA = getHiddenArgFromOffset(Offset); if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index f5e14c7..416de90 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -129,7 +129,7 @@ static StringRef getAsConstantStr(Value *V) { static void diagnoseInvalidFormatString(const CallBase *CI) { CI->getContext().diagnose(DiagnosticInfoUnsupported( - *CI->getParent()->getParent(), + *CI->getFunction(), "printf format string must be a trivially resolved constant string " "global variable", CI->getDebugLoc())); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index ddabd25..b79689c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -122,6 +122,7 @@ private: /// Check whether we have enough local memory for promotion. bool hasSufficientLocalMem(const Function &F); + FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const; bool tryPromoteAllocaToVector(AllocaInst &I); bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS); @@ -460,13 +461,15 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, return nullptr; Value *Offset = VarOffset.first; - auto *OffsetType = dyn_cast<IntegerType>(Offset->getType()); - if (!OffsetType) + if (!isa<IntegerType>(Offset->getType())) return nullptr; + Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW)); + if (Offset != VarOffset.first) + NewInsts.push_back(cast<Instruction>(Offset)); + if (!OffsetQuot.isOne()) { - ConstantInt *ConstMul = - ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth())); + ConstantInt *ConstMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW)); Offset = Builder.CreateMul(Offset, ConstMul); if (Instruction *NewInst = dyn_cast<Instruction>(Offset)) NewInsts.push_back(NewInst); @@ -474,8 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, if (ConstOffset.isZero()) return Offset; - ConstantInt *ConstIndex = - ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth())); + ConstantInt *ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW)); Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex); if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd)) NewInsts.push_back(NewInst); @@ -501,27 +503,14 @@ static Value *promoteAllocaUserToVector( Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, unsigned VecStoreSize, unsigned ElementSize, DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, - std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal, - SmallVectorImpl<LoadInst *> &DeferredLoads) { + std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, + function_ref<Value *()> GetCurVal) { // Note: we use InstSimplifyFolder because it can leverage the DataLayout // to do more folding, especially in the case of vector splats. IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), InstSimplifyFolder(DL)); Builder.SetInsertPoint(Inst); - const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { - if (CurVal) - return CurVal; - - // If the current value is not known, insert a dummy load and lower it on - // the second pass. 
- LoadInst *Dummy = - Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()), - "promotealloca.dummyload"); - DeferredLoads.push_back(Dummy); - return Dummy; - }; - const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, Type *PtrTy) -> Value * { assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); @@ -541,12 +530,7 @@ static Value *promoteAllocaUserToVector( switch (Inst->getOpcode()) { case Instruction::Load: { - // Loads can only be lowered if the value is known. - if (!CurVal) { - DeferredLoads.push_back(cast<LoadInst>(Inst)); - return nullptr; - } - + Value *CurVal = GetCurVal(); Value *Index = calculateVectorIndex( cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx); @@ -636,7 +620,7 @@ static Value *promoteAllocaUserToVector( Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); - Value *CurVec = GetOrLoadCurrentVectorValue(); + Value *CurVec = GetCurVal(); for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts); K < NumElts; ++K) { Value *CurIdx = @@ -649,8 +633,7 @@ static Value *promoteAllocaUserToVector( if (Val->getType() != VecEltTy) Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); - return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, - Index); + return Builder.CreateInsertElement(GetCurVal(), Val, Index); } case Instruction::Call: { if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) { @@ -672,7 +655,7 @@ static Value *promoteAllocaUserToVector( } } - return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); + return Builder.CreateShuffleVector(GetCurVal(), Mask); } if (auto *MSI = dyn_cast<MemSetInst>(Inst)) { @@ -791,16 +774,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, return I; } -// FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { - LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); - +FixedVectorType * +AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const { if (DisablePromoteAllocaToVector) { - LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n"); - return false; + LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n"); + return nullptr; } - Type *AllocaTy = Alloca.getAllocatedType(); auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy); if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) { uint64_t NumElems = 1; @@ -832,10 +812,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } } } - if (!VectorTy) { LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); - return false; + return nullptr; } const unsigned MaxElements = @@ -845,9 +824,29 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { VectorTy->getNumElements() < 2) { LLVM_DEBUG(dbgs() << " " << *VectorTy << " has an unsupported number of elements\n"); - return false; + return nullptr; } + Type *VecEltTy = VectorTy->getElementType(); + unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); + if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " + "does not match the type's size\n"); + return nullptr; + } + + return VectorTy; +} + +// FIXME: Should try to pick the most likely to be profitable allocas first. 
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { + LLVM_DEBUG(dbgs() << "Trying to promote to vectors: " << Alloca << '\n'); + + Type *AllocaTy = Alloca.getAllocatedType(); + FixedVectorType *VectorTy = getVectorTypeForAlloca(AllocaTy); + if (!VectorTy) + return false; + std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx; SmallVector<Instruction *> WorkList; SmallVector<Instruction *> UsersToRemove; @@ -869,13 +868,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); Type *VecEltTy = VectorTy->getElementType(); - unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); - if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { - LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " - "does not match the type's size\n"); - return false; - } - unsigned ElementSize = ElementSizeInBits / 8; + unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; assert(ElementSize > 0); for (auto *U : Uses) { Instruction *Inst = cast<Instruction>(U->getUser()); @@ -1027,37 +1020,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Updater.AddAvailableValue(EntryBB, AllocaInitValue); - // First handle the initial worklist. - SmallVector<LoadInst *, 4> DeferredLoads; + // First handle the initial worklist, in basic block order. + // + // Insert a placeholder whenever we need the vector value at the top of a + // basic block. + SmallVector<Instruction *> Placeholders; forEachWorkListItem(WorkList, [&](Instruction *I) { BasicBlock *BB = I->getParent(); - // On the first pass, we only take values that are trivially known, i.e. - // where AddAvailableValue was already called in this block. - Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.FindValueForBlock(BB), DeferredLoads); + auto GetCurVal = [&]() -> Value * { + if (Value *CurVal = Updater.FindValueForBlock(BB)) + return CurVal; + + if (!Placeholders.empty() && Placeholders.back()->getParent() == BB) + return Placeholders.back(); + + // If the current value in the basic block is not yet known, insert a + // placeholder that we will replace later. + IRBuilder<> Builder(I); + auto *Placeholder = cast<Instruction>(Builder.CreateFreeze( + PoisonValue::get(VectorTy), "promotealloca.placeholder")); + Placeholders.push_back(Placeholder); + return Placeholders.back(); + }; + + Value *Result = + promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize, + TransferInfo, GEPVectorIdx, GetCurVal); if (Result) Updater.AddAvailableValue(BB, Result); }); - // Then handle deferred loads. - forEachWorkListItem(DeferredLoads, [&](Instruction *I) { - SmallVector<LoadInst *, 0> NewDLs; - BasicBlock *BB = I->getParent(); - // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always - // get a value, inserting PHIs as needed. - Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs); - if (Result) - Updater.AddAvailableValue(BB, Result); - assert(NewDLs.empty() && "No more deferred loads should be queued!"); - }); + // Now fixup the placeholders. + for (Instruction *Placeholder : Placeholders) { + Placeholder->replaceAllUsesWith( + Updater.GetValueInMiddleOfBlock(Placeholder->getParent())); + Placeholder->eraseFromParent(); + } // Delete all instructions. 
On the first pass, new dummy loads may have been // added so we need to collect them too. DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); - InstsToDelete.insert_range(DeferredLoads); for (Instruction *I : InstsToDelete) { assert(I->use_empty()); I->eraseFromParent(); @@ -1378,7 +1380,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { for (const User *U : Val->users()) { if (const Instruction *Use = dyn_cast<Instruction>(U)) { - if (Use->getParent()->getParent() == &F) + if (Use->getFunction() == &F) return true; } else { const Constant *C = cast<Constant>(U); @@ -1489,7 +1491,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, const DataLayout &DL = Mod->getDataLayout(); IRBuilder<> Builder(&I); - const Function &ContainingFunction = *I.getParent()->getParent(); + const Function &ContainingFunction = *I.getFunction(); CallingConv::ID CC = ContainingFunction.getCallingConv(); // Don't promote the alloca to LDS for shader calling conventions as the work @@ -1544,7 +1546,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); - Function *F = I.getParent()->getParent(); + Function *F = I.getFunction(); Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e187959..839120d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -160,9 +169,25 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) + return RALSrc; + + // RALSrc = G_ANYEXT S16Src + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // Src = G_TRUNC TruncSrc + if (mi_match(Src, MRI, + m_GTrunc(m_GAMDGPUReadAnyLane(m_GAnyExt(m_Reg(RALSrc)))))) { return RALSrc; + } + + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return RALSrc; + } // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc // LoSgpr = G_AMDGPU_READANYLANE LoVgpr @@ -410,7 +435,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { unsigned Opc = MI->getOpcode(); // Insert point for use operands 
needs some calculation. if (Opc == AMDGPU::G_PHI) { - RBLHelper.applyMappingPHI(*MI); + if (!RBLHelper.applyMappingPHI(*MI)) + return false; continue; } @@ -441,7 +467,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { // S1 rules are in RegBankLegalizeRules. } - RBLHelper.findRuleAndApplyMapping(*MI); + if (!RBLHelper.findRuleAndApplyMapping(*MI)) + return false; } // Sgpr S1 clean up combines: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 5407566..cc31d7d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -32,28 +32,48 @@ using namespace AMDGPU; RegBankLegalizeHelper::RegBankLegalizeHelper( MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules) - : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()), - MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()), + : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B), + MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr), + RBLRules(RBLRules), IsWave32(ST.isWave32()), SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} -void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { - const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI); - const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI); +bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { + const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI); + if (!RuleSet) { + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "No AMDGPU RegBankLegalize rules defined for opcode", + MI); + return false; + } + + const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI); + if (!Mapping) { + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: none of the rules defined with " + "'Any' for MI's opcode matched MI", + MI); + return false; + } SmallSet<Register, 4> WaterfallSgprs; unsigned OpIdx = 0; - if (Mapping.DstOpMapping.size() > 0) { + if (Mapping->DstOpMapping.size() > 0) { B.setInsertPt(*MI.getParent(), std::next(MI.getIterator())); - applyMappingDst(MI, OpIdx, Mapping.DstOpMapping); + if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping)) + return false; } - if (Mapping.SrcOpMapping.size() > 0) { + if (Mapping->SrcOpMapping.size() > 0) { B.setInstr(MI); - applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs); + if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WaterfallSgprs)) + return false; } - lower(MI, Mapping, WaterfallSgprs); + if (!lower(MI, *Mapping, WaterfallSgprs)) + return false; + + return true; } bool RegBankLegalizeHelper::executeInWaterfallLoop( @@ -274,7 +294,7 @@ bool RegBankLegalizeHelper::executeInWaterfallLoop( return true; } -void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, +bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy) { MachineFunction &MF = B.getMF(); assert(MI.getNumMemOperands() == 1); @@ -322,9 +342,10 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, B.buildMergeLikeInstr(Dst, MergeTyParts); } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, +bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy) { 
MachineFunction &MF = B.getMF(); assert(MI.getNumMemOperands() == 1); @@ -350,9 +371,10 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, B.buildMergeLikeInstr(Dst, MergeTyParts); } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { +bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { Register Dst = MI.getDstReg(); Register Ptr = MI.getPointerReg(); MachineMemOperand &MMO = MI.getMMO(); @@ -376,9 +398,10 @@ void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); Register Src = MI.getOperand(1).getReg(); @@ -404,15 +427,22 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { Hi = B.buildUndef({VgprRB_S32}); break; default: - llvm_unreachable("Opcode not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI); + return false; } B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)}); } else { - llvm_unreachable("Type not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI); + return false; } MI.eraseFromParent(); + return true; } std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) { @@ -437,7 +467,14 @@ std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) { return {Lo.getReg(0), Hi.getReg(0)}; } -void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { +std::pair<Register, Register> +RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) { + auto [Lo32, Hi32] = unpackAExt(Reg); + return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0), + B.buildTrunc(SgprRB_S16, Hi32).getReg(0)}; +} + +bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { Register Lo, Hi; switch (MI.getOpcode()) { case AMDGPU::G_SHL: { @@ -462,13 +499,18 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { break; } default: - llvm_unreachable("Unpack lowering not implemented"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented", + MI); + return false; } B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { Register Lo, Hi; switch (MI.getOpcode()) { case AMDGPU::G_SMIN: @@ -494,10 +536,25 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { break; } default: - llvm_unreachable("Unpack min/max lowering not implemented"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI); + return false; } B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) { + auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg()); + auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo}); + auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi}); + 
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), + {ResLo.getReg(0), ResHi.getReg(0)}); + MI.eraseFromParent(); + return true; } static bool isSignedBFE(MachineInstr &MI) { @@ -507,7 +564,7 @@ static bool isSignedBFE(MachineInstr &MI) { return MI.getOpcode() == AMDGPU::G_SBFX; } -void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); assert(MRI.getType(Dst) == LLT::scalar(64)); bool Signed = isSignedBFE(MI); @@ -534,7 +591,7 @@ void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt); B.buildInstr(SHROpc, {Dst}, {SignBit, Amt}); MI.eraseFromParent(); - return; + return true; } uint64_t WidthImm = ConstWidth->Value.getZExtValue(); @@ -564,9 +621,10 @@ void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(DstReg); bool Signed = isSignedBFE(MI); @@ -592,14 +650,19 @@ void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}}, {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)}); if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(), - *ST.getRegisterInfo(), RBI)) - llvm_unreachable("failed to constrain BFE"); + *ST.getRegisterInfo(), RBI)) { + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerS_BFE, failed to constrain BFE", MI); + return false; + } B.buildCopy(DstReg, S_BFE->getOperand(0).getReg()); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64); @@ -614,9 +677,35 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags); B.buildMergeLikeInstr(Dst, {Lo, Hi}); MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + unsigned Opc = MI.getOpcode(); + auto Flags = MI.getFlags(); + + if (MI.getNumOperands() == 2) { + auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; + } + + assert(MI.getNumOperands() == 3); + auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); + auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg()); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 || @@ -633,9 +722,10 @@ void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { B.buildMergeLikeInstr(Dst, {Lo, 
Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg()); int Amt = MI.getOperand(2).getImm(); Register Lo, Hi; @@ -660,9 +750,10 @@ void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lower(MachineInstr &MI, +bool RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4> &WaterfallSgprs) { @@ -682,12 +773,14 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True, False); MI.eraseFromParent(); - return; + return true; } case UnpackBitShift: return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -707,20 +800,23 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, break; } default: - llvm_unreachable("Unsuported Opcode in Ext32To64"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode", + MI); + return false; } B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {MI.getOperand(1).getReg(), Hi}); MI.eraseFromParent(); - return; + return true; } case UniCstExt: { uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); B.buildConstant(MI.getOperand(0).getReg(), ConstVal); MI.eraseFromParent(); - return; + return true; } case VgprToVccCopy: { Register Src = MI.getOperand(1).getReg(); @@ -744,7 +840,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, auto Zero = B.buildConstant({VgprRB, Ty}, 0); B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero); MI.eraseFromParent(); - return; + return true; } case V_BFE: return lowerV_BFE(MI); @@ -773,8 +869,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (Size / 128 == 4) splitLoad(MI, {B128, B128, B128, B128}); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("SplitLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: SplitLoad, unsuported type", + MI); + return false; } } // 64 and 32 bit load @@ -785,10 +883,12 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (DstTy == V6S16) splitLoad(MI, {V4S16, V2S16}, V2S16); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("SplitLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: SplitLoad, unsuported type", + MI); + return false; } - break; + return true; } case WidenLoad: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); @@ -799,19 +899,25 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (DstTy == V6S16) widenLoad(MI, V8S16, V2S16); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("WidenLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: WidenLoad, unsuported type", + MI); + return false; } - break; + return true; } + case UnpackAExt: + return lowerUnpackAExt(MI); case WidenMMOToS32: return widenMMOToS32(cast<GAnyLoad>(MI)); } if (!WaterfallSgprs.empty()) { 
MachineBasicBlock::iterator I = MI.getIterator(); - executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs); + if (!executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs)) + return false; } + return true; } LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { @@ -837,10 +943,12 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return LLT::scalar(64); case Sgpr128: case Vgpr128: return LLT::scalar(128); + case SgprP0: case VgprP0: return LLT::pointer(0, 64); case SgprP1: @@ -855,6 +963,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprP8: + return LLT::pointer(8, 128); case SgprV2S16: case VgprV2S16: case UniInVgprV2S16: @@ -940,10 +1050,12 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr32_WF: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -960,6 +1072,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: @@ -1003,7 +1116,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { } } -void RegBankLegalizeHelper::applyMappingDst( +bool RegBankLegalizeHelper::applyMappingDst( MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) { // Defs start from operand 0 @@ -1022,10 +1135,12 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -1092,6 +1207,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); @@ -1120,20 +1236,28 @@ void RegBankLegalizeHelper::applyMappingDst( assert(RB == SgprRB); Register NewDst = MRI.createVirtualRegister(SgprRB_S32); Op.setReg(NewDst); - B.buildTrunc(Reg, NewDst); + if (!MRI.use_empty(Reg)) + B.buildTrunc(Reg, NewDst); break; } case InvalidMapping: { - LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump();); - llvm_unreachable("missing fast rule for MI"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI); + return false; } default: - llvm_unreachable("ID not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI); + return false; } } + + return true; } -void RegBankLegalizeHelper::applyMappingSrc( +bool RegBankLegalizeHelper::applyMappingSrc( MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs, SmallSet<Register, 4> &SgprWaterfallOperandRegs) { @@ -1163,10 +1287,12 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: { @@ -1285,12 +1411,16 @@ void RegBankLegalizeHelper::applyMappingSrc( break; } default: - llvm_unreachable("ID not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + 
"AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI); + return false; } } + return true; } -void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { +bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); @@ -1313,16 +1443,17 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { MI.getOperand(i).setReg(NewUse.getReg(0)); } - return; + return true; } - // ALL divergent i1 phis should be already lowered and inst-selected into PHI - // with sgpr reg class and S1 LLT. + // ALL divergent i1 phis should have been lowered and inst-selected into PHI + // with sgpr reg class and S1 LLT in AMDGPUGlobalISelDivergenceLowering pass. // Note: this includes divergent phis that don't require lowering. if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) { - LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump();); - llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering " - "before RegBankLegalize to lower lane mask(vcc) phis"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI", + MI); + return false; } // We accept all types that can fit in some register class. @@ -1330,11 +1461,13 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr. if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64)) { - return; + return true; } - LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump();); - llvm_unreachable("type not supported"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: type not supported for G_PHI", + MI); + return false; } [[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index d937815..1dc0278 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -12,6 +12,7 @@ #include "AMDGPURegBankLegalizeRules.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -27,11 +28,13 @@ namespace AMDGPU { // to replace instruction. In other case InstApplyMethod will create new // instruction(s). class RegBankLegalizeHelper { + MachineFunction &MF; const GCNSubtarget &ST; MachineIRBuilder &B; MachineRegisterInfo &MRI; const MachineUniformityInfo &MUI; const RegisterBankInfo &RBI; + MachineOptimizationRemarkEmitter MORE; const RegBankLegalizeRules &RBLRules; const bool IsWave32; const RegisterBank *SgprRB; @@ -72,6 +75,7 @@ class RegBankLegalizeHelper { static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -80,10 +84,10 @@ public: const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules); - void findRuleAndApplyMapping(MachineInstr &MI); + bool findRuleAndApplyMapping(MachineInstr &MI); // Manual apply helpers. 
- void applyMappingPHI(MachineInstr &MI); + bool applyMappingPHI(MachineInstr &MI); void applyMappingTrivial(MachineInstr &MI); private: @@ -96,34 +100,37 @@ private: const RegisterBank *getRegBankFromID(RegBankLLTMappingApplyID ID); - void + bool applyMappingDst(MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs); - void + bool applyMappingSrc(MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs, SmallSet<Register, 4> &SgprWaterfallOperandRegs); - void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, + bool splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy = LLT()); - void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT()); - void widenMMOToS32(GAnyLoad &MI) const; + bool widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT()); + bool widenMMOToS32(GAnyLoad &MI) const; - void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, + bool lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4> &SgprWaterfallOperandRegs); - void lowerVccExtToSel(MachineInstr &MI); + bool lowerVccExtToSel(MachineInstr &MI); std::pair<Register, Register> unpackZExt(Register Reg); std::pair<Register, Register> unpackSExt(Register Reg); std::pair<Register, Register> unpackAExt(Register Reg); - void lowerUnpackBitShift(MachineInstr &MI); - void lowerV_BFE(MachineInstr &MI); - void lowerS_BFE(MachineInstr &MI); - void lowerSplitTo32(MachineInstr &MI); - void lowerSplitTo32Select(MachineInstr &MI); - void lowerSplitTo32SExtInReg(MachineInstr &MI); - void lowerUnpackMinMax(MachineInstr &MI); + std::pair<Register, Register> unpackAExtTruncS16(Register Reg); + bool lowerUnpackBitShift(MachineInstr &MI); + bool lowerV_BFE(MachineInstr &MI); + bool lowerS_BFE(MachineInstr &MI); + bool lowerSplitTo32(MachineInstr &MI); + bool lowerSplitTo16(MachineInstr &MI); + bool lowerSplitTo32Select(MachineInstr &MI); + bool lowerSplitTo32SExtInReg(MachineInstr &MI); + bool lowerUnpackMinMax(MachineInstr &MI); + bool lowerUnpackAExt(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a67b12a..9de3092 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -66,6 +66,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case P8: + return MRI.getType(Reg) == LLT::pointer(8, 128); case Ptr32: return isAnyPtr(MRI.getType(Reg), 32); case Ptr64: @@ -108,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniP8: + return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg); case UniPtr32: return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg); case UniPtr64: @@ -202,7 +206,7 @@ bool PredicateMapping::match(const MachineInstr &MI, return true; } -SetOfRulesForOpcode::SetOfRulesForOpcode() {} +SetOfRulesForOpcode::SetOfRulesForOpcode() = default; SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) : FastTypes(FastTypes) {} @@ -239,7 +243,7 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) { return _; } -const RegBankLLTMapping & +const RegBankLLTMapping 
* SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const { @@ -256,17 +260,16 @@ SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg))); if (Slot != -1) - return MUI.isUniform(Reg) ? Uni[Slot] : Div[Slot]; + return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot]; } // Slow search for more complex rules. for (const RegBankLegalizeRule &Rule : Rules) { if (Rule.Predicate.match(MI, MUI, MRI)) - return Rule.OperandMapping; + return &Rule.OperandMapping; } - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("None of the rules defined for MI's opcode matched MI"); + return nullptr; } void SetOfRulesForOpcode::addRule(RegBankLegalizeRule Rule) { @@ -349,7 +352,7 @@ RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList, return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes); } -const SetOfRulesForOpcode & +const SetOfRulesForOpcode * RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT || @@ -357,19 +360,15 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) { unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); auto IRAIt = IRulesAlias.find(IntrID); - if (IRAIt == IRulesAlias.end()) { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("No rules defined for intrinsic opcode"); - } - return IRules.at(IRAIt->second); + if (IRAIt == IRulesAlias.end()) + return nullptr; + return &IRules.at(IRAIt->second); } auto GRAIt = GRulesAlias.find(Opc); - if (GRAIt == GRulesAlias.end()) { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("No rules defined for generic opcode"); - } - return GRules.at(GRAIt->second); + if (GRAIt == GRulesAlias.end()) + return nullptr; + return &GRules.at(GRAIt->second); } // Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. 
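The two hunks above turn the rule-lookup entry points into pointer-returning queries: SetOfRulesForOpcode::findMappingForMI and RegBankLegalizeRules::getRulesForOpc now hand back nullptr when no rule or intrinsic alias is registered, instead of dumping the instruction and hitting llvm_unreachable. A minimal caller-side sketch of the pattern this enables follows; the function name tryApplyRules and its bail-out behaviour are illustrative assumptions, not code from this patch.

// Illustrative sketch only, assuming the headers touched by this diff.
#include "AMDGPURegBankLegalizeRules.h"

using namespace llvm;
using namespace llvm::AMDGPU;

static bool tryApplyRules(const RegBankLegalizeRules &RBLRules, MachineInstr &MI,
                          const MachineRegisterInfo &MRI,
                          const MachineUniformityInfo &MUI) {
  const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
  if (!RuleSet)
    return false; // No rules defined for this opcode: report failure upward.

  const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
  if (!Mapping)
    return false; // No predicate matched MI: fail instead of crashing.

  // Lowering according to *Mapping would go here; the applyMapping*/lower*
  // helpers made bool-returning earlier in this diff presumably propagate
  // failure the same way.
  return true;
}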
@@ -470,7 +469,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}); + + addRulesForGOpcs({G_UADDO, G_USUBO}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_UADDE, G_USUBE}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}}); addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); @@ -514,6 +525,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + addRulesForGOpcs({G_FSHR}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}}); addRulesForGOpcs({G_UBFX, G_SBFX}, Standard) @@ -901,14 +916,56 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); - addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) + .Uni(S64, {{Sgpr64}, {}}); + + addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}}); + + addRulesForGOpcs({G_GLOBAL_VALUE}) + .Any({{UniP0}, {{SgprP0}, {}}}) + .Any({{UniP1}, {{SgprP1}, {}}}) + .Any({{UniP3}, {{SgprP3}, {}}}) + .Any({{UniP4}, {{SgprP4}, {}}}) + .Any({{UniP8}, {{SgprP8}, {}}}); + + addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}}); bool hasSALUFloat = ST->hasSALUFloatInsts(); - addRulesForGOpcs({G_FADD}, Standard) + addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + + // FNEG and FABS are either folded as source modifiers or can be selected as + // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for + // targets without SALU float we still select them as VGPR since there would + // be no real sgpr use. 
+ addRulesForGOpcs({G_FNEG, G_FABS}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}}); addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) @@ -919,6 +976,14 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); + addRulesForGOpcs({G_IS_FPCLASS}) + .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}}) + .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}}) + .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}}) + .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}}) + .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}}) + .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}}); + using namespace Intrinsic; addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 93e0efd..1ac1173 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -63,6 +63,7 @@ enum UniformityLLTOpPredicateID { P3, P4, P5, + P8, Ptr32, Ptr64, Ptr128, @@ -72,6 +73,7 @@ enum UniformityLLTOpPredicateID { UniP3, UniP4, UniP5, + UniP8, UniPtr32, UniPtr64, UniPtr128, @@ -92,8 +94,10 @@ enum UniformityLLTOpPredicateID { V4S32, UniV2S16, + UniV2S32, DivV2S16, + DivV2S32, // B types B32, @@ -134,10 +138,12 @@ enum RegBankLLTMappingApplyID { Sgpr32, Sgpr64, Sgpr128, + SgprP0, SgprP1, SgprP3, SgprP4, SgprP5, + SgprP8, SgprPtr32, SgprPtr64, SgprPtr128, @@ -178,7 +184,9 @@ enum RegBankLLTMappingApplyID { UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -217,13 +225,15 @@ enum LoweringMethodID { V_BFE, VgprToVccCopy, SplitTo32, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, UniCstExt, SplitLoad, WidenLoad, - WidenMMOToS32 + WidenMMOToS32, + UnpackAExt }; enum FastRulesTypes { @@ -277,7 +287,7 @@ public: SetOfRulesForOpcode(); SetOfRulesForOpcode(FastRulesTypes FastTypes); - const RegBankLLTMapping & + const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const; @@ -375,7 +385,7 @@ public: MRI = &_MRI; }; - const SetOfRulesForOpcode &getRulesForOpc(MachineInstr &MI) const; + const SetOfRulesForOpcode *getRulesForOpc(MachineInstr &MI) const; }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 54ba2f8..ce4cc79 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank & AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - if (&RC == &AMDGPU::SReg_1RegClass) - return 
AMDGPU::VCCRegBank; - // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a // VCC-like use. if (TRI->isSGPRClass(&RC)) { @@ -471,7 +468,7 @@ RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappings( const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -2412,7 +2409,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DstBank == &AMDGPU::VCCRegBank) break; - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); LegalizerHelper Helper(*MF, ApplyBank, B); @@ -2492,7 +2489,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // There is no VALU abs instruction so we need to replace it with a sub and // max combination. if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); LegalizerHelper Helper(*MF, Apply, B); @@ -3114,6 +3111,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { @@ -3607,7 +3606,7 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, } bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) @@ -3623,7 +3622,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3641,7 +3640,7 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3665,7 +3664,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3744,7 +3743,7 @@ AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { - const MachineFunction 
&MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 2> OpdsMapping(2); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); @@ -3834,7 +3833,7 @@ AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, // const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { @@ -4502,6 +4501,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { @@ -5081,17 +5082,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned MinNumRegsRequired = DstSize / 32; const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? 
getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); @@ -5217,11 +5218,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_fadd: case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_fsub: case Intrinsic::amdgcn_wave_reduce_min: case Intrinsic::amdgcn_wave_reduce_umin: + case Intrinsic::amdgcn_wave_reduce_fmin: case Intrinsic::amdgcn_wave_reduce_max: case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_fmax: case Intrinsic::amdgcn_wave_reduce_and: case Intrinsic::amdgcn_wave_reduce_or: case Intrinsic::amdgcn_wave_reduce_xor: { @@ -5304,12 +5309,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: @@ -5704,6 +5707,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_FMAX: case AMDGPU::G_ATOMICRMW_UINC_WRAP: case AMDGPU::G_ATOMICRMW_UDEC_WRAP: + case AMDGPU::G_ATOMICRMW_USUB_COND: + case AMDGPU::G_ATOMICRMW_USUB_SAT: case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 1c1a6da..c37d309 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR", >; // It is helpful to distinguish conditions from ordinary SGPRs. -def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; +def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>; def AGPRRegBank : RegisterBank <"AGPR", [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0ea9add..b03d50f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -261,13 +261,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( const Function *Callee = getCalleeFunction(*CalleeOp); - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. 
- if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); - auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 89c16da..ffbb1c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" @@ -96,8 +97,8 @@ public: /// Compute the register class constraints based on the uses of \p Reg, /// excluding MFMA uses from which can be rewritten to change the register - /// class constraint. This should be nearly identical to - /// MachineRegisterInfo::recomputeRegClass. + /// class constraint. MFMA scale operands need to be constraint checked. + /// This should be nearly identical to MachineRegisterInfo::recomputeRegClass. /// \p RewriteCandidates will collect the set of MFMA instructions that need /// to have the opcode mutated to perform the replacement. @@ -151,9 +152,16 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the // effects of rewrite candidates. It just so happens that we can use - // either AGPR or VGPR in src0/src1, so don't bother checking the - // constraint effects of the individual operands. + // either AGPR or VGPR in src0/src1. We still need to check constraint + // effects for scale variant, which does not allow AGPR. 
if (isRewriteCandidate(*MI)) { + int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()); + const MCInstrDesc &AGPRDesc = TII.get(AGPROp); + const TargetRegisterClass *NewRC = + TII.getRegClass(AGPRDesc, MO.getOperandNo()); + if (!TRI.hasAGPRs(NewRC)) + return false; + const MachineOperand *VDst = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); const MachineOperand *Src2 = @@ -659,7 +667,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, if (!Impl.run(MF)) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); - PA.preserveSet<CFGAnalyses>(); - PA.preserve<LiveStacksAnalysis>(); + PA.preserveSet<CFGAnalyses>() + .preserve<LiveStacksAnalysis>() + .preserve<VirtRegMapAnalysis>() + .preserve<SlotIndexesAnalysis>() + .preserve<LiveIntervalsAnalysis>() + .preserve<LiveRegMatrixAnalysis>(); return PA; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 4b1f80c..a2e16c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -299,7 +299,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (Replacements.empty()) return false; - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); StructType *NewRetTy = StructType::create(Ctx, ReturnTypes, F.getName()); FunctionType *NewFuncTy = FunctionType::get(NewRetTy, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 2393346..58a9b55 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; def : SourceOfDivergence<int_r600_read_tidig_x>; def : SourceOfDivergence<int_r600_read_tidig_y>; def : SourceOfDivergence<int_r600_read_tidig_z>; -def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>; -def : SourceOfDivergence<int_amdgcn_global_atomic_csub>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>; @@ -413,3 +411,12 @@ def : AlwaysUniform<int_amdgcn_s_getpc>; def : AlwaysUniform<int_amdgcn_s_getreg>; def : AlwaysUniform<int_amdgcn_s_memrealtime>; def : AlwaysUniform<int_amdgcn_s_memtime>; + +def AMDGPUImageDMaskIntrinsicTable : GenericTable { + let FilterClass = "AMDGPUImageDMaskIntrinsic"; + let Fields = ["Intr"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic"; + let PrimaryKeyEarlyOut = 1; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp index 2941a48..5b8ee5f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp @@ -7,13 +7,53 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSelectionDAGInfo.h" -#include "AMDGPUISelLowering.h" + +#define GET_SDNODE_DESC +#include "AMDGPUGenSDNodeInfo.inc" using namespace llvm; +AMDGPUSelectionDAGInfo::AMDGPUSelectionDAGInfo() + : SelectionDAGGenTargetInfo(AMDGPUGenSDNodeInfo) {} + AMDGPUSelectionDAGInfo::~AMDGPUSelectionDAGInfo() = default; -bool AMDGPUSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= AMDGPUISD::FIRST_MEMORY_OPCODE && - Opcode <= AMDGPUISD::LAST_MEMORY_OPCODE; +const char 
*AMDGPUSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { +#define NODE_NAME_CASE(node) \ + case AMDGPUISD::node: \ + return "AMDGPUISD::" #node; + + switch (static_cast<AMDGPUISD::NodeType>(Opcode)) { + // These nodes don't have corresponding entries in *.td files yet. + NODE_NAME_CASE(WAVE_ADDRESS) + NODE_NAME_CASE(MAD_I64_I32) + NODE_NAME_CASE(MAD_U64_U32) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + // These do, but only when compiling R600.td, + // and the enum is generated from AMDGPU.td. + NODE_NAME_CASE(DOT4) + NODE_NAME_CASE(TEXTURE_FETCH) + NODE_NAME_CASE(R600_EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(DUMMY_CHAIN) + } + +#undef NODE_NAME_CASE + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); +} + +void AMDGPUSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const { + switch (N->getOpcode()) { + case AMDGPUISD::IF: + // result #0 must have type i1, but has type i32/i64 + case AMDGPUISD::ELSE: + case AMDGPUISD::LOOP: + // operand #1 must have type i1, but has type i32/i64 + case AMDGPUISD::LDS: + // result #0 must have type i64 (iPTR), but has type i32 + return; + } + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h index 3280be7..bae614a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h @@ -11,13 +11,49 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "AMDGPUGenSDNodeInfo.inc" + namespace llvm { +namespace AMDGPUISD { + +enum NodeType : unsigned { + // Convert a unswizzled wave uniform stack address to an address compatible + // with a vector offset for use in stack access. + WAVE_ADDRESS = GENERATED_OPCODE_END, + + DOT4, + MAD_U64_U32, + MAD_I64_I32, + TEXTURE_FETCH, + R600_EXPORT, + CONST_ADDRESS, -class AMDGPUSelectionDAGInfo : public SelectionDAGTargetInfo { + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. 
+ /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, + + DUMMY_CHAIN, +}; + +} // namespace AMDGPUISD + +class AMDGPUSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + AMDGPUSelectionDAGInfo(); + ~AMDGPUSelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; + + void verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 9af8129..b707882 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -314,9 +314,7 @@ public: #endif bool empty() const { return Nodes.empty(); } - const iterator_range<nodes_iterator> nodes() const { - return {Nodes.begin(), Nodes.end()}; - } + iterator_range<nodes_iterator> nodes() const { return Nodes; } const Node &getNode(unsigned ID) const { return *Nodes[ID]; } unsigned getNumNodes() const { return Nodes.size(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 26e0b3df..5ca8ee2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -282,7 +282,7 @@ bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const { } bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { - Function *Kernel = I->getParent()->getParent(); + Function *Kernel = I->getFunction(); unsigned MinSize = 0; unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; bool IdQuery = false; @@ -350,7 +350,7 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { - assert(AMDGPU::isKernel(F.getCallingConv())); + assert(AMDGPU::isKernel(F)); // We don't allocate the segment if we know the implicit arguments weren't // used, even if the ABI implies we need them. 
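Several hunks in this section, and in the AMDGPUSwLowerLDS.cpp diff that follows, replace the chained getParent()->getParent() idiom with the direct accessors Instruction::getFunction() and MachineInstr::getMF(). The snippet below is a standalone illustration of why that is a pure cleanup; the two wrapper functions are hypothetical, only the accessors themselves come from the LLVM API.

// Illustrative only: getFunction()/getMF() are shorthand for walking
// Instruction -> BasicBlock -> Function and
// MachineInstr -> MachineBasicBlock -> MachineFunction.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include <cassert>

using namespace llvm;

static const Function *enclosingFunction(const Instruction &I) {
  assert(I.getFunction() == I.getParent()->getParent());
  return I.getFunction();
}

static const MachineFunction *enclosingMF(const MachineInstr &MI) {
  assert(MI.getMF() == MI.getParent()->getParent());
  return MI.getMF();
}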
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4a9437b..9bdaa42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -271,7 +271,7 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { Function *CalledFunc = CallerCGN->getFunction(); if (!CalledFunc || CalledFunc->isDeclaration()) continue; - if (AMDGPU::isKernelLDS(CalledFunc)) + if (AMDGPU::isKernel(*CalledFunc)) continue; for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end(); AI != E; ++AI) { @@ -297,7 +297,7 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (User *V : GV->users()) { if (auto *I = dyn_cast<Instruction>(V)) { Function *F = I->getFunction(); - if (!isKernelLDS(F) && !F->isDeclaration()) + if (!isKernel(*F) && !F->isDeclaration()) FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV); } } @@ -523,7 +523,7 @@ static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV, auto ReplaceUsesLambda = [Func](const Use &U) -> bool { auto *V = U.getUser(); if (auto *Inst = dyn_cast<Instruction>(V)) { - auto *Func1 = Inst->getParent()->getParent(); + auto *Func1 = Inst->getFunction(); if (Func == Func1) return true; } @@ -1169,7 +1169,7 @@ bool AMDGPUSwLowerLDS::run() { if (!F || K.second.empty()) continue; - assert(isKernelLDS(F)); + assert(isKernel(*F)); // Only inserts if key isn't already in the map. FuncLDSAccessInfo.KernelToLDSParametersMap.insert( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 75a94ac..8a831f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" +#include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUBarrierLatency.h" #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" @@ -465,6 +466,11 @@ static cl::opt<bool> EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt<bool> EnableLowerExecSync( + "amdgpu-enable-lower-exec-sync", + cl::desc("Enable lowering of execution synchronization."), cl::init(true), + cl::Hidden); + static cl::opt<bool> EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", cl::desc("Enable lowering of lds to global memory pass " @@ -567,9 +573,10 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); + initializeAMDGPULowerExecSyncLegacyPass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); - initializeAMDGPUArgumentUsageInfoPass(*PR); + initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(*PR); initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); @@ -641,7 +648,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } @@ -662,7 +669,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if 
(ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } @@ -816,7 +823,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) { Params.consume_front("strategy="); auto Result = StringSwitch<std::optional<ScanOptions>>(Params) .Case("dpp", ScanOptions::DPP) - .Cases("iterative", "", ScanOptions::Iterative) + .Cases({"iterative", ""}, ScanOptions::Iterative) .Case("none", ScanOptions::None) .Default(std::nullopt); if (Result) @@ -962,6 +969,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. + if (EnableLowerExecSync) + PM.addPass(AMDGPULowerExecSyncPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1201,7 +1210,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } //===----------------------------------------------------------------------===// @@ -1218,10 +1227,6 @@ class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(TargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) { - // It is necessary to know the register usage of the entire call graph. We - // allow calls without EnableAMDGPUFunctionCalls if they are marked - // noinline, so this is always required. - setRequiresCodeGenSCCOrder(true); substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } @@ -1315,6 +1320,9 @@ void AMDGPUPassConfig::addIRPasses() { isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + if (EnableUniformIntrinsicCombine) + addPass(createAMDGPUUniformIntrinsicCombineLegacyPass()); + // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); @@ -1330,6 +1338,10 @@ void AMDGPUPassConfig::addIRPasses() { // Make enqueued block runtime handles externally visible. addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); + // Lower special LDS accesses. + if (EnableLowerExecSync) + addPass(createAMDGPULowerExecSyncLegacyPass()); + // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) addPass(createAMDGPUSwLowerLDSLegacyPass(&TM)); @@ -1415,9 +1427,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // many cases. addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); - // In accordance with the above FIXME, manually force all the - // function-level passes into a CGSCCPassManager. - addPass(new DummyCGSCCPass()); } // LowerSwitch pass may introduce unreachable blocks that can @@ -2012,6 +2021,42 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->ArgInfo.WorkItemIDZ, 0, 0))) return true; + // Parse FirstKernArgPreloadReg separately, since it's a Register, + // not ArgDescriptor. 
+ if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) { + const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg; + + if (!A.IsRegister) { + // For stack arguments, we don't have RegisterName.SourceRange, + // but we should have some location info from the YAML parser + const MemoryBuffer &Buffer = + *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); + // Create a minimal valid source range + SMLoc Loc = SMLoc::getFromPointer(Buffer.getBufferStart()); + SMRange Range(Loc, Loc); + + Error = SMDiagnostic( + *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error, + "firstKernArgPreloadReg must be a register, not a stack location", "", + {}, {}); + + SourceRange = Range; + return true; + } + + Register Reg; + if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) { + SourceRange = A.RegisterName.SourceRange; + return true; + } + + if (!AMDGPU::SGPR_32RegClass.contains(Reg)) + return diagnoseRegisterClass(A.RegisterName); + + MFI->ArgInfo.FirstKernArgPreloadReg = Reg; + MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs; + } + if (ST.hasIEEEMode()) MFI->Mode.IEEE = YamlMFI.Mode.IEEE; if (ST.hasDX10ClampMode()) @@ -2066,6 +2111,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { if (isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(AMDGPUImageIntrinsicOptimizerPass(TM)); + if (EnableUniformIntrinsicCombine) + addPass(AMDGPUUniformIntrinsicCombinePass()); // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering)); @@ -2075,6 +2122,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUExportKernelRuntimeHandlesPass()); + if (EnableLowerExecSync) + addPass(AMDGPULowerExecSyncPass()); + if (EnableSwLowerLDS) addPass(AMDGPUSwLowerLDSPass(TM)); @@ -2158,6 +2208,9 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { + // Require AMDGPUArgumentUsageAnalysis so that it's available during ISel. + addPass(RequireAnalysisPass<AMDGPUArgumentUsageAnalysis, Module>()); + if (TM.getOptLevel() > CodeGenOptLevel::None) { addPass(FlattenCFGPass()); addPass(SinkingPass()); @@ -2345,11 +2398,10 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { addPass(SIMemoryLegalizerPass()); addPass(SIInsertWaitcntsPass()); - // TODO: addPass(SIModeRegisterPass()); + addPass(SIModeRegisterPass()); - if (TM.getOptLevel() > CodeGenOptLevel::None) { - // TODO: addPass(SIInsertHardClausesPass()); - } + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIInsertHardClausesPass()); addPass(SILateBranchLoweringPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 03d16fd..dfa2151 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1241,46 +1241,123 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, (ScalarSize == 16 || ScalarSize == 8)) { // Larger vector widths may require additional instructions, but are // typically cheaper than scalarized versions. - unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements(); - unsigned RequestedElts = - count_if(Mask, [](int MaskElt) { return MaskElt != -1; }); + // + // We assume that shuffling at a register granularity can be done for free. 
+ // This is not true for vectors fed into memory instructions, but it is + // effectively true for all other shuffling. The emphasis of the logic here + // is to assist generic transform in cleaning up / canonicalizing those + // shuffles. + + // With op_sel VOP3P instructions freely can access the low half or high + // half of a register, so any swizzle of two elements is free. + if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) { + unsigned NumSrcElts = SrcVecTy->getNumElements(); + if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 && + (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse || + Kind == TTI::SK_PermuteSingleSrc)) + return 0; + } + unsigned EltsPerReg = 32 / ScalarSize; - if (RequestedElts == 0) - return 0; switch (Kind) { case TTI::SK_Broadcast: + // A single v_perm_b32 can be re-used for all destination registers. + return 1; case TTI::SK_Reverse: - case TTI::SK_PermuteSingleSrc: { - // With op_sel VOP3P instructions freely can access the low half or high - // half of a register, so any swizzle of two elements is free. - if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2) - return 0; - unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; - // SK_Broadcast just reuses the same mask - unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; - return NumPerms + NumPermMasks; - } + // One instruction per register. + if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy)) + return divideCeil(DstVecTy->getNumElements(), EltsPerReg); + return InstructionCost::getInvalid(); case TTI::SK_ExtractSubvector: + if (Index % EltsPerReg == 0) + return 0; // Shuffling at register granularity + if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy)) + return divideCeil(DstVecTy->getNumElements(), EltsPerReg); + return InstructionCost::getInvalid(); case TTI::SK_InsertSubvector: { - // Even aligned accesses are free - if (!(Index % 2)) - return 0; - // Insert/extract subvectors only require shifts / extract code to get the - // relevant bits - return alignTo(RequestedElts, EltsPerReg) / EltsPerReg; + auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy); + if (!DstVecTy) + return InstructionCost::getInvalid(); + unsigned NumDstElts = DstVecTy->getNumElements(); + unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements(); + unsigned EndIndex = Index + NumInsertElts; + unsigned BeginSubIdx = Index % EltsPerReg; + unsigned EndSubIdx = EndIndex % EltsPerReg; + unsigned Cost = 0; + + if (BeginSubIdx != 0) { + // Need to shift the inserted vector into place. The cost is the number + // of destination registers overlapped by the inserted vector. + Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg); + } + + // If the last register overlap is partial, there may be three source + // registers feeding into it; that takes an extra instruction. + if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx) + Cost += 1; + + return Cost; } - case TTI::SK_PermuteTwoSrc: - case TTI::SK_Splice: - case TTI::SK_Select: { - unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; - // SK_Select just reuses the same mask - unsigned NumPermMasks = Kind == TTI::SK_Select ? 
1 : NumPerms; - return NumPerms + NumPermMasks; + case TTI::SK_Splice: { + auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy); + if (!DstVecTy) + return InstructionCost::getInvalid(); + unsigned NumElts = DstVecTy->getNumElements(); + assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements()); + // Determine the sub-region of the result vector that requires + // sub-register shuffles / mixing. + unsigned EltsFromLHS = NumElts - Index; + bool LHSIsAligned = (Index % EltsPerReg) == 0; + bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0; + if (LHSIsAligned && RHSIsAligned) + return 0; + if (LHSIsAligned && !RHSIsAligned) + return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg); + if (!LHSIsAligned && RHSIsAligned) + return divideCeil(EltsFromLHS, EltsPerReg); + return divideCeil(NumElts, EltsPerReg); } - default: break; } + + if (!Mask.empty()) { + unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements(); + + // Generically estimate the cost by assuming that each destination + // register is derived from sources via v_perm_b32 instructions if it + // can't be copied as-is. + // + // For each destination register, derive the cost of obtaining it based + // on the number of source registers that feed into it. + unsigned Cost = 0; + for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) { + SmallVector<int, 4> Regs; + bool Aligned = true; + for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) { + int SrcIdx = Mask[DstIdx + I]; + if (SrcIdx == -1) + continue; + int Reg; + if (SrcIdx < (int)NumSrcElts) { + Reg = SrcIdx / EltsPerReg; + if (SrcIdx % EltsPerReg != I) + Aligned = false; + } else { + Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg; + if ((SrcIdx - NumSrcElts) % EltsPerReg != I) + Aligned = false; + } + if (!llvm::is_contained(Regs, Reg)) + Regs.push_back(Reg); + } + if (Regs.size() >= 2) + Cost += Regs.size() - 1; + else if (!Aligned) + Cost += 1; + } + return Cost; + } } return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 65e6ed9..c52eb4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, const UniformityInfo &UI, ValueMap<const Value *, bool> &Tracker) { llvm::Intrinsic::ID IID = II.getIntrinsicID(); - + /// We deliberately do not simplify readfirstlane with a uniform argument, so + /// that frontends can use it to force a copy to SGPR and thereby prevent the + /// backend from generating unwanted waterfall loops. 
switch (IID) { case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { Value *Src = II.getArgOperand(0); if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) @@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return Changed; } default: - llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + return false; } return false; } @@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { auto *II = dyn_cast<IntrinsicInst>(&I); if (!II) continue; - - switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_readlane: - case Intrinsic::amdgcn_ballot: - break; - default: - continue; - } IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d5..fe81a5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. 
+ BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. 
- BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index 61c5dcd..8d8386c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -44,6 +44,7 @@ namespace { class AMDGPUWaitSGPRHazards { public: + const GCNSubtarget *ST; const SIInstrInfo *TII; const SIRegisterInfo *TRI; const MachineRegisterInfo *MRI; @@ -54,7 +55,7 @@ public: bool CullSGPRHazardsAtMemWait; unsigned CullSGPRHazardsMemWaitThreshold; - AMDGPUWaitSGPRHazards() {} + AMDGPUWaitSGPRHazards() = default; // Return the numeric ID 0-127 for a given SGPR. static std::optional<unsigned> sgprNumber(Register Reg, @@ -165,7 +166,7 @@ public: } unsigned mergeMasks(unsigned Mask1, unsigned Mask2) { - unsigned Mask = 0xffff; + unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST); Mask = AMDGPU::DepCtr::encodeFieldSaSdst( Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1), AMDGPU::DepCtr::decodeFieldSaSdst(Mask2))); @@ -387,7 +388,7 @@ public: // Apply wait if (Wait) { - unsigned Mask = 0xffff; + unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST); if (Wait & WA_VCC) { State.VCCHazard &= ~HazardState::VALU; Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0); @@ -438,8 +439,8 @@ public: } bool run(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasVALUReadSGPRHazard()) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasVALUReadSGPRHazard()) return false; // Parse settings @@ -467,10 +468,10 @@ public: if (!EnableSGPRHazardWaits) return false; - TII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); MRI = &MF.getRegInfo(); - DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS; + DsNopCount = ST->isWave64() ? 
WAVE64_NOPS : WAVE32_NOPS; auto CallingConv = MF.getFunction().getCallingConv(); if (!AMDGPU::isEntryFunctionCC(CallingConv) && diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 09338c5..7a91a40 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -347,6 +347,11 @@ public: return isRegKind() && getReg() == AMDGPU::SGPR_NULL; } + bool isAV_LdSt_32_Align2_RegOp() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + isRegClass(AMDGPU::AGPR_32RegClassID); + } + bool isVRegWithInputMods() const; template <bool IsFake16> bool isT16_Lo128VRegWithInputMods() const; template <bool IsFake16> bool isT16VRegWithInputMods() const; @@ -1865,7 +1870,7 @@ private: unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; - unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const; bool isSupportedMnemo(StringRef Mnemo, const FeatureBitset &FBS); @@ -3665,7 +3670,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const { return ""; } -unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { +MCRegister +AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (MCPhysReg Reg : Desc.implicit_uses()) { switch (Reg) { @@ -3679,7 +3685,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { break; } } - return AMDGPU::NoRegister; + return MCRegister(); } // NB: This code is correct only when used to check constant @@ -3854,9 +3860,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( LiteralSize = 4; } - SmallDenseSet<unsigned> SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); - if (SGPRUsed != AMDGPU::NoRegister) { + SmallDenseSet<MCRegister> SGPRsUsed; + MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed) { SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } @@ -7043,6 +7049,12 @@ ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name, if (Name == "a16" && !hasA16()) return Error(S, "a16 modifier is not supported on this GPU"); + if (Bit == 0 && Name == "gds") { + StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken(); + if (Mnemo.starts_with("ds_gws")) + return Error(S, "nogds is not allowed"); + } + if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16) ImmTy = AMDGPUOperand::ImmTyR128A16; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index b97b738..bb0e938 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -783,37 +783,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, multiclass MUBUF_Pseudo_Atomics_RTN <string opName, RegisterOperand vdataClass, - ValueType vdataType, - SDPatternOperator atomic> { + ValueType vdataType> { let FPAtomic = vdataType.isFP in { - def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_RTN">; - - def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, 
i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_RTN">; - + def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>, + MUBUFAddr64Table <0, NAME # "_RTN">; + def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>, + MUBUFAddr64Table <1, NAME # "_RTN">; def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>; def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>; def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>; - def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; - - def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; - + def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>, + MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; + def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>, + MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>; def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>; def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>; @@ -822,10 +805,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, multiclass MUBUF_Pseudo_Atomics <string opName, RegisterOperand vdataClass, - ValueType vdataType, - SDPatternOperator atomic = null_frag> : + ValueType vdataType> : MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>, - MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>; + MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType>; //===----------------------------------------------------------------------===// @@ -1096,7 +1078,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < let OtherPredicates = [HasGFX10_BEncoding] in { defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub + "buffer_atomic_csub", VGPROp_32, i32 >; } @@ -1117,22 +1099,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag + "buffer_atomic_fcmpswap", AVLdSt_64, v2f32 >; } let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmin", AVLdSt_32, f32, null_frag + "buffer_atomic_fmin", AVLdSt_32, f32 >; defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmax", AVLdSt_32, f32, null_frag + "buffer_atomic_fmax", AVLdSt_32, f32 >; } let SubtargetPredicate = isGFX6GFX7GFX10 in { defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag + "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64 >; } @@ -1201,12 +1183,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < let SubtargetPredicate = HasAtomicFaddRtnInsts in defm 
BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< - "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag + "buffer_atomic_add_f32", AVLdSt_32, f32 >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag + "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16 >; let SubtargetPredicate = isGFX12Plus in { diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index a1e0e52..782cbfa 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables) tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) @@ -39,10 +40,6 @@ tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(AMDGPUCommonTableGen) -set(LLVM_TARGET_DEFINITIONS InstCombineTables.td) -tablegen(LLVM InstCombineTables.inc -gen-searchable-tables) -add_public_tablegen_target(InstCombineTableGen) - add_llvm_target(AMDGPUCodeGen AMDGPUAliasAnalysis.cpp AMDGPUAlwaysInlinePass.cpp @@ -81,6 +78,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp AMDGPUPrepareAGPRAlloc.cpp + AMDGPULowerExecSync.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index b841171..3a53cef 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -463,7 +463,7 @@ class DS_GWS_0D <string opName> class DS_GWS_1D <string opName> : DS_GWS<opName, - (ins AVLdSt_32:$data0, Offset:$offset), + (ins AV_LdSt_32_Align2_RegOp:$data0, Offset:$offset), " $data0$offset gds"> { let has_gws_data0 = 1; @@ -886,17 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3 def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32", int_amdgcn_ds_bpermute_fi_b32>; -multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst, - ValueType vt, string frag> { - def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_addrspace")>; - - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in - def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>; -} - -defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">; } // let SubtargetPredicate = isGFX12Plus let SubtargetPredicate = isGFX1250Plus in { @@ -1279,6 +1268,14 @@ defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "ato defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">; } +let SubtargetPredicate = HasAtomicDsCondSubClampInsts in { + +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">; + +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">; + +} // let SubtargetPredicate = HasAtomicDsCondSubClampInsts + let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, 
DS_CMPST_B32, i32, "atomic_cmp_swap">; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e3f3aba..dd3120f 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1199,8 +1199,8 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { // Given a wide tuple \p Reg check if it will overflow 256 registers. // \returns \p Reg on success or NoRegister otherwise. -static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, - const MCRegisterInfo &MRI) { +static MCRegister CheckVGPROverflow(MCRegister Reg, const MCRegisterClass &RC, + const MCRegisterInfo &MRI) { unsigned NumRegs = RC.getSizeInBits() / 32; MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0); if (!Sub0) @@ -1214,7 +1214,7 @@ static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, assert(BaseReg && "Only vector registers expected"); - return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister; + return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : MCRegister(); } // Note that before gfx10, the MIMG encoding provided no information about @@ -1456,9 +1456,8 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V, return MCOperand(); } -inline -MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const { - return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI)); +inline MCOperand AMDGPUDisassembler::createRegOperand(MCRegister Reg) const { + return MCOperand::createReg(AMDGPU::getMCReg(Reg, STI)); } inline diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index d103d79..ab130db 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -69,7 +69,7 @@ public: const char* getRegClassName(unsigned RegClassID) const; - MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(MCRegister Reg) const; MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const; diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index dadc7dc..a2e3ece 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -505,7 +505,6 @@ def : AMDGPUPat < (fshr i32:$src0, i32:$src1, i32:$src2), (BIT_ALIGN_INT_eg $src0, $src1, $src2) >; -def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; def FMA_eg : FMA_Common<0x7>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 8ea64d1..9e38af9 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -262,8 +262,18 @@ multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterOperand regClass = AVLdS multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { defm "" : FLAT_Flat_Load_Pseudo<opName, AVLdSt_32, 1>; - let True16Predicate = UseRealTrue16Insts in - defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPROp_16>, True16D16Table<NAME#"_HI", NAME>; + + defvar Name16 = opName#"_t16"; + let True16Predicate = UseRealTrue16Insts in { + def _t16 : FLAT_Load_Pseudo<Name16, VGPROp_16>, + 
GlobalSaddrTable<0, Name16>, + True16D16Table<NAME#"_HI", NAME>; + + let OtherPredicates = [HasFlatGVSMode] in + def _t16_SADDR : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>, + GlobalSaddrTable<1, Name16>, + True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">; + } } class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass, @@ -1552,10 +1562,6 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType } } -multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix, - ValueType vt> : - FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>; - multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>; @@ -1580,10 +1586,6 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, } } -multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix, - ValueType vt> : - FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>; - multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>; @@ -2169,14 +2171,16 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>; defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } +let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in { + defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >; + + defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>; +} } // end foreach as defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; -defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; -defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; - let OtherPredicates = [HasD16LoadStore] in { defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; @@ -2340,10 +2344,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; -defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>; let SubtargetPredicate = HasAtomicCSubNoRtnInsts in -defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; +defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; @@ -2360,10 +2364,8 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; let SubtargetPredicate = isGFX12Plus in { - 
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; - - let SubtargetPredicate = HasAtomicCSubNoRtnInsts in - defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; + defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>; + defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>; } let OtherPredicates = [isGFX12PlusNot12_50] in diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca..6f1a521 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -190,6 +190,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (checkFPAtomicToDenormModeHazard(MI) > 0) return HazardType; + // Hazards which cannot be mitigated with S_NOPs. + if (!IsHazardRecognizerMode) { + if (checkWMMACoexecutionHazards(MI) > 0) + return Hazard; + } + if (ST.hasNoDataDepHazard()) return NoHazard; @@ -435,10 +441,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; - -using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; -using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; // Search for a hazard in a block and its predecessors. template <typename StateT> @@ -546,11 +549,14 @@ hasHazard(StateT InitialState, // Returns a minimum wait states since \p I walking all predecessors. // Only scans until \p IsExpired does not return true. // Can only be run in a hazard recognizer mode. -static int getWaitStatesSince( - GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, - MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, - IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, - GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { +static int +getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + const MachineBasicBlock *MBB, + MachineBasicBlock::const_reverse_instr_iterator I, + int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, + DenseSet<const MachineBasicBlock *> &Visited, + GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates = + SIInstrInfo::getNumWaitStates) { for (auto E = MBB->instr_rend(); I != E; ++I) { // Don't add WaitStates for parent BUNDLE instructions. 
if (I->isBundle()) @@ -582,20 +588,26 @@ static int getWaitStatesSince( return MinWaitStates; } -static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, - const MachineInstr *MI, IsExpiredFn IsExpired) { +static int +getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + const MachineInstr *MI, + GCNHazardRecognizer::IsExpiredFn IsExpired, + GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates = + SIInstrInfo::getNumWaitStates) { DenseSet<const MachineBasicBlock *> Visited; return getWaitStatesSince(IsHazard, MI->getParent(), std::next(MI->getReverseIterator()), 0, IsExpired, - Visited, SIInstrInfo::getNumWaitStates); + Visited, GetNumWaitStates); } -int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { +int GCNHazardRecognizer::getWaitStatesSince( + IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) { if (IsHazardRecognizerMode) { auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { return WaitStates >= Limit; }; - return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); + return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn, + GetNumWaitStates); } int WaitStates = 0; @@ -607,7 +619,7 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { if (MI->isInlineAsm()) continue; } - ++WaitStates; + WaitStates += MI ? GetNumWaitStates(*MI) : 1; if (WaitStates >= Limit) break; @@ -615,6 +627,10 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { return std::numeric_limits<int>::max(); } +int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { + return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates); +} + int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit) { @@ -643,7 +659,7 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg) { for (MCRegUnit Unit : TRI.regunits(Reg)) - BV.set(Unit); + BV.set(static_cast<unsigned>(Unit)); } static void addRegsToSet(const SIRegisterInfo &TRI, @@ -1243,6 +1259,20 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); } +// emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need +// to insert, negative means not needed. +bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) { + if (WaitStatesNeeded <= 0) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + for (int I = 0; I < WaitStatesNeeded; ++I) + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_NOP_e32)); + + return true; +} + void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVMEMtoScalarWriteHazards(MI); fixVcmpxPermlaneHazards(MI); @@ -1257,7 +1287,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVALUTransUseHazard(MI); fixVALUTransCoexecutionHazards(MI); fixWMMAHazards(MI); // fall-through if co-execution is enabled. 
- fixWMMACoexecutionHazards(MI); + emitVNops(MI, checkWMMACoexecutionHazards(MI)); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); fixRequiredExportPriority(MI); @@ -1354,7 +1384,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); return true; } @@ -1487,7 +1517,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST)); return true; } @@ -1502,9 +1532,8 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, bool HasVmem = false; for (auto &MBB : MF) { for (auto &MI : MBB) { - HasLds |= SIInstrInfo::isDS(MI); - HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || - SIInstrInfo::isSegmentSpecificFLAT(MI); + HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI); + HasVmem |= SIInstrInfo::isVMEM(MI); if (HasLds && HasVmem) return true; } @@ -1526,10 +1555,9 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { assert(!ST.hasExtendedWaitCounts()); auto IsHazardInst = [](const MachineInstr &MI) { - if (SIInstrInfo::isDS(MI)) + if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI)) return 1; - if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || - SIInstrInfo::isSegmentSpecificFLAT(MI)) + if (SIInstrInfo::isVMEM(MI)) return 2; return 0; }; @@ -1653,7 +1681,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { } else { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); } return true; @@ -1811,7 +1839,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST)); return true; } @@ -1897,7 +1925,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { // avoided. BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST)); return true; } @@ -2047,13 +2075,13 @@ static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, return false; } -bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { +int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) { if (!AMDGPU::isGFX1250(ST)) - return false; + return 0; const SIInstrInfo *TII = ST.getInstrInfo(); if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI)) - return false; + return 0; const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -2131,9 +2159,6 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { }; int Limit = 0; - auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) { - return WaitStates >= Limit; - }; auto GetWaitStatesFn = [](const MachineInstr &I) { return SIInstrInfo::isVALU(I) ? 
1 : 0; @@ -2143,38 +2168,26 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { if (TII->isXDLWMMA(*MI)) { for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { Limit = WMMAWaitStates[Category]; // for IsExpiredFn. - DenseSet<const MachineBasicBlock *> Visited; - // '::getWaitStatesSince' returns the number of VALUs in between if hazard + // 'getWaitStatesSince' returns the number of VALUs in between if hazard // exists, and INT_MAX if there is no hazard. As a result, a negative // WaitStatesNeeded here means no hazard, and we will continue to search // for other categories. WaitStatesNeeded = - Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(), - std::next(MI->getReverseIterator()), 0, - IsExpiredFn, Visited, GetWaitStatesFn); + Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn); } } else { // Must be a co-executable VALU. for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { Limit = VALUWaitStates[Category]; // for IsExpiredFn. - DenseSet<const MachineBasicBlock *> Visited; - // '::getWaitStatesSince' returns the number of VALUs in between if hazard + // 'getWaitStatesSince' returns the number of VALUs in between if hazard // exists, and INT_MAX if there is no hazard. As a result, a negative // WaitStatesNeeded here means no hazard, and we will continue to search // for other categories. WaitStatesNeeded = - Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(), - std::next(MI->getReverseIterator()), 0, - IsExpiredFn, Visited, GetWaitStatesFn); + Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn); } } - // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative - // means not needed. - for (int i = 0; i < WaitStatesNeeded; i++) - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_NOP_e32)); - - return true; + return WaitStatesNeeded; } bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { @@ -3406,7 +3419,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { }; const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst( - AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0), + AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0, ST), + 0), 0); auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) { switch (I.getOpcode()) { @@ -3458,9 +3472,9 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { // Compute counter mask unsigned DepCtr = - IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0) - : AMDGPU::DepCtr::encodeFieldVaSdst(0)) - : AMDGPU::DepCtr::encodeFieldSaSdst(0); + IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST) + : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST)) + : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST); // Try to merge previous waits into this one for regions with no SGPR reads. 
if (!WaitInstrs.empty()) { @@ -3725,7 +3739,7 @@ bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) .addImm(AMDGPU::DepCtr::encodeFieldVaSdst( - AMDGPU::DepCtr::encodeFieldSaSdst(0), 0)); + AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0)); return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 67beffa..d725134 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -32,6 +32,8 @@ class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { public: typedef function_ref<bool(const MachineInstr &)> IsHazardFn; + typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn; + typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn; private: // Distinguish if we are called from scheduler or hazard recognizer @@ -74,6 +76,8 @@ private: // used on a newly inserted instruction before returning from PreEmitNoops. void runOnInstruction(MachineInstr *MI); + int getWaitStatesSince(IsHazardFn IsHazard, int Limit, + GetNumWaitStatesFn GetNumWaitStates); int getWaitStatesSince(IsHazardFn IsHazard, int Limit); int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit); int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit); @@ -94,6 +98,9 @@ private: int checkReadM0Hazards(MachineInstr *SMovRel); int checkNSAtoVMEMHazard(MachineInstr *MI); int checkFPAtomicToDenormModeHazard(MachineInstr *MI); + // Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we + // need to insert, negative means not needed. + bool emitVNops(MachineInstr *MI, int WaitStatesNeeded); void fixHazards(MachineInstr *MI); bool fixVcmpxPermlaneHazards(MachineInstr *MI); bool fixVMEMtoScalarWriteHazards(MachineInstr *MI); @@ -106,7 +113,7 @@ private: bool fixVALUTransUseHazard(MachineInstr *MI); bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); - bool fixWMMACoexecutionHazards(MachineInstr *MI); + int checkWMMACoexecutionHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce69..1682abb 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ public: bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9..62172a0 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); // Immediates are not an issue and can be propagated in // postrapseudos pass. 
Only handle cases where defining diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 4e11c4f..2cb76a5 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -282,11 +282,12 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, Register Reg = MO.getReg(); auto I = llvm::find_if(VRegMaskOrUnits, [Reg](const VRegMaskOrUnit &RM) { - return RM.RegUnit == Reg; + return RM.VRegOrUnit.asVirtualReg() == Reg; }); auto &P = I == VRegMaskOrUnits.end() - ? VRegMaskOrUnits.emplace_back(Reg, LaneBitmask::getNone()) + ? VRegMaskOrUnits.emplace_back(VirtRegOrUnit(Reg), + LaneBitmask::getNone()) : *I; P.LaneMask |= MO.getSubReg() ? TRI.getSubRegIndexLaneMask(MO.getSubReg()) @@ -295,7 +296,7 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, SlotIndex InstrSI; for (auto &P : VRegMaskOrUnits) { - auto &LI = LIS.getInterval(P.RegUnit); + auto &LI = LIS.getInterval(P.VRegOrUnit.asVirtualReg()); if (!LI.hasSubRanges()) continue; @@ -312,29 +313,22 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, /// Mostly copy/paste from CodeGen/RegisterPressure.cpp static LaneBitmask getLanesWithProperty( const LiveIntervals &LIS, const MachineRegisterInfo &MRI, - bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, - LaneBitmask SafeDefault, + bool TrackLaneMasks, Register Reg, SlotIndex Pos, function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) { - if (RegUnit.isVirtual()) { - const LiveInterval &LI = LIS.getInterval(RegUnit); - LaneBitmask Result; - if (TrackLaneMasks && LI.hasSubRanges()) { - for (const LiveInterval::SubRange &SR : LI.subranges()) { - if (Property(SR, Pos)) - Result |= SR.LaneMask; - } - } else if (Property(LI, Pos)) { - Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) - : LaneBitmask::getAll(); + assert(Reg.isVirtual()); + const LiveInterval &LI = LIS.getInterval(Reg); + LaneBitmask Result; + if (TrackLaneMasks && LI.hasSubRanges()) { + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (Property(SR, Pos)) + Result |= SR.LaneMask; } - - return Result; + } else if (Property(LI, Pos)) { + Result = + TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(Reg) : LaneBitmask::getAll(); } - const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); - if (LR == nullptr) - return SafeDefault; - return Property(*LR, Pos) ? 
LaneBitmask::getAll() : LaneBitmask::getNone(); + return Result; } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp @@ -502,10 +496,9 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp -LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, - SlotIndex Pos) const { +LaneBitmask GCNRPTracker::getLastUsedLanes(Register Reg, SlotIndex Pos) const { return getLanesWithProperty( - LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(), + LIS, *MRI, true, Reg, Pos.getBaseIndex(), [](const LiveRange &LR, SlotIndex Pos) { const LiveRange::Segment *S = LR.getSegmentContaining(Pos); return S != nullptr && S->end == Pos.getRegSlot(); @@ -562,10 +555,10 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { SmallVector<VRegMaskOrUnit, 8> RegUses; collectVirtualRegUses(RegUses, MI, LIS, *MRI); for (const VRegMaskOrUnit &U : RegUses) { - LaneBitmask &LiveMask = LiveRegs[U.RegUnit]; + LaneBitmask &LiveMask = LiveRegs[U.VRegOrUnit.asVirtualReg()]; LaneBitmask PrevMask = LiveMask; LiveMask |= U.LaneMask; - CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI); + CurPressure.inc(U.VRegOrUnit.asVirtualReg(), PrevMask, LiveMask, *MRI); } // Update MaxPressure with uses plus early-clobber defs pressure. @@ -580,7 +573,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { bool GCNDownwardRPTracker::reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy) { - MRI = &MI.getParent()->getParent()->getRegInfo(); + MRI = &MI.getMF()->getRegInfo(); LastTrackedMI = nullptr; MBBEnd = MI.getParent()->end(); NextMI = &MI; @@ -748,9 +741,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, GCNRegPressure TempPressure = CurPressure; for (const VRegMaskOrUnit &Use : RegOpers.Uses) { - Register Reg = Use.RegUnit; - if (!Reg.isVirtual()) + if (!Use.VRegOrUnit.isVirtualReg()) continue; + Register Reg = Use.VRegOrUnit.asVirtualReg(); LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx); if (LastUseMask.none()) continue; @@ -782,9 +775,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, // Generate liveness for defs. for (const VRegMaskOrUnit &Def : RegOpers.Defs) { - Register Reg = Def.RegUnit; - if (!Reg.isVirtual()) + if (!Def.VRegOrUnit.isVirtualReg()) continue; + Register Reg = Def.VRegOrUnit.asVirtualReg(); auto It = LiveRegs.find(Reg); LaneBitmask LiveMask = It != LiveRegs.end() ? It->second : LaneBitmask(0); LaneBitmask NewMask = LiveMask | Def.LaneMask; @@ -824,8 +817,7 @@ Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs, Register Reg = Register::index2VirtReg(I); auto It = LiveRegs.find(Reg); if (It != LiveRegs.end() && It->second.any()) - OS << ' ' << printVRegOrUnit(Reg, TRI) << ':' - << PrintLaneMask(It->second); + OS << ' ' << printReg(Reg, TRI) << ':' << PrintLaneMask(It->second); } OS << '\n'; }); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 4b22c68..f9d3ce0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -292,7 +292,7 @@ protected: /// Mostly copy/paste from CodeGen/RegisterPressure.cpp void bumpDeadDefs(ArrayRef<VRegMaskOrUnit> DeadDefs); - LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; + LaneBitmask getLastUsedLanes(Register Reg, SlotIndex Pos) const; public: // reset tracker and set live register set to the specified value. 
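The GCNRegPressure hunks above switch VRegMaskOrUnit from holding a raw Register (the old RegUnit field) to a VirtRegOrUnit wrapper, so every consumer now goes through isVirtualReg()/asVirtualReg(). Below is a minimal sketch of that access pattern, using only the calls visible in these hunks and assuming LLVM's CodeGen headers are available; the helper name findLaneMaskFor is illustrative and is not part of the patch.

  // Illustrative only: mirrors how the patch reads VRegMaskOrUnit entries.
  // VRegMaskOrUnit, VirtRegOrUnit, Register and LaneBitmask are LLVM types;
  // the header below is their expected location in the CodeGen tree.
  #include "llvm/CodeGen/RegisterPressure.h"
  using namespace llvm;

  // Return the lane mask recorded for a virtual register, if any.
  static LaneBitmask findLaneMaskFor(ArrayRef<VRegMaskOrUnit> Entries,
                                     Register Reg) {
    for (const VRegMaskOrUnit &E : Entries) {
      if (!E.VRegOrUnit.isVirtualReg())        // skip physical register units
        continue;
      if (E.VRegOrUnit.asVirtualReg() == Reg)  // unwrap back to a Register
        return E.LaneMask;
    }
    return LaneBitmask::getNone();
  }

  // Building an entry now wraps the Register explicitly, as in
  // collectVirtualRegUses() above:
  //   Entries.emplace_back(VirtRegOrUnit(Reg), LaneBitmask::getNone());

The same unwrap-and-compare shape appears in recede() and bumpDownwardPressure() in the hunks above.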
@@ -455,7 +455,7 @@ template <typename Range> DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet> getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { std::vector<SlotIndex> Indexes; - Indexes.reserve(std::distance(R.begin(), R.end())); + Indexes.reserve(llvm::size(R)); auto &SII = *LIS.getSlotIndexes(); for (MachineInstr *I : R) { auto SI = SII.getInstructionIndex(*I); @@ -463,7 +463,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { } llvm::sort(Indexes); - auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo(); + auto &MRI = (*R.begin())->getMF()->getRegInfo(); DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap; SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { @@ -493,13 +493,13 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI, const LiveIntervals &LIS) { return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, - MI.getParent()->getParent()->getRegInfo()); + MI.getMF()->getRegInfo()); } inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS) { return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS, - MI.getParent()->getParent()->getRegInfo()); + MI.getMF()->getRegInfo()); } template <typename Range> diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 9fbf9e5..c8ce3aa 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -978,10 +978,8 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd) { - auto REnd = RegionEnd == RegionBegin->getParent()->end() - ? std::prev(RegionEnd) - : RegionEnd; - return &*skipDebugInstructionsBackward(REnd, RegionBegin); + assert(RegionBegin != RegionEnd && "Region must not be empty"); + return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin); } void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, @@ -1076,9 +1074,12 @@ GCNScheduleDAGMILive::getRegionLiveOutMap() const { assert(!Regions.empty()); std::vector<MachineInstr *> RegionLastMIs; RegionLastMIs.reserve(Regions.size()); - for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) + for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) { + // Skip empty regions. + if (RegionBegin == RegionEnd) + continue; RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); - + } return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); } @@ -1088,10 +1089,12 @@ void RegionPressureMap::buildLiveRegMap() { RegionLiveRegMap = IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); for (unsigned I = 0; I < DAG->Regions.size(); I++) { + auto &[RegionBegin, RegionEnd] = DAG->Regions[I]; + // Skip empty regions. + if (RegionBegin == RegionEnd) + continue; MachineInstr *RegionKey = - IsLiveOut - ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second) - : &*DAG->Regions[I].first; + IsLiveOut ? 
getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin; IdxToInstruction[I] = RegionKey; } } @@ -1228,18 +1231,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; - // Aggressivly try to reduce register pressure in the unclustered high RP + // Aggressively try to reduce register pressure in the unclustered high RP // stage. Temporarily increase occupancy target in the region. + TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy + ? InitialOccupancy + 1 + : InitialOccupancy; + IsAnyRegionScheduled = false; S.SGPRLimitBias = S.HighRPSGPRBias; S.VGPRLimitBias = S.HighRPVGPRBias; - if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) - MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); LLVM_DEBUG( dbgs() << "Retrying function scheduling without clustering. " - "Aggressivly try to reduce register pressure to achieve occupancy " - << DAG.MinOccupancy << ".\n"); + "Aggressively try to reduce register pressure to achieve occupancy " + << TempTargetOccupancy << ".\n"); return true; } @@ -1320,9 +1325,16 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); S.SGPRLimitBias = S.VGPRLimitBias = 0; if (DAG.MinOccupancy > InitialOccupancy) { + assert(IsAnyRegionScheduled); LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " << DAG.MinOccupancy << '\n'); + } else if (!IsAnyRegionScheduled) { + assert(DAG.MinOccupancy == InitialOccupancy); + LLVM_DEBUG(dbgs() << StageID + << ": No regions scheduled, min occupancy stays at " + << DAG.MinOccupancy << ", MFI occupancy stays at " + << MFI.getOccupancy() << ".\n"); } GCNSchedStage::finalizeGCNSchedStage(); @@ -1396,13 +1408,27 @@ bool UnclusteredHighRPStage::initGCNRegion() { // rescheduling of previous regions did not make occupancy drop back down to // the initial minimum). unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + // If no region has been scheduled yet, the DAG has not yet been updated with + // the occupancy target. So retrieve it from the temporary. + unsigned CurrentTargetOccupancy = + IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy; if (!DAG.RegionsWithExcessRP[RegionIdx] && - (DAG.MinOccupancy <= InitialOccupancy || + (CurrentTargetOccupancy <= InitialOccupancy || DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != InitialOccupancy)) return false; - return GCNSchedStage::initGCNRegion(); + bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion(); + // If this is the first region scheduled during this stage, make the target + // occupancy changes in the DAG and MFI. + if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { + IsAnyRegionScheduled = true; + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) { + DAG.MinOccupancy = TempTargetOccupancy; + MFI.increaseOccupancy(MF, TempTargetOccupancy); + } + } + return IsSchedulingThisRegion; } bool ClusteredLowOccStage::initGCNRegion() { @@ -2011,7 +2037,7 @@ void PreRARematStage::rematerialize() { // Rematerialize DefMI to its use block. TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); + AMDGPU::NoSubRegister, *DefMI); Remat.RematMI = &*std::prev(InsertPos); DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); @@ -2163,8 +2189,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // Re-rematerialize MI at the end of its original region. 
Note that it may // not be rematerialized exactly in the same position as originally within // the region, but it should not matter much. - TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, - *DAG.TRI); + TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI); MachineInstr *NewMI = &*std::prev(InsertPos); DAG.LIS->InsertMachineInstrInMaps(*NewMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 975781f..95a931b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -183,7 +183,7 @@ class ScheduleMetrics { unsigned BubbleCycles; public: - ScheduleMetrics() {} + ScheduleMetrics() = default; ScheduleMetrics(unsigned L, unsigned BC) : ScheduleLength(L), BubbleCycles(BC) {} unsigned getLength() const { return ScheduleLength; } @@ -217,7 +217,7 @@ class RegionPressureMap { bool IsLiveOut; public: - RegionPressureMap() {} + RegionPressureMap() = default; RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) : DAG(GCNDAG), IsLiveOut(LiveOut) {} // Build the Instr->LiveReg and RegionIdx->Instr maps @@ -417,6 +417,10 @@ class UnclusteredHighRPStage : public GCNSchedStage { private: // Save the initial occupancy before starting this stage. unsigned InitialOccupancy; + // Save the temporary target occupancy before starting this stage. + unsigned TempTargetOccupancy; + // Track whether any region was scheduled by this stage. + bool IsAnyRegionScheduled; public: bool initGCNSchedStage() override; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f377b8a..ddff3ad 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -166,6 +166,13 @@ protected: bool HasMAIInsts = false; bool HasFP8Insts = false; bool HasFP8ConversionInsts = false; + bool HasCubeInsts = false; + bool HasLerpInst = false; + bool HasSadInsts = false; + bool HasQsadInsts = false; + bool HasCvtNormInsts = false; + bool HasCvtPkNormVOP2Insts = false; + bool HasCvtPkNormVOP3Insts = false; bool HasFP8E5M3Insts = false; bool HasCvtFP8Vop1Bug = false; bool HasPkFmacF16Inst = false; @@ -892,6 +899,20 @@ public: bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } + bool hasCubeInsts() const { return HasCubeInsts; } + + bool hasLerpInst() const { return HasLerpInst; } + + bool hasSadInsts() const { return HasSadInsts; } + + bool hasQsadInsts() const { return HasQsadInsts; } + + bool hasCvtNormInsts() const { return HasCvtNormInsts; } + + bool hasCvtPkNormVOP2Insts() const { return HasCvtPkNormVOP2Insts; } + + bool hasCvtPkNormVOP3Insts() const { return HasCvtPkNormVOP3Insts; } + bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; } bool hasPkFmacF16Inst() const { @@ -1420,6 +1441,13 @@ public: /// \returns true if the target has instructions with xf32 format support. bool hasXF32Insts() const { return HasXF32Insts; } + /// \returns true if the target has packed f32 instructions that only read 32 + /// bits from a scalar operand (SGPR or literal) and replicates the bits to + /// both channels. 
+ bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const { + return getGeneration() == GFX12 && GFX1250Insts; + } + bool hasBitOp3Insts() const { return HasBitOp3Insts; } bool hasPermlane16Swap() const { return HasPermlane16Swap; } @@ -1595,6 +1623,10 @@ public: return hasKernargPreload() && !GFX1250Insts; } + bool hasCondSubInsts() const { return GFX12Insts; } + + bool hasSubClampInsts() const { return hasGFX10_3Insts(); } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); @@ -1840,12 +1872,21 @@ public: return GFX1250Insts && getGeneration() == GFX12; } + // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit + // result. + bool hasFlatScratchHiInB64InstHazard() const { + return GFX1250Insts && getGeneration() == GFX12; + } + /// \returns true if the subtarget supports clusters of workgroups. bool hasClusters() const { return HasClusters; } - /// \returns true if the subtarget requires a wait for xcnt before atomic - /// flat/global stores & rmw. - bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } + /// \returns true if the subtarget requires a wait for xcnt before VMEM + /// accesses that must never be repeated in the event of a page fault/re-try. + /// Atomic stores/rmw and all volatile accesses fall under this criteria. + bool requiresWaitXCntForSingleAccessInstructions() const { + return GFX1250Insts; + } /// \returns the number of significant bits in the immediate field of the /// S_NOP instruction. diff --git a/llvm/lib/Target/AMDGPU/InstCombineTables.td b/llvm/lib/Target/AMDGPU/InstCombineTables.td deleted file mode 100644 index 3d62641..0000000 --- a/llvm/lib/Target/AMDGPU/InstCombineTables.td +++ /dev/null @@ -1,10 +0,0 @@ -include "AMDGPU.td" - -def AMDGPUImageDMaskIntrinsicTable : GenericTable { - let FilterClass = "AMDGPUImageDMaskIntrinsic"; - let Fields = ["Intr"]; - - let PrimaryKey = ["Intr"]; - let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic"; - let PrimaryKeyEarlyOut = 1; -} diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 703ec0a..b63d71d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -12,6 +12,7 @@ #include "SIDefines.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -336,7 +337,7 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, // \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or // \p Reg itself otherwise. -static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { +static MCRegister getRegForPrinting(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx < 0x100) @@ -355,10 +356,10 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { } // Restore MSBs of a VGPR above 255 from the MCInstrAnalysis. 
-static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo, - const MCInstrDesc &Desc, - const MCRegisterInfo &MRI, - const AMDGPUMCInstrAnalysis &MIA) { +static MCRegister getRegFromMIA(MCRegister Reg, unsigned OpNo, + const MCInstrDesc &Desc, + const MCRegisterInfo &MRI, + const AMDGPUMCInstrAnalysis &MIA) { unsigned VgprMSBs = MIA.getVgprMSBs(); if (!VgprMSBs) return Reg; @@ -403,10 +404,10 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, } #endif - unsigned PrintReg = getRegForPrinting(Reg, MRI); + MCRegister PrintReg = getRegForPrinting(Reg, MRI); O << getRegisterName(PrintReg); - if (PrintReg != Reg.id()) + if (PrintReg != Reg) O << " /*" << getRegisterName(Reg) << "*/"; } @@ -490,6 +491,18 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, printRegularOperand(MI, OpNo, STI, O); } +void AMDGPUInstPrinter::printAVLdSt32Align2RegOp(const MCInst *MI, + unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + MCRegister Reg = MI->getOperand(OpNo).getReg(); + + // On targets with an even alignment requirement + if (MCRegister SubReg = MRI.getSubReg(Reg, AMDGPU::sub0)) + Reg = SubReg; + printRegOperand(Reg, O, MRI); +} + void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -795,14 +808,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int16_t RCID = MII.getOpRegClassID( - OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); - if (RCID != -1) { + if (OpInfo.RegClass != -1) { + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { - O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) - << "\' register class*/"; + bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() && + (OpInfo.RegClass == AMDGPU::SReg_1 || + OpInfo.RegClass == AMDGPU::SReg_1_XEXEC); + // Suppress this comment for a mismatched wavesize. Some users expect to + // be able to assemble and disassemble modules with mixed wavesizes, but + // we do not know the subtarget in different functions in MC. + // + // TODO: Should probably print it anyway, maybe a more specific version. 
+ if (!IsWaveSizeOp) { + O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) + << "\' register class*/"; + } } } } else if (Op.isImm()) { @@ -1331,12 +1354,9 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, return; O << Name; - for (int I = 0; I < NumOps; ++I) { - if (I != 0) - O << ','; - - O << !!(Ops[I] & Mod); - } + ListSeparator Sep(","); + for (int I = 0; I < NumOps; ++I) + O << Sep << !!(Ops[I] & Mod); if (HasDstSel) { O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL); @@ -1574,14 +1594,10 @@ void AMDGPUInstPrinter::printGPRIdxMode(const MCInst *MI, unsigned OpNo, O << formatHex(static_cast<uint64_t>(Val)); } else { O << "gpr_idx("; - bool NeedComma = false; + ListSeparator Sep(","); for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) { - if (Val & (1 << ModeId)) { - if (NeedComma) - O << ','; - O << IdSymbolic[ModeId]; - NeedComma = true; - } + if (Val & (1 << ModeId)) + O << Sep << IdSymbolic[ModeId]; } O << ')'; } @@ -1788,25 +1804,16 @@ void AMDGPUInstPrinter::printSWaitCnt(const MCInst *MI, unsigned OpNo, bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA); bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt; - bool NeedSpace = false; + ListSeparator Sep(" "); - if (!IsDefaultVmcnt || PrintAll) { - O << "vmcnt(" << Vmcnt << ')'; - NeedSpace = true; - } + if (!IsDefaultVmcnt || PrintAll) + O << Sep << "vmcnt(" << Vmcnt << ')'; - if (!IsDefaultExpcnt || PrintAll) { - if (NeedSpace) - O << ' '; - O << "expcnt(" << Expcnt << ')'; - NeedSpace = true; - } + if (!IsDefaultExpcnt || PrintAll) + O << Sep << "expcnt(" << Expcnt << ')'; - if (!IsDefaultLgkmcnt || PrintAll) { - if (NeedSpace) - O << ' '; - O << "lgkmcnt(" << Lgkmcnt << ')'; - } + if (!IsDefaultLgkmcnt || PrintAll) + O << Sep << "lgkmcnt(" << Lgkmcnt << ')'; } void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, @@ -1822,14 +1829,10 @@ void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, StringRef Name; unsigned Val; bool IsDefault; - bool NeedSpace = false; + ListSeparator Sep(" "); while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) { - if (!IsDefault || !HasNonDefaultVal) { - if (NeedSpace) - O << ' '; - O << Name << '(' << Val << ')'; - NeedSpace = true; - } + if (!IsDefault || !HasNonDefaultVal) + O << Sep << Name << '(' << Val << ')'; } } else { O << formatHex(Imm16); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index b27295e..564d6ee 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -77,6 +77,9 @@ private: raw_ostream &O); void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printAVLdSt32Align2RegOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediateBF16(uint32_t Imm, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index c27be02..093c85e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCExpr.h" -#include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/Function.h" 
#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -317,30 +315,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } -/// Mimics GCNSubtarget::computeOccupancy for MCExpr. -/// -/// Remove dependency on GCNSubtarget and depend only only the necessary values -/// for said occupancy computation. Should match computeOccupancy implementation -/// without passing \p STM on. -const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy( - unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, - unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) { - unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); - unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); - unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); - unsigned Generation = STM.getGeneration(); - - auto CreateExpr = [&Ctx](unsigned Value) { - return MCConstantExpr::create(Value, Ctx); - }; - - return create(AGVK_Occupancy, - {CreateExpr(MaxWaves), CreateExpr(Granule), - CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation), - CreateExpr(InitOcc), NumSGPRs, NumVGPRs}, - Ctx); -} - const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, MCContext &Ctx) { assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 246a3f8..bf7b40b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -98,11 +98,6 @@ public: return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - static const AMDGPUMCExpr * - createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, - const GCNSubtarget &STM, MCContext &Ctx); - static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, MCContext &Ctx); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 013cfeb..28b4da8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) - VgprMSBs = Inst.getOperand(0).getImm(); + VgprMSBs = Inst.getOperand(0).getImm() & 0xff; else if (isTerminator(Inst)) VgprMSBs = 0; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 5a08573..0855d6d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -302,9 +302,9 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( #undef PRINT_RES_INFO } -void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR, - const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) { +void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums( + const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, + const MCSymbol *MaxNamedBarrier) { #define PRINT_RES_INFO(ARG) \ OS << "\t.set "; \ ARG->print(OS, getContext().getAsmInfo()); \ @@ -315,6 +315,7 @@ void 
AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR, PRINT_RES_INFO(MaxVGPR); PRINT_RES_INFO(MaxAGPR); PRINT_RES_INFO(MaxSGPR); + PRINT_RES_INFO(MaxNamedBarrier); #undef PRINT_RES_INFO } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 22afcde..3a0d8dc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -69,7 +69,8 @@ public: virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) {}; + const MCSymbol *MaxSGPR, + const MCSymbol *MaxNamedBarrier) {}; /// \returns True on success, false on failure. virtual bool EmitISAVersion() { return true; } @@ -149,7 +150,8 @@ public: const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override; void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) override; + const MCSymbol *MaxSGPR, + const MCSymbol *MaxNamedBarrier) override; /// \returns True on success, false on failure. bool EmitISAVersion() override; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index d950131..65dce74 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); + // Set VADDR4 to NULL + let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + // set to 0 based on SPG. - let vaddr4 = 0; let rsrc = 0; let vdata = 0; let d16 = 0; diff --git a/llvm/lib/Target/AMDGPU/R600.td b/llvm/lib/Target/AMDGPU/R600.td index 9148edb..bdfaac9 100644 --- a/llvm/lib/Target/AMDGPU/R600.td +++ b/llvm/lib/Target/AMDGPU/R600.td @@ -8,15 +8,6 @@ include "llvm/Target/Target.td" -def R600InstrInfo : InstrInfo { - let guessInstructionProperties = 1; -} - -def R600 : Target { - let InstructionSet = R600InstrInfo; - let AllowRegisterRenaming = 1; -} - let Namespace = "R600" in { foreach Index = 0-15 in { @@ -27,6 +18,18 @@ include "R600RegisterInfo.td" } +defm : RemapAllTargetPseudoPointerOperands<R600_Addr>; + +def R600InstrInfo : InstrInfo { + let guessInstructionProperties = 1; +} + +def R600 : Target { + let InstructionSet = R600InstrInfo; + let AllowRegisterRenaming = 1; +} + + def NullALU : InstrItinClass; def ALU_NULL : FuncUnit; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..3c4f115 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -13,6 +13,7 @@ #include "R600ISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUSelectionDAGInfo.h" #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" @@ -29,7 +30,8 @@ using namespace llvm; R600TargetLowering::R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI) - : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { + : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI), + Gen(STI.getGeneration()) { addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); @@ -1129,12 +1131,9 @@ 
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? - SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); @@ -2186,6 +2185,8 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::FSub: case AtomicRMWInst::FMax: case AtomicRMWInst::FMin: + case AtomicRMWInst::USubCond: + case AtomicRMWInst::USubSat: return AtomicExpansionKind::CmpXChg; case AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 3e256cc..7f805e6 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; @@ -176,7 +176,7 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { } bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); return !AMDGPU::isCompute(MF->getFunction().getCallingConv()) && usesVertexCache(MI.getOpcode()); } @@ -186,7 +186,7 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { } bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); return (AMDGPU::isCompute(MF->getFunction().getCallingConv()) && usesVertexCache(MI.getOpcode())) || usesTextureCache(MI.getOpcode()); @@ -948,7 +948,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, .setReg(Pred[2].getReg()); MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W)) .setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getMF(), MI); MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -956,7 +956,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, if (PIdx != -1) { MachineOperand &PMO = MI.getOperand(PIdx); PMO.setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getMF(), MI); MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp index 48b4e7f..ac6508c 100644 --- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp @@ -55,7 +55,7 @@ void R600AsmPrinter::emitInstruction(const MachineInstr *MI) { StringRef Err; if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { - LLVMContext &C = 
MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("Illegal instruction detected: " + Err); MI->print(errs()); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index b7a92a0..0d206ab 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -523,6 +523,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID1 = 23, ID_HW_ID2 = 24, ID_POPS_PACKER = 25, + ID_SCHED_MODE = 26, ID_PERF_SNAPSHOT_DATA_gfx11 = 27, ID_IB_STS2 = 28, ID_SHADER_CYCLES = 29, diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 7793907..39a6a77 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -238,7 +238,7 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, const SIRegisterInfo *TRI, const SIInstrInfo *TII) { - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); auto &Src = MI.getOperand(1); Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = Src.getReg(); @@ -856,8 +856,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { } } - if (TRI->isVectorRegister(*MRI, PHIRes) || - RC0 == &AMDGPU::VReg_1RegClass) { + if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) || + RC0 == &AMDGPU::VReg_1RegClass) { LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); TII->legalizeOperands(MI, MDT); } @@ -902,14 +902,28 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, // really much we can do to fix this. // Some special instructions use M0 as an input. Some even only use // the first lane. Insert a readfirstlane and hope for the best. - if (DstReg == AMDGPU::M0 && - TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) { + const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); + if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) + + const MCInstrDesc &ReadFirstLaneDesc = + TII->get(AMDGPU::V_READFIRSTLANE_B32); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), ReadFirstLaneDesc, TmpReg) .add(MI.getOperand(1)); + + unsigned SubReg = MI.getOperand(1).getSubReg(); MI.getOperand(1).setReg(TmpReg); + MI.getOperand(1).setSubReg(AMDGPU::NoSubRegister); + + const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1); + const TargetRegisterClass *ConstrainRC = + SubReg == AMDGPU::NoSubRegister + ? OpRC + : TRI->getMatchingSuperRegClass(SrcRC, OpRC, SubReg); + + if (!MRI->constrainRegClass(SrcReg, ConstrainRC)) + llvm_unreachable("failed to constrain register"); } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI, MI.getDebugLoc())) { I = std::next(I); @@ -930,7 +944,7 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, // s_mov_b32. 
if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) { MI.getOperand(1).ChangeToImmediate(Imm); - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + MI.addImplicitDefUseOperands(*MI.getMF()); MI.setDesc(TII->get(SMovOp)); return true; } @@ -1122,9 +1136,20 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) .addReg(VReg32); } else if (SrcSize == 32) { - auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); - MIB.addReg(SrcReg, 0, SubReg); + const MCInstrDesc &ReadFirstLaneDesc = + TII->get(AMDGPU::V_READFIRSTLANE_B32); + const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1); + BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg) + .addReg(SrcReg, 0, SubReg); + + const TargetRegisterClass *ConstrainRC = + SubReg == AMDGPU::NoSubRegister + ? OpRC + : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC, + SubReg); + + if (!MRI->constrainRegClass(SrcReg, ConstrainRC)) + llvm_unreachable("failed to constrain register"); } else { auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), DstReg); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 6616b30..2df9267 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -681,6 +681,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { return false; MI->setDesc(TII->get(NewMFMAOpc)); MI->untieRegOperand(0); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned I = 0; I < MI->getNumDefs(); ++I) + if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1) + MI->getOperand(I).setIsEarlyClobber(true); } // TODO: Should we try to avoid adding this to the candidate list? @@ -709,7 +713,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // Verify the register is compatible with the operand. if (const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { + TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) { const TargetRegisterClass *NewRC = TRI->getRegClassForReg(*MRI, New->getReg()); @@ -762,6 +766,29 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList, FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); } +// Returns true if the instruction is a packed F32 instruction and the +// corresponding scalar operand reads 32 bits and replicates the bits to both +// channels. +static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand( + const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) { + if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput()) + return false; + const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo]; + return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; +} + +// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or +// literal) and replicates the bits to both channels. Therefore, if the hi and +// lo are not same, we can't fold it. 
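The helper that follows implements this check with Lo_32/Hi_32. As a standalone illustration of the splat condition (plain C++, with made-up 64-bit literal values), a V2FP32 immediate is only foldable when both halves carry the same bits:

#include <cassert>
#include <cstdint>

// Illustration only: a 64-bit V2FP32 literal can be folded into a packed F32
// scalar operand only when both 32-bit halves are identical, because the
// hardware reads just the low 32 bits and broadcasts them to both channels.
static bool isFoldableV2FP32Splat(uint64_t Imm) {
  uint32_t Lo = static_cast<uint32_t>(Imm);
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32);
  return Lo == Hi;
}

int main() {
  assert(isFoldableV2FP32Splat(0x3F8000003F800000ULL));  // both halves 1.0f
  assert(!isFoldableV2FP32Splat(0x400000003F800000ULL)); // halves differ: no fold
  return 0;
}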
+static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand( + const FoldableDef &OpToFold) { + assert(OpToFold.isImm() && "Expected immediate operand"); + uint64_t ImmVal = OpToFold.getEffectiveImmVal().value(); + uint32_t Lo = Lo_32(ImmVal); + uint32_t Hi = Hi_32(ImmVal); + return Lo == Hi; +} + bool SIFoldOperandsImpl::tryAddToFoldList( SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, const FoldableDef &OpToFold) const { @@ -915,6 +942,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList( return true; } + // Special case for PK_F32 instructions if we are trying to fold an imm to + // src0 or src1. + if (OpToFold.isImm() && + isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) + return false; + appendFoldCandidate(FoldList, MI, OpNo, OpToFold); return true; } @@ -1129,40 +1163,14 @@ bool SIFoldOperandsImpl::tryToFoldACImm( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { + if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) + return false; appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; } - // TODO: Verify the following code handles subregisters correctly. - // TODO: Handle extract of global reference - if (UseOp.getSubReg()) - return false; - - if (!OpToFold.isReg()) - return false; - - Register UseReg = OpToFold.getReg(); - if (!UseReg.isVirtual()) - return false; - - // Maybe it is just a COPY of an immediate itself. - - // FIXME: Remove this handling. There is already special case folding of - // immediate into copy in foldOperand. This is looking for the def of the - // value the folding started from in the first place. - MachineInstr *Def = MRI->getVRegDef(UseReg); - if (Def && TII->isFoldableCopy(*Def)) { - MachineOperand &DefOp = Def->getOperand(1); - if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { - FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC, - OpToFold.DefSubReg); - appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm); - return true; - } - } - return false; } @@ -1309,10 +1317,11 @@ void SIFoldOperandsImpl::foldOperand( continue; const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1; - const TargetRegisterClass *MovSrcRC = - TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx])); - if (MovSrcRC) { + int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]); + if (RegClassID != -1) { + const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID); + if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); @@ -1351,7 +1360,7 @@ void SIFoldOperandsImpl::foldOperand( if (MovOp == AMDGPU::V_MOV_B16_t16_e64) { const auto &SrcOp = UseMI->getOperand(UseOpIdx); MachineOperand NewSrcOp(SrcOp); - MachineFunction *MF = UseMI->getParent()->getParent(); + MachineFunction *MF = UseMI->getMF(); UseMI->removeOperand(1); UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers UseMI->addOperand(NewSrcOp); // src0 @@ -1382,7 +1391,7 @@ void SIFoldOperandsImpl::foldOperand( // Remove this if 16-bit SGPRs (i.e. 
SGPR_LO16) are added to the // VS_16RegClass // - // Excerpt from AMDGPUGenRegisterInfo.inc + // Excerpt from AMDGPUGenRegisterInfoEnums.inc // NoSubRegister, //0 // hi16, // 1 // lo16, // 2 @@ -1558,20 +1567,6 @@ static unsigned getMovOpc(bool IsScalar) { return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } -static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { - MI.setDesc(NewDesc); - - // Remove any leftover implicit operands from mutating the instruction. e.g. - // if we replace an s_and_b32 with a copy, we don't need the implicit scc def - // anymore. - const MCInstrDesc &Desc = MI.getDesc(); - unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + - Desc.implicit_defs().size(); - - for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) - MI.removeOperand(I); -} - std::optional<int64_t> SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const { if (Op.isImm()) @@ -1610,7 +1605,8 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { Opc == AMDGPU::S_NOT_B32) && Src0Imm) { MI->getOperand(1).ChangeToImmediate(~*Src0Imm); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); return true; } @@ -1638,7 +1634,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { // instruction. MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); + TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR))); return true; } @@ -1658,11 +1654,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = or x, 0 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); } else if (Src1Val == -1) { // y = or x, -1 => y = v_mov_b32 -1 MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); } else return false; @@ -1674,11 +1671,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 MI->removeOperand(Src0Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); } else if (Src1Val == -1) { // y = and x, -1 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); } else return false; @@ -1690,7 +1688,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = xor x, 0 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); return true; } } @@ -1736,7 +1734,7 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const { MI.removeOperand(Src1ModIdx); if (Src0ModIdx != -1) MI.removeOperand(Src0ModIdx); - mutateCopyOp(MI, NewDesc); + TII->mutateAndCleanupImplicit(MI, NewDesc); LLVM_DEBUG(dbgs() << MI); return true; } @@ -1804,7 +1802,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, if (CopiesToReplace.empty() && FoldList.empty()) return Changed; - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); // Make sure we add EXEC uses to any 
new v_mov instructions created. for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); @@ -2419,7 +2417,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx); if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a..ec3e720 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -772,6 +772,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, PreloadedScratchRsrcReg, ScratchRsrcReg, ScratchWaveOffsetReg); } + + if (ST.hasWaitXCnt()) { + // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK + // replay. This aligns hardware behavior with the compiler's s_wait_xcnt + // insertion logic, which assumes multi-group mode by default. + unsigned RegEncoding = + AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(1) + .addImm(RegEncoding); + } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` @@ -1833,9 +1844,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, const GCNSubtarget &ST, - std::vector<CalleeSavedInfo> &CSI, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) { + std::vector<CalleeSavedInfo> &CSI) { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &MFI = MF.getFrameInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1904,10 +1913,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, int FrameIdx = MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass), /*isSpillSlot=*/true); - if ((unsigned)FrameIdx < MinCSFrameIndex) - MinCSFrameIndex = FrameIdx; - if ((unsigned)FrameIdx > MaxCSFrameIndex) - MaxCSFrameIndex = FrameIdx; + MFI.setIsCalleeSavedObjectIndex(FrameIdx, true); CSIt->setFrameIdx(FrameIdx); CSIt->setReg(RegBlock); @@ -1917,8 +1923,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, bool SIFrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) const { + std::vector<CalleeSavedInfo> &CSI) const { if (CSI.empty()) return true; // Early exit if no callee saved registers are modified! 
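As context for the S_SETREG_IMM32_B32 emitted for REPLAY_MODE in the emitEntryFunctionPrologue hunk above, here is a sketch of how the hwreg operand is typically packed. It assumes the usual simm16 layout (register id in bits [5:0], bit offset in [10:6], width-1 in [15:11]); the authoritative encoding lives in AMDGPU::Hwreg::HwregEncoding, and the helper below is only illustrative:

#include <cstdint>

// Assumed simm16 hwreg layout: id[5:0], offset[10:6], (width-1)[15:11].
constexpr uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  return static_cast<uint16_t>(Id | (Offset << 6) | ((Width - 1) << 11));
}

// MODE is hwreg id 1; REPLAY_MODE is a 1-bit field at bit 25, so the prologue
// write corresponds roughly to: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 25, 1), 1
static_assert(encodeHwreg(1, 25, 1) == 0x641, "REPLAY_MODE hwreg operand");

int main() { return 0; }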
@@ -1926,12 +1931,12 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots( bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR(); if (UseVGPRBlocks) - assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex); + assignSlotsUsingVGPRBlocks(MF, ST, CSI); - return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks; + return assignCalleeSavedSpillSlotsImpl(MF, TRI, CSI) || UseVGPRBlocks; } -bool SIFrameLowering::assignCalleeSavedSpillSlots( +bool SIFrameLowering::assignCalleeSavedSpillSlotsImpl( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const { if (CSI.empty()) @@ -2170,7 +2175,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index a727729..4c1cf3c 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -49,11 +49,9 @@ public: const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const override; - bool assignCalleeSavedSpillSlots(MachineFunction &MF, - const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) const override; + bool assignCalleeSavedSpillSlotsImpl(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b34ab2a..0f91b31 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPULaneMaskUtils.h" +#include "AMDGPUSelectionDAGInfo.h" #include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -34,6 +35,8 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" @@ -86,69 +89,78 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) { SITargetLowering::SITargetLowering(const TargetMachine &TM, const GCNSubtarget &STI) - : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) { + : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V32RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(32); + addRegisterClass(MVT::f32, V32RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - const SIRegisterInfo *TRI = STI.getRegisterInfo(); - const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + const TargetRegisterClass *V64RegClass = + 
TRI->getDefaultVectorSuperClassForBitWidth(64); addRegisterClass(MVT::f64, V64RegClass); addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160)); addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); + addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288)); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); + addRegisterClass(MVT::v10f32, + TRI->getDefaultVectorSuperClassForBitWidth(320)); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); + addRegisterClass(MVT::v11f32, + TRI->getDefaultVectorSuperClassForBitWidth(352)); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); + addRegisterClass(MVT::v12f32, + TRI->getDefaultVectorSuperClassForBitWidth(384)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, + TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +192,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + 
addRegisterClass(MVT::v32f32, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -1047,6 +1060,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::ATOMIC_LOAD_FMAX, ISD::ATOMIC_LOAD_UINC_WRAP, ISD::ATOMIC_LOAD_UDEC_WRAP, + ISD::ATOMIC_LOAD_USUB_COND, + ISD::ATOMIC_LOAD_USUB_SAT, ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN}); @@ -1297,7 +1312,7 @@ static unsigned getIntrMemWidth(unsigned IntrID) { } } -static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, +static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info) { Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2); unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue(); @@ -1327,7 +1342,7 @@ static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, } bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &CI, + const CallBase &CI, MachineFunction &MF, unsigned IntrID) const { Info.flags = MachineMemOperand::MONone; @@ -1507,15 +1522,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_global_atomic_csub: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | - MachineMemOperand::MOVolatile; - return true; - } case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh_intersect_ray: case Intrinsic::amdgcn_image_bvh8_intersect_ray: { @@ -1536,8 +1542,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_flat_atomic_fmin_num: - case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: { + case Intrinsic::amdgcn_flat_atomic_fmax_num: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1712,7 +1717,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Type *&AccessTy) const { Value *Ptr = nullptr; switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_cluster_load_b128: case Intrinsic::amdgcn_cluster_load_b64: case Intrinsic::amdgcn_cluster_load_b32: @@ -1735,7 +1739,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_flat_load_monitor_b128: case Intrinsic::amdgcn_flat_load_monitor_b32: case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: @@ -2254,6 +2257,14 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { return TargetLowering::isTypeDesirableForOp(Op, VT); } +MachinePointerInfo +SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const { + // This isn't really a constant pool but close enough. 
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool()); + PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS; + return PtrInfo; +} + SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, @@ -2330,7 +2341,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, uint64_t Offset, Align Alignment, bool Signed, const ISD::InputArg *Arg) const { - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + + MachinePointerInfo PtrInfo = + getKernargSegmentPtrInfo(DAG.getMachineFunction()); // Try to avoid using an extload by loading earlier than the argument address, // and extracting the relevant bits. The load should hopefully be merged with @@ -2345,7 +2358,8 @@ SDValue SITargetLowering::lowerKernargMemParameter( // TODO: If we passed in the base kernel offset we could have a better // alignment than 4, but we don't really need it. SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); - SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), + SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, + PtrInfo.getWithOffset(AlignDownOffset), Align(4), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -2360,9 +2374,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); - SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + SDValue Load = DAG.getLoad( + MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment, + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); return DAG.getMergeValues({Val, Load.getValue(1)}, SL); @@ -3562,11 +3576,17 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); - // DAG.getPass() returns nullptr when using new pass manager. - // TODO: Use DAG.getMFAM() to access analysis result. if (DAG.getPass()) { - auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>(); + ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo()); + } else if (auto *MFAM = DAG.getMFAM()) { + Module &M = *MF.getFunction().getParent(); + auto *ArgUsageInfo = + MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF) + .getCachedResult<AMDGPUArgumentUsageAnalysis>(M); + if (ArgUsageInfo) + ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo()); } unsigned StackArgSize = CCInfo.getStackSize(); @@ -3781,12 +3801,19 @@ void SITargetLowering::passSpecialInputs( const AMDGPUFunctionArgInfo *CalleeArgInfo = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { - // DAG.getPass() returns nullptr when using new pass manager. - // TODO: Use DAG.getMFAM() to access analysis result. 
if (DAG.getPass()) { auto &ArgUsageInfo = - DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>(); + CalleeArgInfo = + &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc); + } else if (auto *MFAM = DAG.getMFAM()) { + Module &M = *DAG.getMachineFunction().getFunction().getParent(); + auto *ArgUsageInfo = + MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>( + DAG.getMachineFunction()) + .getCachedResult<AMDGPUArgumentUsageAnalysis>(M); + if (ArgUsageInfo) + CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc); } } @@ -4052,7 +4079,7 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!CI->isTailCall()) return false; - const Function *ParentFn = CI->getParent()->getParent(); + const Function *ParentFn = CI->getFunction(); if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) return false; return true; @@ -5469,6 +5496,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { return std::numeric_limits<uint32_t>::min(); case AMDGPU::S_MAX_I32: return std::numeric_limits<int32_t>::min(); + case AMDGPU::V_ADD_F32_e64: // -0.0 + return 0x80000000; + case AMDGPU::V_SUB_F32_e64: // +0.0 + return 0x0; case AMDGPU::S_ADD_I32: case AMDGPU::S_SUB_I32: case AMDGPU::S_OR_B32: @@ -5476,6 +5507,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { return std::numeric_limits<uint32_t>::min(); case AMDGPU::S_AND_B32: return std::numeric_limits<uint32_t>::max(); + case AMDGPU::V_MIN_F32_e64: + case AMDGPU::V_MAX_F32_e64: + return 0x7fc00000; // qNAN default: llvm_unreachable( "Unexpected opcode in getIdentityValueFor32BitWaveReduction"); @@ -5510,7 +5544,14 @@ static bool is32bitWaveReduceOperation(unsigned Opc) { Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 || Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 || - Opc == AMDGPU::S_XOR_B32; + Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 || + Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 || + Opc == AMDGPU::V_SUB_F32_e64; +} + +static bool isFloatingPointWaveReduceOperation(unsigned Opc) { + return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 || + Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64; } static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, @@ -5531,8 +5572,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, switch (Opc) { case AMDGPU::S_MIN_U32: case AMDGPU::S_MIN_I32: + case AMDGPU::V_MIN_F32_e64: case AMDGPU::S_MAX_U32: case AMDGPU::S_MAX_I32: + case AMDGPU::V_MAX_F32_e64: case AMDGPU::S_AND_B32: case AMDGPU::S_OR_B32: { // Idempotent operations. 
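A quick sanity check of the identity values introduced in getIdentityValueFor32BitWaveReduction above: the seed must satisfy op(identity, x) == x for every lane value, which is why fadd seeds with -0.0 (0x80000000) and fmin/fmax seed with a quiet NaN (0x7fc00000). Illustrative plain C++, assuming IEEE-754 minNum/maxNum-style behaviour, which std::fmin/std::fmax provide:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <initializer_list>

// Compare floats bit-for-bit so signed zeros are distinguished.
static bool sameBits(float A, float B) {
  uint32_t X, Y;
  std::memcpy(&X, &A, sizeof(float));
  std::memcpy(&Y, &B, sizeof(float));
  return X == Y;
}

int main() {
  const float AddIdentity = -0.0f;            // 0x80000000
  const float MinMaxIdentity = std::nanf(""); // a quiet NaN such as 0x7fc00000

  for (float X : {1.5f, -2.0f, 0.0f, -0.0f}) {
    assert(sameBits(AddIdentity + X, X));              // -0.0 + x == x, sign preserved
    assert(sameBits(std::fmin(MinMaxIdentity, X), X)); // NaN seed is ignored
    assert(sameBits(std::fmax(MinMaxIdentity, X), X)); // NaN seed is ignored
  }
  return 0;
}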
@@ -5555,8 +5598,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, case AMDGPU::S_XOR_B64: case AMDGPU::S_ADD_I32: case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::V_ADD_F32_e64: case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U64_PSEUDO: { + case AMDGPU::S_SUB_U64_PSEUDO: + case AMDGPU::V_SUB_F32_e64: { const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); @@ -5711,6 +5756,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addImm(AMDGPU::sub1); break; } + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_SUB_F32_e64: { + Register ActiveLanesVreg = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + // Get number of active lanes as a float val. + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64), + ActiveLanesVreg) + .addReg(NewAccumulator->getOperand(0).getReg()) + .addImm(0) // clamp + .addImm(0); // output-modifier + + // Take negation of input for SUB reduction + unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0; + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg) + .addImm(srcMod) // src0 modifier + .addReg(SrcReg) + .addImm(0) // src1 modifier + .addReg(ActiveLanesVreg) + .addImm(0) // clamp + .addImm(0); // output-mod + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } } RetBB = &BB; } @@ -5728,6 +5797,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, MachineBasicBlock::iterator I = BB.end(); Register SrcReg = MI.getOperand(1).getReg(); bool is32BitOpc = is32bitWaveReduceOperation(Opc); + bool isFPOp = isFloatingPointWaveReduceOperation(Opc); // Create Control flow for loop // Split MI's Machine Basic block into For loop @@ -5787,9 +5857,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, LaneValueReg) .addReg(SrcReg) .addReg(FF1Reg); - NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) - .addReg(Accumulator->getOperand(0).getReg()) - .addReg(LaneValueReg); + if (isFPOp) { + Register LaneValVreg = + MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); + Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); + // Get the Lane Value in VGPR to avoid the Constant Bus Restriction + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), + LaneValVreg) + .addReg(LaneValueReg); + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) + .addImm(0) // src0 modifier + .addReg(Accumulator->getOperand(0).getReg()) + .addImm(0) // src1 modifier + .addReg(LaneValVreg) + .addImm(0) // clamp + .addImm(0); // omod + NewAccumulator = BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } else { + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValueReg); + } } else { Register LaneValueLoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -5921,6 +6011,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32); case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64); + case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: return lowerWaveReduce(MI, 
*BB, *getSubtarget(), AMDGPU::S_MAX_U32); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64: @@ -5929,14 +6021,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32); case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64); + case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64: @@ -6347,8 +6445,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::DS_GWS_INIT: case AMDGPU::DS_GWS_SEMA_BR: case AMDGPU::DS_GWS_BARRIER: - TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); - [[fallthrough]]; case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: @@ -7035,9 +7131,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SDLoc SL(N); if (Src.getOpcode() == ISD::SETCC) { + SDValue Op0 = Src.getOperand(0); + SDValue Op1 = Src.getOperand(1); + // Need to expand bfloat to float for comparison (setcc). + if (Op0.getValueType() == MVT::bf16) { + Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0); + Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1); + } // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), - Src.getOperand(1), Src.getOperand(2)); + return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2)); } if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { // (ballot 0) -> 0 @@ -8057,10 +8159,11 @@ SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, MachineFunction &MF = DAG.getMachineFunction(); uint64_t Offset = getImplicitParameterOffset(MF, Param); SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); - return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment, - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachinePointerInfo PtrInfo = + getKernargSegmentPtrInfo(DAG.getMachineFunction()); + return DAG.getLoad( + VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment, + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op, @@ -8322,6 +8425,9 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, Op.getValueType() == MVT::i64) { const SIMachineFunctionInfo *Info = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); + if (Info->get32BitAddressHighBits() == 0) + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src); + SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi); return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); @@ -9731,7 +9837,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } case Intrinsic::amdgcn_kernarg_segment_ptr: { - if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { + if (!AMDGPU::isKernel(MF.getFunction())) { // This only makes sense to call in a kernel, so just lower to null. 
return DAG.getConstant(0, DL, VT); } @@ -10477,9 +10583,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); - case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: - return lowerRawBufferAtomicIntrin(Op, DAG, - AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_swap: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: return lowerStructBufferAtomicIntrin(Op, DAG, @@ -10521,10 +10624,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_dec: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB); + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_CSUB); + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); - case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); @@ -11892,7 +12006,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() && - isMemOpHasNoClobberedMemOperand(Load))) { + (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) { if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) && Alignment >= Align(4) && NumElements < 32) { if (MemVT.isPow2VectorType() || @@ -13930,6 +14044,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { assert(OtherOp.getValueSizeInBits() == 32); } + // Check that we haven't just recreated the same FSHR node. + if (N->getOpcode() == ISD::FSHR && + (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) && + (N->getOperand(1) == Op || N->getOperand(1) == OtherOp)) + return SDValue(); + if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { assert(Op.getValueType().isByteSized() && @@ -17361,12 +17481,14 @@ void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { // Abandon attempt if the dst size isn't large enough // - this is in fact an error but this is picked up elsewhere and // reported correctly. 
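A plain C++ sketch of the address widening that lowerADDRSPACECAST (earlier in this file's hunks) performs for 32-bit to 64-bit casts; the high-bits value below is made up. When the recorded high half is zero, the build_vector/bitcast pair collapses to a plain zero-extend, which is exactly the new early return:

#include <cassert>
#include <cstdint>

// Illustration: a 32-bit address is widened by pairing it with the
// function-wide high 32 bits; a zero high half degenerates to a zext.
static uint64_t widenAddress(uint32_t Lo, uint32_t HighBits) {
  return (static_cast<uint64_t>(HighBits) << 32) | Lo;
}

int main() {
  assert(widenAddress(0x1000u, 0u) == 0x1000u);            // zext fast path
  assert(widenAddress(0x1000u, 0x7Fu) == 0x7F00001000ULL); // general case
  return 0;
}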
- uint32_t DstSize = - TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + + uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32; if (DstSize < InitIdx) return; } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { - InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + InitIdx = TRI.getRegSizeInBits(*DstRC) / 32; } else { return; } @@ -17414,7 +17536,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); if (TII->isVOP3(MI.getOpcode())) { @@ -17550,6 +17672,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, break; case 'v': switch (BitWidth) { + case 1: + return std::pair(0U, nullptr); case 16: RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32_Lo256RegClass; @@ -17567,6 +17691,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, if (!Subtarget->hasMAIInsts()) break; switch (BitWidth) { + case 1: + return std::pair(0U, nullptr); case 16: RC = &AMDGPU::AGPR_32RegClass; break; @@ -18252,7 +18378,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case AMDGPUISD::BUFFER_ATOMIC_INC: case AMDGPUISD::BUFFER_ATOMIC_DEC: case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: - case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: @@ -18487,7 +18612,19 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: case AtomicRMWInst::UIncWrap: - case AtomicRMWInst::UDecWrap: { + case AtomicRMWInst::UDecWrap: + case AtomicRMWInst::USubCond: + case AtomicRMWInst::USubSat: { + if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts()) + return AtomicExpansionKind::CmpXChg; + if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts()) + return AtomicExpansionKind::CmpXChg; + if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) { + auto *IT = dyn_cast<IntegerType>(RMW->getType()); + if (!IT || IT->getBitWidth() != 32) + return AtomicExpansionKind::CmpXChg; + } + if (AMDGPU::isFlatGlobalAddrSpace(AS) || AS == AMDGPUAS::BUFFER_FAT_POINTER) { if (Subtarget->hasEmulatedSystemScopeAtomics()) @@ -18752,8 +18889,11 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { : &AMDGPU::SReg_32RegClass; if (!TRI->isSGPRClass(RC) && !isDivergent) return TRI->getEquivalentSGPRClass(RC); - if (TRI->isSGPRClass(RC) && isDivergent) + if (TRI->isSGPRClass(RC) && isDivergent) { + if (Subtarget->hasGFX90AInsts()) + return TRI->getEquivalentAVClass(RC); return TRI->getEquivalentVGPRClass(RC); + } return RC; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 74e58f4..fb16294 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -45,6 +45,8 @@ public: LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override; + MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const; + private: SDValue lowerKernArgParameterPtr(SelectionDAG 
&DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; @@ -332,7 +334,7 @@ public: MVT getPointerTy(const DataLayout &DL, unsigned AS) const override; MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override; - bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, + bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6dcbced..146f360 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -345,9 +345,7 @@ public: class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { public: - WaitcntGeneratorPreGFX12() = default; - WaitcntGeneratorPreGFX12(const MachineFunction &MF) - : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {} + using WaitcntGenerator::WaitcntGenerator; bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -381,10 +379,7 @@ public: class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { public: - WaitcntGeneratorGFX12Plus() = default; - WaitcntGeneratorGFX12Plus(const MachineFunction &MF, - InstCounterType MaxCounter) - : WaitcntGenerator(MF, MaxCounter) {} + using WaitcntGenerator::WaitcntGenerator; bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -531,6 +526,7 @@ public: // instruction. WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { switch (Inst.getOpcode()) { + // FIXME: GLOBAL_INV needs to be tracked with xcnt too. case AMDGPU::GLOBAL_INV: return VMEM_READ_ACCESS; // tracked using loadcnt case AMDGPU::GLOBAL_WB: @@ -551,9 +547,7 @@ public: return VMEM_ACCESS; if (Inst.mayStore() && (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { - // FLAT and SCRATCH instructions may access scratch. Other VMEM - // instructions do not. - if (TII->mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratch(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } @@ -633,8 +627,11 @@ public: const MachineOperand &Op) const; bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait); void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait); + bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait); + void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait); void determineWait(InstCounterType T, RegInterval Interval, AMDGPU::Waitcnt &Wait) const; @@ -646,7 +643,6 @@ public: void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); void updateByEvent(WaitEventType E, MachineInstr &MI); unsigned hasPendingEvent() const { return PendingEvents; } @@ -921,6 +917,7 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes( void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); + assert(T < Context->MaxCounter); unsigned UB = getScoreUB(T); unsigned CurrScore = UB + 1; @@ -1085,13 +1082,17 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } } } - if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1) + if (Slot) break; + // The slot may not be valid because it can be >= NUM_LDS_VGPRS which + // means the scoreboard cannot track it. 
We still want to preserve the + // MI in order to check alias information, though. LDSDMAStores.push_back(&Inst); Slot = LDSDMAStores.size(); break; } - setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); + if (Slot < NUM_LDS_VGPRS) + setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); if (Slot) setRegScore(FIRST_LDS_VGPR, T, CurrScore); } @@ -1113,33 +1114,33 @@ void WaitcntBrackets::print(raw_ostream &OS) const { switch (T) { case LOAD_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case DS_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case EXP_CNT: - OS << " EXP_CNT(" << SR << "): "; + OS << " EXP_CNT(" << SR << "):"; break; case STORE_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case SAMPLE_CNT: - OS << " SAMPLE_CNT(" << SR << "): "; + OS << " SAMPLE_CNT(" << SR << "):"; break; case BVH_CNT: - OS << " BVH_CNT(" << SR << "): "; + OS << " BVH_CNT(" << SR << "):"; break; case KM_CNT: - OS << " KM_CNT(" << SR << "): "; + OS << " KM_CNT(" << SR << "):"; break; case X_CNT: - OS << " X_CNT(" << SR << "): "; + OS << " X_CNT(" << SR << "):"; break; default: - OS << " UNKNOWN(" << SR << "): "; + OS << " UNKNOWN(" << SR << "):"; break; } @@ -1153,9 +1154,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const { continue; unsigned RelScore = RegScore - LB - 1; if (J < FIRST_LDS_VGPR) { - OS << RelScore << ":v" << J << " "; + OS << ' ' << RelScore << ":v" << J; } else { - OS << RelScore << ":ds "; + OS << ' ' << RelScore << ":ds"; } } // Also need to print sgpr scores for lgkm_cnt or xcnt. @@ -1165,11 +1166,11 @@ void WaitcntBrackets::print(raw_ostream &OS) const { if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - OS << RelScore << ":s" << J << " "; + OS << ' ' << RelScore << ":s" << J; } } if (T == KM_CNT && SCCScore > 0) - OS << SCCScore << ":scc "; + OS << ' ' << SCCScore << ":scc"; } OS << '\n'; } @@ -1192,7 +1193,7 @@ void WaitcntBrackets::print(raw_ostream &OS) const { /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. -void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { +void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) { simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); simplifyWaitcnt(DS_CNT, Wait.DsCnt); @@ -1200,7 +1201,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); simplifyWaitcnt(KM_CNT, Wait.KmCnt); - simplifyWaitcnt(X_CNT, Wait.XCnt); + simplifyXcnt(Wait, Wait); } void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -1270,7 +1271,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); applyWaitcnt(BVH_CNT, Wait.BvhCnt); applyWaitcnt(KM_CNT, Wait.KmCnt); - applyXcnt(Wait); + applyWaitcnt(X_CNT, Wait.XCnt); } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -1287,21 +1288,42 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { } } -void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { +bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) { // Wait on XCNT is redundant if we are already waiting for a load to complete. 
// SMEM can return out of order, so only omit XCNT wait if we are waiting till // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) - return applyWaitcnt(X_CNT, 0); + return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP); +} +bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) { // If we have pending store we cannot optimize XCnt because we do not wait for // stores. VMEM loads return in order, so if we only have loads XCnt is // decremented to the same number as LOADCnt. - if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) - return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && + !hasPendingEvent(STORE_CNT); +} - applyWaitcnt(X_CNT, Wait.XCnt); +void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) { + // Try to simplify xcnt further by checking for joint kmcnt and loadcnt + // optimizations. On entry to a block with multiple predecessors, there may + // be pending SMEM and VMEM events active at the same time. + // In such cases, only clear one active event at a time. + // TODO: Revisit xcnt optimizations for gfx1250. + if (hasRedundantXCntWithKmCnt(CheckWait)) { + if (!hasMixedPendingEvents(X_CNT)) { + applyWaitcnt(X_CNT, 0); + } else { + PendingEvents &= ~(1 << SMEM_GROUP); + } + } else if (canOptimizeXCntWithLoadCnt(CheckWait)) { + if (!hasMixedPendingEvents(X_CNT)) { + applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt)); + } else if (CheckWait.LoadCnt == 0) { + PendingEvents &= ~(1 << VMEM_GROUP); + } + } + simplifyWaitcnt(X_CNT, UpdateWait.XCnt); } // Where there are multiple types of event in the bracket of a counter, @@ -1518,7 +1540,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; dbgs() << "New Instr: " << *SWaitInst << '\n'); } @@ -1532,7 +1554,7 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( .addImm(Wait.StoreCnt); Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; dbgs() << "New Instr: " << *SWaitInst << '\n'); } @@ -1636,6 +1658,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( } } + // Save the pre-combine waitcnt in order to make xcnt checks. + AMDGPU::Waitcnt PreCombine = Wait; if (CombinedLoadDsCntInstr) { // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need // to be waited for. Otherwise, let the instruction be deleted so @@ -1726,6 +1750,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( } for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) || + (CT == LOAD_CNT && + ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) { + // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT + // due to taking the backedge of a block.
+ ScoreBrackets.simplifyXcnt(PreCombine, Wait); + } if (!WaitInstrs[CT]) continue; @@ -1790,7 +1821,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( if (SWaitInst) { Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n"; if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; dbgs() << "New Instr: " << *SWaitInst << '\n'); } @@ -1810,7 +1841,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n"; if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; dbgs() << "New Instr: " << *SWaitInst << '\n'); } @@ -1979,15 +2010,23 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { - if (MI.mayAlias(AA, *LDSDMAStores[I], true)) + if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { + if ((I + 1) >= NUM_LDS_VGPRS) { + // We didn't have enough slot to track this LDS DMA store, it + // has been tracked using the common RegNo (FIRST_LDS_VGPR). + ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + break; + } + ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); + } } } else { ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); } - if (Memop->isStore()) { + + if (Memop->isStore()) ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); - } } // Loop over use and def operands. @@ -2072,6 +2111,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Verify that the wait is actually needed. ScoreBrackets.simplifyWaitcnt(Wait); + // Since the translation for VMEM addresses occur in-order, we can apply the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (Wait.XCnt != ~0u && isVmemAccess(MI)) { + ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt); + Wait.XCnt = ~0u; + } + // When forcing emit, we need to skip terminators because that would break the // terminators of the MBB if we emit a waitcnt between terminators. if (ForceEmitZeroFlag && !MI.isTerminator()) @@ -2140,21 +2187,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, << "Update Instr: " << *It); } - // XCnt may be already consumed by a load wait. - if (Wait.XCnt != ~0u) { - if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; - - if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; - - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory - // dependency with another VMEM instruction in flight. - if (isVmemAccess(*It)) - Wait.XCnt = ~0u; - } - if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; @@ -2265,10 +2297,12 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } - // This is a flat memory operation that access both VMEM and LDS, so note it - // - it will require that both the VM and LGKM be flushed to zero if it is - // pending when a VM or LGKM dependency occurs. - if (FlatASCount > 1) + // Async/LDSDMA operations have FLAT encoding but do not actually use flat + // pointers. They do have two operands that each access global and LDS, thus + // making it appear at this point that they are using a flat pointer. 
Filter + // them out, and for the rest, generate a dependency on flat pointers so + // that both VM and LGKM counters are flushed. + if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1) ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { @@ -2720,7 +2754,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { WCG = &WCGGFX12Plus; } else { MaxCounter = NUM_NORMAL_INST_CNTS; - WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF); + WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter); WCG = &WCGPreGFX12; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d930a21..6d21109 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP, + AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { SchedModel.init(&ST); } @@ -161,7 +162,7 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const { if (!DstReg.isVirtual()) return true; - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) { switch (Use.getOpcode()) { case AMDGPU::S_AND_SAVEEXEC_B32: @@ -1667,8 +1668,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); @@ -1680,7 +1680,7 @@ void SIInstrInfo::storeRegToStackSlot( MachineMemOperand *MMO = MF->getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), FrameInfo.getObjectAlign(FrameIndex)); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { @@ -1862,14 +1862,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -1964,6 +1963,10 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB); MF->push_back(TrapBB); MBB.addSuccessor(TrapBB); + } else { + // Since we're adding HaltLoopBB and modifying the CFG, we must return a + // different block to signal the change. 
+ ContBB = HaltLoopBB; } // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this @@ -2518,8 +2521,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &RI) const { + unsigned SubIdx, + const MachineInstr &Orig) const { // Try shrinking the instruction to remat only the part needed for current // context. @@ -2569,7 +2572,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, const MCInstrDesc &TID = get(NewOpcode); const TargetRegisterClass *NewRC = - RI.getAllocatableClass(getRegClass(TID, 0, &RI)); + RI.getAllocatableClass(getRegClass(TID, 0)); MRI.setRegClass(DestReg, NewRC); UseMO->setReg(DestReg); @@ -2599,7 +2602,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, break; } - TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig); } std::pair<MachineInstr*, MachineInstr*> @@ -2935,7 +2938,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() { if (FlushSGPRWrites) BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST)); }; // We need to compute the offset relative to the instruction immediately after @@ -3461,6 +3464,21 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const { } } +void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI, + const MCInstrDesc &NewDesc) const { + MI.setDesc(NewDesc); + + // Remove any leftover implicit operands from mutating the instruction. e.g. + // if we replace an s_and_b32 with a copy, we don't need the implicit scc def + // anymore. + const MCInstrDesc &Desc = MI.getDesc(); + unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + + Desc.implicit_defs().size(); + + for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) + MI.removeOperand(I); +} + std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm, unsigned SubRegIndex) { switch (SubRegIndex) { @@ -3612,7 +3630,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) { const MCInstrDesc &MovDesc = get(MovOp); - const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI); + const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0); if (Is16Bit) { // We just need to find a correctly sized register class, so the // subregister index compatibility doesn't matter since we're statically @@ -3917,6 +3935,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isLDSDMA(MIa) || isLDSDMA(MIb)) return false; + if (MIa.isBundle() || MIb.isBundle()) + return false; + // TODO: Should we check the address space from the MachineMemOperand? 
That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -3982,7 +4003,7 @@ static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, MachineInstr **DefMI = nullptr) { if (!MO->isReg()) return false; - const MachineFunction *MF = MO->getParent()->getParent()->getParent(); + const MachineFunction *MF = MO->getParent()->getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); } @@ -4044,10 +4065,29 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { MachineBasicBlock &MBB = *MI.getParent(); + MachineInstr *CandidateMI = &MI; + + if (MI.isBundle()) { + // This is a temporary placeholder for bundle handling that enables us to + // exercise the relevant code paths in the two-address instruction pass. + if (MI.getBundleSize() != 1) + return nullptr; + CandidateMI = MI.getNextNode(); + } + ThreeAddressUpdates U; - MachineInstr *NewMI = convertToThreeAddressImpl(MI, U); + MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U); + if (!NewMI) + return nullptr; + + if (MI.isBundle()) { + CandidateMI->eraseFromBundle(); - if (NewMI) { + for (MachineOperand &MO : MI.all_defs()) { + if (MO.isTied()) + MI.untieRegOperand(MO.getOperandNo()); + } + } else { updateLiveVariables(LV, MI, *NewMI); if (LIS) { LIS->ReplaceMachineInstrInMaps(MI, *NewMI); @@ -4088,7 +4128,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LV->getVarInfo(DefReg).AliveBlocks.clear(); } - if (LIS) { + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MO : MI.all_uses()) { + if (MO.isReg() && MO.getReg() == DefReg) { + assert(MO.getSubReg() == 0 && + "tied sub-registers in bundles currently not supported"); + MI.removeOperand(MO.getOperandNo()); + break; + } + } + + if (LIS) + LIS->shrinkToUses(&LIS->getInterval(DefReg)); + } + } else if (LIS) { LiveInterval &DefLI = LIS->getInterval(DefReg); // We cannot delete the original instruction here, so hack out the use @@ -4103,11 +4158,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, } } + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + } + + MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false, + false, /*isUndef=*/true)); + } + LIS->shrinkToUses(&DefLI); } } - return NewMI; + return MI.isBundle() ? 
&MI : NewMI; } MachineInstr * @@ -4121,7 +4191,7 @@ SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI, if (NewMFMAOpc != -1) { MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) MIB.add(MI.getOperand(I)); return MIB; } @@ -4130,7 +4200,7 @@ SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI, unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .setMIFlags(MI.getFlags()); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) MIB->addOperand(MI.getOperand(I)); return MIB; } @@ -4329,8 +4399,9 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } -bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { - if (!isFLAT(MI) || isFLATGlobal(MI)) +bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const { + // Instructions that access scratch use FLAT encoding or BUF encodings. + if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI)) return false; // If scratch is not initialized, we can never access it. @@ -4948,7 +5019,7 @@ bool SIInstrInfo::verifyCopy(const MachineInstr &MI, bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); // FIXME: At this point the COPY verify is done only for non-ssa forms. @@ -5452,9 +5523,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, Desc.getNumOperands() + Desc.implicit_uses().size(); const unsigned NumImplicitOps = IsDst ? 2 : 1; - // Allow additional implicit operands. This allows a fixup done by the post - // RA scheduler where the main implicit operand is killed and implicit-defs - // are added for sub-registers that remain live after this instruction. + // Require additional implicit operands. This allows a fixup done by the + // post RA scheduler where the main implicit operand is killed and + // implicit-defs are added for sub-registers that remain live after this + // instruction. if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { ErrInfo = "missing implicit register operands"; return false; @@ -5736,6 +5808,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) && + MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) { + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst); + if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) == + &AMDGPU::SReg_64RegClass) || + Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) { + ErrInfo = "Instruction cannot read flat_scratch_base_hi"; + return false; + } + } + return true; } @@ -5754,7 +5837,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; case AMDGPU::S_MOV_B32: { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? 
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; @@ -6021,19 +6104,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); } -// FIXME: This should not be an overridable function. All subtarget dependent -// operand modifications should go through isLookupRegClassByHwMode in the -// generic handling. -const TargetRegisterClass * -SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - if (OpNum >= TID.getNumOperands()) - return nullptr; - const MCOperandInfo &OpInfo = TID.operands()[OpNum]; - int16_t RegClass = getOpRegClassID(OpInfo); - return RI.getRegClass(RegClass); -} - const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MCInstrDesc &Desc = get(MI.getOpcode()); @@ -6042,14 +6112,14 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, Register Reg = MI.getOperand(OpNo).getReg(); if (Reg.isVirtual()) { - const MachineRegisterInfo &MRI = - MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); return MRI.getRegClass(Reg); } return RI.getPhysRegBaseClass(Reg); } - return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo])); + int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]); + return RegClass < 0 ? nullptr : RI.getRegClass(RegClass); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6133,7 +6203,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC = MRI.getRegClass(Reg); if (MO.getSubReg()) { - const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + const MachineFunction *MF = MO.getParent()->getMF(); const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); if (!SuperRC) return false; @@ -6145,7 +6215,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &MO) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx]; unsigned Opc = MI.getOpcode(); @@ -6153,7 +6223,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, // information. 
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { - constexpr const AMDGPU::OpName OpNames[] = { + constexpr AMDGPU::OpName OpNames[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; for (auto [I, OpName] : enumerate(OpNames)) { @@ -6198,6 +6268,18 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && RI.isSGPRReg(MRI, MO.getReg())) return false; + + if (ST.hasFlatScratchHiInB64InstHazard() && + MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) { + if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) { + if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) == + 64) + return false; + } + if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64) + return false; + } + return true; } @@ -6215,8 +6297,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO) const { - constexpr const unsigned NumOps = 3; - constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + constexpr unsigned NumOps = 3; + constexpr AMDGPU::OpName OpNames[NumOps * 2] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; @@ -6247,7 +6329,7 @@ bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; @@ -6801,7 +6883,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, return; const TargetRegisterClass *DeclaredRC = - getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI); + getRegClass(MI.getDesc(), SAddr->getOperandNo()); Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC); SAddr->setReg(ToSGPR); @@ -7143,7 +7225,7 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { MachineBasicBlock * SIInstrInfo::legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT) const { - MachineFunction &MF = *MI.getParent()->getParent(); + MachineFunction &MF = *MI.getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock *CreatedBB = nullptr; @@ -7632,6 +7714,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); + const DebugLoc &DL = Inst.getDebugLoc(); + // Handle some special cases switch (Opcode) { default: @@ -7783,6 +7867,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, Inst.eraseFromParent(); return; + case AMDGPU::S_ABSDIFF_I32: + lowerScalarAbsDiff(Worklist, Inst); + Inst.eraseFromParent(); + return; + case AMDGPU::S_CBRANCH_SCC0: case AMDGPU::S_CBRANCH_SCC1: { // Clear unused bits of vcc @@ -7869,7 +7958,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest0 = 
Inst.getOperand(0); MachineOperand &Dest1 = Inst.getOperand(1); MachineOperand &Src0 = Inst.getOperand(2); @@ -7889,12 +7977,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperands(*NewInstr, MDT); MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); Inst.eraseFromParent(); } return; + case AMDGPU::S_LSHL1_ADD_U32: + case AMDGPU::S_LSHL2_ADD_U32: + case AMDGPU::S_LSHL3_ADD_U32: + case AMDGPU::S_LSHL4_ADD_U32: { + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 + : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 + : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3 + : 4); + + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = + BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg) + .add(Src0) + .addImm(ShiftAmt) + .add(Src1); + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest.getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; case AMDGPU::S_CSELECT_B32: case AMDGPU::S_CSELECT_B64: lowerSelect(Worklist, Inst, MDT); @@ -7945,7 +8058,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7985,13 +8098,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; } case AMDGPU::S_CVT_HI_F32_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.useRealTrue16Insts()) { @@ -8021,7 +8133,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F32: case AMDGPU::S_MAXIMUM_F32: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers @@ -8039,7 +8150,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F16: case AMDGPU::S_MAXIMUM_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8063,7 +8173,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::V_S_RCP_F16_e64: case AMDGPU::V_S_RSQ_F16_e64: case AMDGPU::V_S_SQRT_F16_e64: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? 
&AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8112,26 +8221,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; } - if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && - NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { - // Instead of creating a copy where src and dst are the same register - // class, we just replace all uses of dst with src. These kinds of - // copies interfere with the heuristics MachineSink uses to decide - // whether or not to split a critical edge. Since the pass assumes - // that copies will end up as machine instructions and not be - // eliminated. - addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) { Register NewDstReg = Inst.getOperand(1).getReg(); - MRI.replaceRegWith(DstReg, NewDstReg); - MRI.clearKillFlags(NewDstReg); - Inst.getOperand(0).setReg(DstReg); - Inst.eraseFromParent(); - // Legalize t16 operand since replaceReg is called after addUsersToVALU - for (MachineOperand &MO : - make_early_inc_range(MRI.use_operands(NewDstReg))) { - legalizeOperandsVALUt16(*MO.getParent(), MRI); + const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg); + if (const TargetRegisterClass *CommonRC = + RI.getCommonSubClass(NewDstRC, SrcRC)) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, NewDstReg); + MRI.clearKillFlags(NewDstReg); + Inst.getOperand(0).setReg(DstReg); + + if (!MRI.constrainRegClass(NewDstReg, CommonRC)) + llvm_unreachable("failed to constrain register"); + + Inst.eraseFromParent(); + // Legalize t16 operand since replaceReg is called after addUsersToVALU + for (MachineOperand &MO : + make_early_inc_range(MRI.use_operands(NewDstReg))) { + legalizeOperandsVALUt16(*MO.getParent(), MRI); + } + + return; } - return; } // If this is a v2s copy between 16bit and 32bit reg, @@ -8183,7 +8300,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -8412,6 +8529,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } +void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist, + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src1 = Inst.getOperand(1); + MachineOperand &Src2 = Inst.getOperand(2); + Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + unsigned SubOp = + ST.hasAddNoCarry() ? 
AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; + + BuildMI(MBB, MII, DL, get(SubOp), SubResultReg) + .addReg(Src1.getReg()) + .addReg(Src2.getReg()); + + BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(SubResultReg) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -9199,7 +9347,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { @@ -9217,7 +9365,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false); if (SCCIdx != -1) { if (MI.isCopy()) { - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); Register DestReg = MI.getOperand(0).getReg(); MRI.replaceRegWith(DestReg, NewCond); @@ -9329,7 +9477,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, return SGPRReg; Register UsedSGPRs[3] = {Register()}; - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { int Idx = OpIndices[i]; @@ -9579,7 +9727,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return getInstBundleSize(MI); case TargetOpcode::INLINEASM: case TargetOpcode::INLINEASM_BR: { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); const char *AsmStr = MI.getOperand(0).getSymbolName(); return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); } @@ -9714,7 +9862,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, // needed by the prolog. However, the insertions for scalar registers can // always be placed at the BB top as they are independent of the exec mask // value. - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); bool IsNullOrVectorRegister = true; if (Reg) { const MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -10160,7 +10308,7 @@ static bool followSubRegDef(MachineInstr &MI, } MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { assert(MRI.isSSA()); if (!P.Reg.isVirtual()) return nullptr; @@ -10501,7 +10649,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::Default; } - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo(); // FIXME: It's conceptually broken to report this for an instruction, and not @@ -10618,6 +10766,44 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +// SCC is already valid after SCCValid. +// SCCRedefine will redefine SCC to the same value already available after +// SCCValid. 
If there are no intervening SCC conflicts delete SCCRedefine and +// update kill/dead flags if necessary. +static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + const SIRegisterInfo &RI) { + MachineInstr *KillsSCC = nullptr; + if (SCCValid->getParent() != SCCRedefine->getParent()) + return false; + for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), + SCCRedefine->getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + if (MachineOperand *SccDef = + SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + SCCRedefine->eraseFromParent(); + return true; +} + +static bool foldableSelect(const MachineInstr &Def) { + if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 && + Def.getOpcode() != AMDGPU::S_CSELECT_B64) + return false; + bool Op1IsNonZeroImm = + Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0; + if (!Op1IsNonZeroImm || !Op2IsZeroImm) + return false; + return true; +} + bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, @@ -10633,23 +10819,10 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (CmpValue != 0) return false; - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; - const auto foldableSelect = [](MachineInstr *Def) -> bool { - if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || - Def->getOpcode() == AMDGPU::S_CSELECT_B64) { - bool Op1IsNonZeroImm = - Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; - bool Op2IsZeroImm = - Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; - if (Op1IsNonZeroImm && Op2IsZeroImm) - return true; - } - return false; - }; - // For S_OP that set SCC = DST!=0, do the transformation // // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) @@ -10660,24 +10833,38 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero // imm), 0) - if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def)) + if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; - if (MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + // If s_or_b32 result, sY, is unused (i.e. 
it is effectively a 64-bit + // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a + // 64-bit foldableSelect then delete s_or_b32 in the sequence: + // sX = s_cselect_b64 (non-zero imm), 0 + // sLo = copy sX.sub0 + // sHi = copy sX.sub1 + // sY = s_or_b32 sLo, sHi + if (Def->getOpcode() == AMDGPU::S_OR_B32 && + MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { + const MachineOperand &OrOpnd1 = Def->getOperand(1); + const MachineOperand &OrOpnd2 = Def->getOperand(2); + if (OrOpnd1.isReg() && OrOpnd2.isReg()) { + MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg()); + MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg()); + if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 && + Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && + Def2->getOperand(1).isReg() && + Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && + Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && + Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { + MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); + if (Select && foldableSelect(*Select)) + optimizeSCC(Select, Def, RI); + } + } + } return true; }; @@ -10707,8 +10894,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; if (Def->getOpcode() != AMDGPU::S_AND_B32 && @@ -10755,21 +10942,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } - - MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; if (!MRI->use_nodbg_empty(DefReg)) { assert(!IsReversedCC); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21..b1d6563 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -136,6 +136,8 @@ private: void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const; void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, @@ -172,7 +174,7 @@ private: void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; @@ -307,22 +309,19 @@ public: void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int 
FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp // instructions. Returns a pair of generated instructions. @@ -426,6 +425,9 @@ public: void removeModOperands(MachineInstr &MI) const; + void mutateAndCleanupImplicit(MachineInstr &MI, + const MCInstrDesc &NewDesc) const; + /// Return the extracted immediate value in a subregister use from a constant /// materialized in a super register. /// @@ -583,6 +585,10 @@ public: return get(Opcode).TSFlags & SIInstrFlags::MTBUF; } + static bool isBUF(const MachineInstr &MI) { + return isMUBUF(MI) || isMTBUF(MI); + } + static bool isSMRD(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SMRD; } @@ -688,11 +694,11 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } - /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with - /// SCRATCH_ memory operands. + /// \returns true for SCRATCH_ instructions, or FLAT/BUF instructions unless + /// the MMOs do not include scratch. /// Conservatively correct; will return true if \p MI cannot be proven /// to not hit scratch. - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + bool mayAccessScratch(const MachineInstr &MI) const; /// \returns true for FLAT instructions that can access VMEM. bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; @@ -1174,13 +1180,13 @@ public: bool isVGPRCopy(const MachineInstr &MI) const { assert(isCopyInstr(MI)); Register Dest = MI.getOperand(0).getReg(); - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); return !RI.isSGPRReg(MRI, Dest); } bool hasVGPRUses(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); return llvm::any_of(MI.explicit_uses(), [&MRI, this](const MachineOperand &MO) { @@ -1622,10 +1628,6 @@ public: /// Return true if this opcode should not be used by codegen. bool isAsmOnlyOpcode(int MCOp) const; - const TargetRegisterClass * - getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; - void fixImplicitOperands(MachineInstr &MI) const; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, @@ -1655,6 +1657,7 @@ public: const TargetSchedModel &getSchedModel() const { return SchedModel; } + // FIXME: This should be removed // Enforce operand's \p OpName even alignment if required by target. // This is used if an operand is a 32 bit register but needs to be aligned // regardless. @@ -1687,7 +1690,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, /// skipping copy like instructions and subreg-manipulation pseudos. 
/// Following another subreg of a reg:subreg isn't supported. MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI); + const MachineRegisterInfo &MRI); /// \brief Return false if EXEC is not changed between the def of \p VReg at \p /// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b7f63ec..628b972 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -6,13 +6,6 @@ // //===----------------------------------------------------------------------===// -def isWave32 : Predicate<"Subtarget->isWave32()">, - AssemblerPredicate <(any_of FeatureWavefrontSize32, - FeatureAssemblerPermissiveWavesize)>; -def isWave64 : Predicate<"Subtarget->isWave64()">, - AssemblerPredicate <(any_of FeatureWavefrontSize64, - FeatureAssemblerPermissiveWavesize)>; - class AMDGPUMnemonicAlias<string From, string To, string VariantName = ""> : MnemonicAlias<From, To, VariantName>, PredicateControl; @@ -57,6 +50,8 @@ def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>; // SI DAG Nodes //===----------------------------------------------------------------------===// +// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output +// modifier behavior with dx10_enable. def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; def SDTSBufferLoad : SDTypeProfile<1, 3, @@ -331,7 +326,7 @@ def mfma_f32_32x32x64_f8f6f4 : UnscaledMFMAOptimizationPat<int_amdgcn_mfma_scale //===----------------------------------------------------------------------===// class isIntType<ValueType SrcVT> { - bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value)); + bit ret = !and(SrcVT.isInteger, !ne(SrcVT, i1)); } def SDTSBufferPrefetch : SDTypeProfile<0, 3, @@ -776,11 +771,7 @@ def xnor : PatFrag < foreach I = 1-4 in { def shl#I#_add : PatFrag < (ops node:$src0, node:$src1), - (add (shl_oneuse $src0, (i32 I)), $src1)> { - // FIXME: Poor substitute for disabling pattern in SelectionDAG - let PredicateCode = [{return false;}]; - let GISelPredicateCode = [{return true;}]; -} + (add (shl_oneuse $src0, (i32 I)), $src1)>; } multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, @@ -818,6 +809,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">; defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">; +defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">; +defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">; defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; @@ -1796,10 +1789,10 @@ class SIMCInstr <string pseudo, int subtarget> { class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = - !if (!eq(Src0.Value, untyped.Value), 0, - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 - !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3))); // VOP3 + !if (!eq(Src0, untyped), 0, + !if (!eq(Src1, untyped), 1, // VOP1 + !if (!eq(Src2, untyped), 2, // VOP2 + 3))); // VOP3 } // Returns the register class to use for the destination of VOP[123C] @@ -1868,17 +1861,17 @@ class getVCSrcForVT<ValueType VT> { !if(VT.isFP, !if(!eq(VT.Size, 64), VCSrc_f64, - !cond(!eq(VT.Value, f16.Value) : VCSrc_f16, - !eq(VT.Value, bf16.Value) : 
VCSrc_bf16, - !eq(VT.Value, v2f16.Value) : VCSrc_v2f16, - !eq(VT.Value, v2bf16.Value) : VCSrc_v2bf16, + !cond(!eq(VT, f16) : VCSrc_f16, + !eq(VT, bf16) : VCSrc_bf16, + !eq(VT, v2f16) : VCSrc_v2f16, + !eq(VT, v2bf16) : VCSrc_v2bf16, 1 : VCSrc_f32) ), !if(!eq(VT.Size, 64), VCSrc_b64, - !if(!eq(VT.Value, i16.Value), + !if(!eq(VT, i16), VCSrc_b16, - !if(!eq(VT.Value, v2i16.Value), + !if(!eq(VT, v2i16), VCSrc_v2b16, VCSrc_b32 ) @@ -2003,28 +1996,28 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { // Float or packed int class isModifierType<ValueType SrcVT> { - bit ret = !or(!eq(SrcVT.Value, f16.Value), - !eq(SrcVT.Value, bf16.Value), - !eq(SrcVT.Value, f32.Value), - !eq(SrcVT.Value, f64.Value), - !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v2i16.Value), - !eq(SrcVT.Value, v2bf16.Value), - !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v2i32.Value), - !eq(SrcVT.Value, v4f16.Value), - !eq(SrcVT.Value, v4i16.Value), - !eq(SrcVT.Value, v4bf16.Value), - !eq(SrcVT.Value, v4f32.Value), - !eq(SrcVT.Value, v4i32.Value), - !eq(SrcVT.Value, v8f16.Value), - !eq(SrcVT.Value, v8i16.Value), - !eq(SrcVT.Value, v8bf16.Value), - !eq(SrcVT.Value, v8f32.Value), - !eq(SrcVT.Value, v8i32.Value), - !eq(SrcVT.Value, v16f16.Value), - !eq(SrcVT.Value, v16i16.Value), - !eq(SrcVT.Value, v16bf16.Value)); + bit ret = !or(!eq(SrcVT, f16), + !eq(SrcVT, bf16), + !eq(SrcVT, f32), + !eq(SrcVT, f64), + !eq(SrcVT, v2f16), + !eq(SrcVT, v2i16), + !eq(SrcVT, v2bf16), + !eq(SrcVT, v2f32), + !eq(SrcVT, v2i32), + !eq(SrcVT, v4f16), + !eq(SrcVT, v4i16), + !eq(SrcVT, v4bf16), + !eq(SrcVT, v4f32), + !eq(SrcVT, v4i32), + !eq(SrcVT, v8f16), + !eq(SrcVT, v8i16), + !eq(SrcVT, v8bf16), + !eq(SrcVT, v8f32), + !eq(SrcVT, v8i32), + !eq(SrcVT, v16f16), + !eq(SrcVT, v16i16), + !eq(SrcVT, v16bf16)); } // Return type of input modifiers operand for specified input operand. 
@@ -2057,9 +2050,9 @@ class getSrcModDPP <ValueType VT> { class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> { Operand ret = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16_Lo128VRegInputMods<IsFake16>, FPVRegInputMods), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16_Lo128VRegInputMods<IsFake16>, IntVRegInputMods)); } @@ -2068,11 +2061,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> { class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> { Operand ret = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VCSrcInputMods<IsFake16>, - !if (!eq(VT.Value, f64.Value), FP64VCSrcInputMods, + !if (!eq(VT, f64), FP64VCSrcInputMods, FP32VCSrcInputMods)), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16VCSrcInputMods<IsFake16>, Int32VCSrcInputMods)); } @@ -2084,15 +2077,15 @@ class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> { class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> { defvar T16Dst = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VRegInputMods<IsFake16>, FPVRegT16DstInputMods), - !if (!eq(VT.Value, i16.Value), IntT16VRegInputMods<IsFake16>, + !if (!eq(VT, i16), IntT16VRegInputMods<IsFake16>, IntVRegT16DstInputMods)); defvar Normal = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VRegInputMods<IsFake16>, FPVRegInputMods), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16VRegInputMods<IsFake16>, IntVRegInputMods)); Operand ret = !if(!and(!not(IsFake16), !eq(DstVT.Size, 16)), T16Dst, Normal); @@ -2102,16 +2095,16 @@ class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> { // only operands (VOPD3 vsrc1 and vsrc2). 
class getSrcModVOP3V <ValueType VT> { Operand ret = - !if (!eq(VT.Value, f64.Value), FP64VRegSrcInputMods, + !if (!eq(VT, f64), FP64VRegSrcInputMods, FP32VRegSrcInputMods); } // Return type of input modifiers operand specified input operand for SDWA class getSrcModSDWA <ValueType VT> { - Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods, - !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods, - !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods, - !if(!eq(VT.Value, bf16.Value), FP16SDWAInputMods, + Operand ret = !if(!eq(VT, f16), FP16SDWAInputMods, + !if(!eq(VT, f32), FP32SDWAInputMods, + !if(!eq(VT, i16), Int16SDWAInputMods, + !if(!eq(VT, bf16), FP16SDWAInputMods, Int32SDWAInputMods)))); } @@ -2778,14 +2771,14 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit HasFP8ByteSel = !or(HasFP8SrcByteSel, HasFP8DstByteSel); field bit HasBitOp3 = 0; - field bit HasDst = !ne(DstVT.Value, untyped.Value); + field bit HasDst = !ne(DstVT, untyped); field bit HasDst32 = HasDst; field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case field bit EmitDstSel = EmitDst; field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; - field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value); - field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value); - field bit HasSrc2 = !ne(Src2VT.Value, untyped.Value); + field bit HasSrc0 = !ne(Src0VT, untyped); + field bit HasSrc1 = !ne(Src1VT, untyped); + field bit HasSrc2 = !ne(Src2VT, untyped); field bit HasSrc0FloatMods = Src0VT.isFP; field bit HasSrc1FloatMods = Src1VT.isFP; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6f1feb1..984d1a4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -348,7 +348,7 @@ class WaveReduceOp<string OpName, string TypeStr, ValueType Ty, // Input list : [Operation_name, // type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B), -// bit-width +// input-type // output register class, // input register class] defvar Operations = [ @@ -371,6 +371,11 @@ defvar Operations = [ WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>, WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>, WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>, + + WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>, ]; foreach Op = Operations in { @@ -791,6 +796,17 @@ def : GCNPat< (SI_CALL_ISEL $src0, (i64 0)) >; +// Funnel shift right (fshr) patterns for uniform inputs. +// These patterns implement this using scalar instructions by constructing a 64-bit +// value {a, b} and performing a single right shift. +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0)) +>; + +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc. 
def SI_CALL : SPseudoInstSI < @@ -804,9 +820,8 @@ def SI_CALL : SPseudoInstSI < let isConvergent = 1; } -class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs), - (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), - [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { +class SI_TCRETURN_Pseudo<RegisterClass rc, list<dag> pattern = []> + : SPseudoInstSI <(outs), (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), pattern> { let Size = 4; let FixedSize = 1; let isCall = 1; @@ -820,8 +835,15 @@ class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs), } // Tail call handling pseudo -def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>; -def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>; +def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, + [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>; +def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, + [(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>; + +// Tail call for chain calling conventions. +// Uses unrestricted SGPR_64 instead of CCR_SGPR_64 because chain calls +// never return and don't need to preserve any SGPRs. +def SI_TCRETURN_CHAIN : SI_TCRETURN_Pseudo<SGPR_64>; // Handle selecting indirect tail calls def : GCNPat< @@ -851,13 +873,13 @@ multiclass SI_CS_CHAIN_TC< // This is essentially a tail call, but it also takes a mask to put in EXEC // right before jumping to the callee. def NAME: SPseudoInstSI <(outs), - (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>; + (ins SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>; // Same as above, but it will first try to reallocate the VGPRs, and choose an // EXEC mask and a callee depending on the success of the reallocation attempt. def _DVGPR : SPseudoInstSI <(outs), - (ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec, - SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>; + (ins SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec, + SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)>; } // End FixedSize = 0 etc } @@ -869,7 +891,7 @@ multiclass si_cs_chain_tc_pattern< dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> { def : GCNPat< (AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec), - (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec) + (tc SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec) >; } @@ -896,8 +918,8 @@ multiclass si_cs_chain_tc_dvgpr_patterns< (AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec, i32:$numvgprs, execvt:$fbexec, i64:$fbcallee), - (tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec, - SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee) + (tc SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec, + SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee) >; } } @@ -1429,7 +1451,7 @@ def : GCNPat < // Don't allow source modifiers. If there are any source modifiers then it's // better to select fma instead of fmac. 
-let SubtargetPredicate = HasFmaLegacy32 in +let SubtargetPredicate = HasFmacLegacy32 in def : GCNPat < (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0), (VOP3NoMods f32:$src1), @@ -2663,8 +2685,6 @@ def : AMDGPUPat < let True16Predicate = NotHasTrue16BitInsts in { let SubtargetPredicate = isNotGFX9Plus in { -def : ROTRPattern <V_ALIGNBIT_B32_e64>; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; @@ -2675,14 +2695,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm: } // isNotGFX9Plus let SubtargetPredicate = isGFX9GFX10 in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in def : GCNPat<pat, @@ -2704,15 +2716,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - (EXTRACT_SUBREG $src1, lo16), - /* clamp */ 0, /* op_sel */ 0) ->; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), @@ -2731,14 +2734,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = UseRealTrue16Insts let True16Predicate = UseFakeTrue16Insts in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), @@ -2930,15 +2925,25 @@ def : GCNPat < >; def : GCNPat < - (i64 (zext i32:$src)), + (i64 (UniformUnaryFrag<zext> i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) >; def : GCNPat < - (i64 (anyext i32:$src)), + (i64 (zext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1) +>; + +def : GCNPat < + (i64 (UniformUnaryFrag<anyext> i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) >; +def : GCNPat < + (i64 (anyext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) +>; + class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, @@ -4527,6 +4532,7 @@ def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; @@ -4725,3 +4731,14 @@ def V_ILLEGAL : Enc32, 
InstSI<(outs), (ins), "v_illegal"> { let hasSideEffects = 1; let SubtargetPredicate = isGFX10Plus; } + +defvar VGPR32_Ptr_Opcodes = [LOAD_STACK_GUARD]; +defvar VGPR64_Ptr_Opcodes = !listremove(PseudosWithPtrOps, VGPR32_Ptr_Opcodes); + +foreach inst = VGPR32_Ptr_Opcodes in { + def : RemapPointerOperands<inst, VGPR_32>; +} + +foreach inst = VGPR64_Ptr_Opcodes in { + def : RemapPointerOperands<inst, VReg_64_AlignTarget>; +} diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 6537b79..340c9f6 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -186,7 +186,7 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI, for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx) MI.removeOperand(OpIdx); - MI.setDesc(TII->get(AMDGPU::SI_TCRETURN)); + MI.setDesc(TII->get(AMDGPU::SI_TCRETURN_CHAIN)); } void SILateBranchLowering::earlyTerm(MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index f0d1117..fcf91e0 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -233,10 +233,11 @@ private: void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName, Register DestReg) const; + const DebugLoc &DL, AMDGPU::OpName OpName, + Register DestReg) const; Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName) const; + const DebugLoc &DL, AMDGPU::OpName OpName) const; unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; @@ -1336,11 +1337,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), AMDGPU::OpName::data1); - const TargetRegisterClass *DataRC0 = - TII->getRegClass(Write2Opc, Data0Idx, TRI); + const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx); - const TargetRegisterClass *DataRC1 = - TII->getRegClass(Write2Opc, Data1Idx, TRI); + const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx); if (unsigned SubReg = Data0->getSubReg()) { DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), @@ -1367,10 +1366,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // Paired. 
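The copyToDestRegs/copyFromSrcRegs helpers below now take the DebugLoc explicitly so callers can pass a location merged from both of the paired accesses rather than reusing only the first instruction's location. A condensed sketch of the caller-side pattern repeated throughout the merge functions (fragment only; variable names follow the surrounding code):

// When two memory operations are combined, the merged instruction and the
// destination copies carry a debug location merged from both originals.
DebugLoc DL =
    DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode));
// ... add operands and the combined memory operand for the merged access ...
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);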
void SILoadStoreOptimizer::copyToDestRegs( CombineInfo &CI, CombineInfo &Paired, - MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName, - Register DestReg) const { + MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, + AMDGPU::OpName OpName, Register DestReg) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1398,9 +1396,9 @@ void SILoadStoreOptimizer::copyToDestRegs( Register SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, AMDGPU::OpName OpName) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1456,7 +1454,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); Register DestReg = MRI->createVirtualRegister(SuperRC); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); @@ -1484,7 +1483,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1541,7 +1540,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); @@ -1582,7 +1582,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1607,7 +1609,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1618,7 +1620,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1639,7 +1643,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( New.addImm(MergedOffset); New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - 
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1650,7 +1654,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1680,7 +1686,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1691,7 +1697,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1731,7 +1739,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1742,12 +1750,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); @@ -1789,7 +1798,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1807,7 +1818,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( .addImm(CI.CPol) .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1818,12 +1829,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned 
Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) @@ -2094,12 +2107,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 96131bd..9b71001 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { assert(Changed || ConstrainRegs.empty()); for (Register Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); + MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass()); ConstrainRegs.clear(); return Changed; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 40eeeb8..cbd08f0 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -117,27 +117,26 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MachineFunction &MF = *SaveBlock.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *RI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); - if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) { for (const CalleeSavedInfo &CS : CSI) { // Insert the spill to the stack frame. MCRegister Reg = CS.getReg(); MachineInstrSpan MIS(I, &SaveBlock); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + const TargetRegisterClass *RC = RI->getMinimalPhysRegClass( Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); // If this value was already livein, we probably have a direct use of the // incoming register value, so don't kill at the spill point. This happens // since we pass some special inputs (workgroup IDs) in the callee saved // range. 
- const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI); + const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI); TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(), - RC, TRI, Register()); + RC, Register()); if (Indexes) { assert(std::distance(MIS.begin(), I) == 1); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index b398db4..9abda27 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -696,7 +696,6 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, return true; }; - // TODO: Need to serialize kernarg preloads. bool Any = false; Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer); Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr); @@ -718,6 +717,21 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY); Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ); + // Write FirstKernArgPreloadReg separately, since it's a Register, + // not ArgDescriptor. + if (ArgInfo.FirstKernArgPreloadReg) { + Register Reg = ArgInfo.FirstKernArgPreloadReg; + assert(Reg.isPhysical() && + "FirstKernArgPreloadReg must be a physical register"); + + yaml::SIArgument SA = yaml::SIArgument::createArgument(true); + raw_string_ostream OS(SA.RegisterName.Value); + OS << printReg(Reg, &TRI); + + AI.FirstKernArgPreloadReg = SA; + Any = true; + } + if (Any) return AI; @@ -750,7 +764,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()), IsWholeWaveFunction(MFI.isWholeWaveFunction()), DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()), - ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) { + ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()), + NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); @@ -799,6 +814,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; + UserSGPRInfo.allocKernargPreloadSGPRs(YamlMFI.NumKernargPreloadSGPRs); + if (YamlMFI.ScavengeFI) { auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo()); if (!FIOrErr) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 019c3b7..d901f4c 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -170,6 +170,7 @@ struct SIArgumentInfo { std::optional<SIArgument> DispatchID; std::optional<SIArgument> FlatScratchInit; std::optional<SIArgument> PrivateSegmentSize; + std::optional<SIArgument> FirstKernArgPreloadReg; std::optional<SIArgument> WorkGroupIDX; std::optional<SIArgument> WorkGroupIDY; @@ -195,6 +196,7 @@ template <> struct MappingTraits<SIArgumentInfo> { YamlIO.mapOptional("dispatchID", AI.DispatchID); YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit); YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize); + YamlIO.mapOptional("firstKernArgPreloadReg", AI.FirstKernArgPreloadReg); YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX); YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY); @@ -305,6 +307,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { unsigned DynamicVGPRBlockSize = 0; unsigned ScratchReservedForDynamicVGPRs = 0; + unsigned 
NumKernargPreloadSGPRs = 0; + SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, const TargetRegisterInfo &TRI, @@ -361,6 +365,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false); YamlIO.mapOptional("scratchReservedForDynamicVGPRs", MFI.ScratchReservedForDynamicVGPRs, 0); + YamlIO.mapOptional("numKernargPreloadSGPRs", MFI.NumKernargPreloadSGPRs, 0); YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false); } }; @@ -1014,7 +1019,9 @@ public: void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { - return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); + if (ArgInfo.PrivateSegmentWaveByteOffset) + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); + return MCRegister(); } /// Returns the physical register reserved for use as the resource diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index fd28abe..2f3ad39 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -323,8 +323,8 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Do not Track Physical Registers, because it messes up. for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - if (RegMaskPair.RegUnit.isVirtual()) - LiveInRegs.insert(RegMaskPair.RegUnit); + if (RegMaskPair.VRegOrUnit.isVirtualReg()) + LiveInRegs.insert(RegMaskPair.VRegOrUnit.asVirtualReg()); } LiveOutRegs.clear(); // There is several possibilities to distinguish: @@ -350,12 +350,13 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7 // The use of findDefBetween removes the case 4. for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { - Register Reg = RegMaskPair.RegUnit; - if (Reg.isVirtual() && - isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), + VirtRegOrUnit VRegOrUnit = RegMaskPair.VRegOrUnit; + if (VRegOrUnit.isVirtualReg() && + isDefBetween(VRegOrUnit.asVirtualReg(), + LIS->getInstructionIndex(*BeginBlock).getRegSlot(), LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, LIS)) { - LiveOutRegs.insert(Reg); + LiveOutRegs.insert(VRegOrUnit.asVirtualReg()); } } @@ -578,11 +579,11 @@ void SIScheduleBlock::printDebug(bool full) { << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n"; dbgs() << "LiveIns:\n"; for (Register Reg : LiveInRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; dbgs() << "\nLiveOuts:\n"; for (Register Reg : LiveOutRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; } dbgs() << "\nInstructions:\n"; @@ -1446,23 +1447,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, } #endif - std::set<Register> InRegs = DAG->getInRegs(); + std::set<VirtRegOrUnit> InRegs = DAG->getInRegs(); addLiveRegs(InRegs); // Increase LiveOutRegsNumUsages for blocks // producing registers consumed in another // scheduling region. 
- for (Register Reg : DAG->getOutRegs()) { + for (VirtRegOrUnit VRegOrUnit : DAG->getOutRegs()) { for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { // Do reverse traversal int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; SIScheduleBlock *Block = Blocks[ID]; const std::set<Register> &OutRegs = Block->getOutRegs(); - if (OutRegs.find(Reg) == OutRegs.end()) + if (!VRegOrUnit.isVirtualReg() || + OutRegs.find(VRegOrUnit.asVirtualReg()) == OutRegs.end()) continue; - ++LiveOutRegsNumUsages[ID][Reg]; + ++LiveOutRegsNumUsages[ID][VRegOrUnit.asVirtualReg()]; break; } } @@ -1565,15 +1567,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { maxVregUsage = VregCurrentUsage; if (SregCurrentUsage > maxSregUsage) maxSregUsage = SregCurrentUsage; - LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; - for (SIScheduleBlock *Block : ReadyBlocks) - dbgs() << Block->getID() << ' '; - dbgs() << "\nCurrent Live:\n"; - for (Register Reg : LiveRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; - dbgs() << '\n'; - dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; - dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); + LLVM_DEBUG({ + dbgs() << "Picking New Blocks\n"; + dbgs() << "Available: "; + for (SIScheduleBlock *Block : ReadyBlocks) + dbgs() << Block->getID() << ' '; + dbgs() << "\nCurrent Live:\n"; + for (Register Reg : LiveRegs) + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; + dbgs() << '\n'; + dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; + dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; + }); Cand.Block = nullptr; for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), @@ -1625,13 +1630,13 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { // Tracking of currently alive registers to determine VGPR Usage. -void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) { - for (Register Reg : Regs) { +void SIScheduleBlockScheduler::addLiveRegs(std::set<VirtRegOrUnit> &Regs) { + for (VirtRegOrUnit VRegOrUnit : Regs) { // For now only track virtual registers. - if (!Reg.isVirtual()) + if (!VRegOrUnit.isVirtualReg()) continue; // If not already in the live set, then add it. - (void) LiveRegs.insert(Reg); + (void)LiveRegs.insert(VRegOrUnit.asVirtualReg()); } } @@ -1662,7 +1667,7 @@ void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { decreaseLiveRegs(Block, Block->getInRegs()); - addLiveRegs(Block->getOutRegs()); + LiveRegs.insert(Block->getOutRegs().begin(), Block->getOutRegs().end()); releaseBlockSuccs(Block); for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { // We produce this register, thus it must not be previously alive. @@ -1689,7 +1694,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs, continue; if (LiveRegsConsumers[Reg] > 1) continue; - PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { DiffSetPressure[*PSetI] -= PSetI.getWeight(); } @@ -1699,7 +1704,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs, // For now only track virtual registers. 
if (!Reg.isVirtual()) continue; - PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { DiffSetPressure[*PSetI] += PSetI.getWeight(); } @@ -1846,7 +1851,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, // For now only track virtual registers if (!Reg.isVirtual()) continue; - PSetIterator PSetI = MRI.getPressureSets(Reg); + PSetIterator PSetI = MRI.getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32) VgprUsage += PSetI.getWeight(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index b219cbd..1245774 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -389,7 +389,7 @@ private: SIBlockSchedCandidate &TryCand); SIScheduleBlock *pickBlock(); - void addLiveRegs(std::set<Register> &Regs); + void addLiveRegs(std::set<VirtRegOrUnit> &Regs); void decreaseLiveRegs(SIScheduleBlock *Block, std::set<Register> &Regs); void releaseBlockSuccs(SIScheduleBlock *Parent); void blockScheduled(SIScheduleBlock *Block); @@ -462,18 +462,18 @@ public: unsigned &VgprUsage, unsigned &SgprUsage); - std::set<Register> getInRegs() { - std::set<Register> InRegs; + std::set<VirtRegOrUnit> getInRegs() { + std::set<VirtRegOrUnit> InRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - InRegs.insert(RegMaskPair.RegUnit); + InRegs.insert(RegMaskPair.VRegOrUnit); } return InRegs; } - std::set<unsigned> getOutRegs() { - std::set<unsigned> OutRegs; + std::set<VirtRegOrUnit> getOutRegs() { + std::set<VirtRegOrUnit> OutRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { - OutRegs.insert(RegMaskPair.RegUnit); + OutRegs.insert(RegMaskPair.VRegOrUnit); } return OutRegs; }; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6ab8d552..a082d53 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -302,16 +302,17 @@ protected: SICacheControl(const GCNSubtarget &ST); - /// Sets named bit \p BitName to "true" if present in instruction \p MI. + /// Sets CPol \p Bits to "true" if present in instruction \p MI. /// \returns Returns true if \p MI is modified, false otherwise. - bool enableNamedBit(const MachineBasicBlock::iterator MI, - AMDGPU::CPol::CPol Bit) const; + bool enableCPolBits(const MachineBasicBlock::iterator MI, + unsigned Bits) const; /// Check if any atomic operation on AS can affect memory accessible via the /// global address space. bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const; public: + using CPol = AMDGPU::CPol::CPol; /// Create a cache control for the subtarget \p ST. static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); @@ -401,21 +402,9 @@ public: virtual ~SICacheControl() = default; }; -class SIGfx6CacheControl : public SICacheControl { -protected: - - /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::GLC); - } - - /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. 
- bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SLC); - } - +/// Generates code sequences for the memory model of all GFX targets below +/// GFX10. +class SIGfx6CacheControl final : public SICacheControl { public: SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} @@ -454,76 +443,10 @@ public: Position Pos) const override; }; -class SIGfx7CacheControl : public SIGfx6CacheControl { +/// Generates code sequences for the memory model of GFX10/11. +class SIGfx10CacheControl final : public SICacheControl { public: - - SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - -}; - -class SIGfx90ACacheControl : public SIGfx7CacheControl { -public: - - SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; - - bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order, bool AtomicsOnly) const override; - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const override; -}; - -class SIGfx940CacheControl : public SIGfx90ACacheControl { -protected: - - /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SC0); - } - - /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SC1); - } - - /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. 
- bool enableNTBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::NT); - } - -public: - SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; + SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -531,42 +454,16 @@ public: bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; + SIAtomicAddrSpace AddrSpace) const override { + return false; + } bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; - - bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, Position Pos) const override; - - bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, - Position Pos) const override; -}; - -class SIGfx10CacheControl : public SIGfx7CacheControl { -protected: - - /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::DLC); + SIAtomicAddrSpace AddrSpace) const override { + return false; } -public: - - SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, @@ -579,28 +476,23 @@ public: bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; -}; - -class SIGfx11CacheControl : public SIGfx10CacheControl { -public: - SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override { + return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); + } }; -class SIGfx12CacheControl : public SIGfx11CacheControl { +class SIGfx12CacheControl final : public SICacheControl { protected: // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. // \returns Returns true if \p MI is modified, false otherwise. bool setTH(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const; + // Sets Scope policy to \p Value if CPol operand is present in instruction \p // MI. \returns Returns true if \p MI is modified, false otherwise. 
bool setScope(const MachineBasicBlock::iterator MI, @@ -619,7 +511,7 @@ protected: SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { + SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) { // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases // the behavior is the same if assuming GFX12.0 in CU mode. assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); @@ -777,7 +669,7 @@ getSynchronizeAddrSpaceMD(const MachineInstr &MI) { void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, const char *Msg) const { - const Function &Func = MI->getParent()->getParent()->getFunction(); + const Function &Func = MI->getMF()->getFunction(); Func.getContext().diagnose( DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc())); } @@ -884,6 +776,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( } } + // FIXME: The MMO of buffer atomic instructions does not always have an atomic + // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it + // here, but the lowering should really be cleaned up at some point. + if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) && + SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic) + Ordering = AtomicOrdering::Monotonic; + SIAtomicScope Scope = SIAtomicScope::NONE; SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; bool IsCrossAddressSpaceOrdering = false; @@ -1006,13 +905,13 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { InsertCacheInv = !AmdgcnSkipCacheInvalidations; } -bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, - AMDGPU::CPol::CPol Bit) const { +bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI, + unsigned Bits) const { MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); if (!CPol) return false; - CPol->setImm(CPol->getImm() | Bit); + CPol->setImm(CPol->getImm() | Bits); return true; } @@ -1028,18 +927,10 @@ bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const { /* static */ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); - if (ST.hasGFX940Insts()) - return std::make_unique<SIGfx940CacheControl>(ST); - if (ST.hasGFX90AInsts()) - return std::make_unique<SIGfx90ACacheControl>(ST); - if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) - return std::make_unique<SIGfx7CacheControl>(ST); - if (Generation < AMDGPUSubtarget::GFX11) - return std::make_unique<SIGfx10CacheControl>(ST); + return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX12) - return std::make_unique<SIGfx11CacheControl>(ST); + return std::make_unique<SIGfx10CacheControl>(ST); return std::make_unique<SIGfx12CacheControl>(ST); } @@ -1048,33 +939,61 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: + if (!canAffectGlobalAddrSpace(AddrSpace)) { + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, 
and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + return false; + } + + bool Changed = false; + switch (Scope) { + case SIAtomicScope::SYSTEM: + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + break; + } + [[fallthrough]]; + case SIAtomicScope::AGENT: + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate agent scope. + Changed |= enableCPolBits(MI, CPol::SC1); + } else { // Set L1 cache policy to MISS_EVICT. // Note: there is no L2 cache bypass policy at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); + Changed |= enableCPolBits(MI, CPol::GLC); + } + break; + case SIAtomicScope::WORKGROUP: + if (ST.hasGFX940Insts()) { + // In threadgroup split mode the waves of a work-group can be executing + // on different CUs. Therefore need to bypass the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. Setting + // SC bits to indicate work-group scope will do this automatically. + Changed |= enableCPolBits(MI, CPol::SC0); + } else if (ST.hasGFX90AInsts()) { + // In threadgroup split mode the waves of a work-group can be executing + // on different CUs. Therefore need to bypass the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. + if (ST.isTgSplitEnabled()) + Changed |= enableCPolBits(MI, CPol::GLC); } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - return Changed; } @@ -1085,8 +1004,39 @@ bool SIGfx6CacheControl::enableStoreCacheBypass( assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// The L1 cache is write through so does not need to be bypassed. There is no - /// bypass control for the L2 cache at the isa level. + /// For targets other than GFX940, the L1 cache is write through so does not + /// need to be bypassed. There is no bypass control for the L2 cache at the + /// isa level. + + if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + break; + case SIAtomicScope::AGENT: + // Set SC bits to indicate agent scope. + Changed |= enableCPolBits(MI, CPol::SC1); + break; + case SIAtomicScope::WORKGROUP: + // Set SC bits to indicate workgroup scope. + Changed |= enableCPolBits(MI, CPol::SC0); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Leave SC bits unset to indicate wavefront scope. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + } return Changed; } @@ -1098,10 +1048,31 @@ bool SIGfx6CacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically - /// bypassed, and the GLC bit is instead used to indicate if they are - /// return or no-return. - /// Note: there is no L2 cache coherent bypass control at the ISA level. + /// For targets other than GFX940, do not set GLC for RMW atomic operations as + /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to + /// indicate if they are return or no-return. Note: there is no L2 cache + /// coherent bypass control at the ISA level. + /// For GFX90A+, RMW atomics implicitly bypass the L1 cache. + + if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC1 bit to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC1); + break; + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // RMW atomic operations implicitly bypass the L1 cache and only use SC1 + // to indicate system or agent scope. The SC0 bit is used to indicate if + // they are return or no-return. Leave SC1 bit unset to indicate agent + // scope. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } return Changed; } @@ -1123,11 +1094,15 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { - // Set L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache bypass policy at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + } else if (Op == SIMemOp::LOAD) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. + Changed |= enableCPolBits(MI, CPol::GLC); + } // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not @@ -1142,10 +1117,13 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( } if (IsNonTemporal) { - // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT - // for both loads and stores, and the L2 cache policy to STREAM. - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + if (ST.hasGFX940Insts()) { + Changed |= enableCPolBits(MI, CPol::NT); + } else { + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. 
+ Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC); + } return Changed; } @@ -1166,6 +1144,26 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; + // GFX90A+ + if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) { + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to wait for global or GDS memory operations + // to complete to ensure they are visible to waves in the other CUs. + // Otherwise in non-threadgroup split mode all waves of a work-group are on + // the same CU, so no need to wait for global memory as all waves in the + // work-group access the same the L1, nor wait for GDS as access are ordered + // on a CU. + if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | + SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && + (Scope == SIAtomicScope::WORKGROUP)) { + // Same as <GFX90A at AGENT scope; + Scope = SIAtomicScope::AGENT; + } + // In threadgroup split mode LDS cannot be allocated so no need to wait for + // LDS memory operations. + AddrSpace &= ~SIAtomicAddrSpace::LDS; + } + bool VMCnt = false; bool LGKMCnt = false; @@ -1260,62 +1258,13 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) +static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) { + if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return false; - - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - if (Pos == Position::AFTER) - --MI; - - return Changed; + return !ST.isAmdPalOS() && !ST.isMesa3DOS(); } -bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, - /*AtomicsOnly=*/false); -} - -bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, +bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const { @@ -1327,235 +1276,95 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); - - const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() - ? 
AMDGPU::BUFFER_WBINVL1 - : AMDGPU::BUFFER_WBINVL1_VOL; - if (Pos == Position::AFTER) ++MI; - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - if (Pos == Position::AFTER) - --MI; - - return Changed; -} - -bool SIGfx90ACacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; + const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST) + ? AMDGPU::BUFFER_WBINVL1_VOL + : AMDGPU::BUFFER_WBINVL1; if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - // Set the L1 cache policy to MISS_LRU. - // Note: there is no L2 cache bypass policy at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to bypass the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be bypassed. - if (ST.isTgSplitEnabled()) - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx90ACacheControl::enableRMWCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && MI->mayStore()); - bool Changed = false; + if (ST.hasGFX940Insts()) { + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW + // and CC will never be stale due to the local memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to + // remove any cache lines of earlier writes by the same wave and ensures + // later reads by the same wave will refetch the cache lines. + Changed = true; + break; + } - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: + if (ST.hasGFX90AInsts()) { + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. 
Local VMEM data with MTYPE RW + // and CC will never be stale due to the local memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed + // to remove any cache lines of earlier writes by the same wave and + // ensures later reads by the same wave will refetch the cache lines. + Changed = true; + break; + } + [[fallthrough]]; case SIAtomicScope::AGENT: - /// Do not set glc for RMW atomic operations as they implicitly bypass - /// the L1 cache, and the glc bit is instead used to indicate if they are - /// return or no-return. - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. RMW atomics implicitly bypass the L1 cache. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - return Changed; -} - -bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache bypass policy at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered, - /*AtomicsOnly=*/false); - - return Changed; - } - - if (IsNonTemporal) { - // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT - // for both loads and stores, and the L2 cache policy to STREAM. - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); - return Changed; - } - - return Changed; -} - -bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order, - bool AtomicsOnly) const { - if (ST.isTgSplitEnabled()) { - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to wait for global or GDS memory operations - // to complete to ensure they are visible to waves in the other CUs. 
- // Otherwise in non-threadgroup split mode all waves of a work-group are on - // the same CU, so no need to wait for global memory as all waves in the - // work-group access the same the L1, nor wait for GDS as access are ordered - // on a CU. - if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | - SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && - (Scope == SIAtomicScope::WORKGROUP)) { - // Same as GFX7 using agent scope. - Scope = SIAtomicScope::AGENT; - } - // In threadgroup split mode LDS cannot be allocated so no need to wait for - // LDS memory operations. - AddrSpace &= ~SIAtomicAddrSpace::LDS; - } - return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, - IsCrossAddrSpaceOrdering, Pos, Order, - AtomicsOnly); -} - -bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) - return false; - - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Ensures that following loads will not see stale remote VMEM data or - // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and - // CC will never be stale due to the local memory probes. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); - // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to - // remove any cache lines of earlier writes by the same wave and ensures - // later reads by the same wave will refetch the cache lines. + if (ST.hasGFX940Insts()) { + // Ensures that following loads will not see stale remote date or local + // MTYPE NC global data. Local MTYPE RW and CC memory will never be + // stale due to the memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware + // does not reorder memory operations with respect to preceeding buffer + // invalidate. The invalidate is guaranteed to remove any cache lines of + // earlier writes and ensures later writes will refetch the cache lines. + } else + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); Changed = true; break; - case SIAtomicScope::AGENT: - // Same as GFX7. - break; case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to invalidate the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be invalidated. if (ST.isTgSplitEnabled()) { - // Same as GFX7 using agent scope. - Scope = SIAtomicScope::AGENT; + if (ST.hasGFX940Insts()) { + // In threadgroup split mode the waves of a work-group can be + // executing on different CUs. Therefore need to invalidate the L1 + // which is per CU. Otherwise in non-threadgroup split mode all waves + // of a work-group are on the same CU, and so the L1 does not need to + // be invalidated. + + // Ensures L1 is invalidated if in threadgroup split mode. In + // non-threadgroup split mode it is a NOP, but no point generating it + // in that case if know not in that mode. 
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate work-group scope. + .addImm(AMDGPU::CPol::SC0); + // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware + // does not reorder memory operations with respect to preceeding + // buffer invalidate. The invalidate is guaranteed to remove any cache + // lines of earlier writes and ensures later writes will refetch the + // cache lines. + Changed = true; + } else if (ST.hasGFX90AInsts()) { + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); + Changed = true; + } } break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: - // Same as GFX7. + // For GFX940, we could generate "BUFFER_INV" but it would do nothing as + // there are no caches to invalidate. All other targets have no cache to + // invalidate. break; default: llvm_unreachable("Unsupported synchronization scope"); @@ -1572,356 +1381,65 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) --MI; - Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); - return Changed; } -bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - const DebugLoc &DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed - // to initiate writeback of any dirty cache lines of earlier writes by the - // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the - // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT - // vmcnt(0)" needed by the "BUFFER_WBL2". - Changed = true; - break; - case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Same as GFX7. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - if (Pos == Position::AFTER) - --MI; - - Changed |= - SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, - IsCrossAddrSpaceOrdering, Pos); - - return Changed; -} - -bool SIGfx940CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC bits to indicate system scope. - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - // Set SC bits to indicate agent scope. - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to bypass the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be bypassed. 
Setting SC - // bits to indicate work-group scope will do this automatically. - Changed |= enableSC0Bit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Leave SC bits unset to indicate wavefront scope. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx940CacheControl::enableStoreCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { - assert(!MI->mayLoad() && MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC bits to indicate system scope. - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - // Set SC bits to indicate agent scope. - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::WORKGROUP: - // Set SC bits to indicate workgroup scope. - Changed |= enableSC0Bit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Leave SC bits unset to indicate wavefront scope. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx940CacheControl::enableRMWCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC1 bit to indicate system scope. - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // RMW atomic operations implicitly bypass the L1 cache and only use SC1 - // to indicate system or agent scope. The SC0 bit is used to indicate if - // they are return or no-return. Leave SC1 bit unset to indicate agent - // scope. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - return Changed; -} - -bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set SC bits to indicate system scope. 
- Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered, - /*AtomicsOnly=*/false); - - return Changed; - } - - if (IsNonTemporal) { - Changed |= enableNTBit(MI); - return Changed; - } - - return Changed; -} - -bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) - return false; - +bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { bool Changed = false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + if (ST.hasGFX90AInsts()) { + MachineBasicBlock &MBB = *MI->getParent(); + const DebugLoc &DL = MI->getDebugLoc(); - if (Pos == Position::AFTER) - ++MI; + if (Pos == Position::AFTER) + ++MI; - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Ensures that following loads will not see stale remote VMEM data or - // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and - // CC will never be stale due to the local memory probes. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to - // remove any cache lines of earlier writes by the same wave and ensures - // later reads by the same wave will refetch the cache lines. - Changed = true; - break; - case SIAtomicScope::AGENT: - // Ensures that following loads will not see stale remote date or local - // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale - // due to the memory probes. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate agent scope. - .addImm(AMDGPU::CPol::SC1); - // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware - // does not reorder memory operations with respect to preceeding buffer - // invalidate. The invalidate is guaranteed to remove any cache lines of - // earlier writes and ensures later writes will refetch the cache lines. - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to invalidate the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be invalidated. - if (ST.isTgSplitEnabled()) { - // Ensures L1 is invalidated if in threadgroup split mode. In - // non-threadgroup split mode it is a NOP, but no point generating it in - // that case if know not in that mode. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate work-group scope. 
- .addImm(AMDGPU::CPol::SC0); - // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware - // does not reorder memory operations with respect to preceeding buffer - // invalidate. The invalidate is guaranteed to remove any cache lines of - // earlier writes and ensures later writes will refetch the cache lines. + if (canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by + // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); Changed = true; + break; + case SIAtomicScope::AGENT: + if (ST.hasGFX940Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + + // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is + // SIAtomicScope::AGENT, the following insertWait will generate the + // required "S_WAITCNT vmcnt(0)". + Changed = true; + } + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it + // would writeback, and would require an otherwise unnecessary + // "S_WAITCNT vmcnt(0)". + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Could generate "BUFFER_INV" but it would do nothing as there are no - // caches to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - if (Pos == Position::AFTER) - --MI; - - return Changed; -} - -bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed - // to initiate writeback of any dirty cache lines of earlier writes by the - // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the - // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is - // SIAtomicScope::SYSTEM, the following insertWait will generate the - // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". 
- Changed = true; - break; - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate agent scope. - .addImm(AMDGPU::CPol::SC1); - - // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is - // SIAtomicScope::AGENT, the following insertWait will generate the - // required "S_WAITCNT vmcnt(0)". - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Do not generate "BUFFER_WBL2" as there are no caches it would - // writeback, and would require an otherwise unnecessary - // "S_WAITCNT vmcnt(0)". - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } + if (Pos == Position::AFTER) + --MI; } - if (Pos == Position::AFTER) - --MI; - // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other // S_WAITCNT needed. Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, @@ -1932,8 +1450,7 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, } bool SIGfx10CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; @@ -1944,8 +1461,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( case SIAtomicScope::AGENT: // Set the L0 and L1 cache policies to MISS_EVICT. // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableGLCBit(MI); - Changed |= enableDLCBit(MI); + // For GFX10, set GLC+DLC, for GFX11, only set GLC. + Changed |= + enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0)); break; case SIAtomicScope::WORKGROUP: // In WGP mode the waves of a work-group can be executing on either CU of @@ -1953,7 +1471,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( // CU mode all waves of a work-group are on the same CU, and so the L0 // does not need to be bypassed. if (!ST.isCuModeEnabled()) - Changed |= enableGLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC); break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: @@ -1996,10 +1514,13 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // and MISS_LRU for store instructions. // Note: there is no L2 cache coherent bypass control at the ISA level. if (Op == SIMemOp::LOAD) { - Changed |= enableGLCBit(MI); - Changed |= enableDLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC); } + // GFX11: Set MALL NOALLOC for both load and store instructions. + if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2017,8 +1538,12 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // For stores setting both GLC and SLC configures L0 and L1 cache policy // to MISS_EVICT and the L2 cache policy to STREAM. if (Op == SIMemOp::STORE) - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC); + Changed |= enableCPolBits(MI, CPol::SLC); + + // GFX11: Set MALL NOALLOC for both load and store instructions. 
+ if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); return Changed; } @@ -2218,102 +1743,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx11CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - // Set the L0 and L1 cache policies to MISS_EVICT. - // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in - // CU mode all waves of a work-group are on the same CU, and so the L0 - // does not need to be bypassed. - if (!ST.isCuModeEnabled()) - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set L0 and L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache coherent bypass control at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableDLCBit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered, - /*AtomicsOnly=*/false); - return Changed; - } - - if (IsNonTemporal) { - // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT - // and L2 cache policy to STREAM. - // For stores setting both GLC and SLC configures L0 and L1 cache policy - // to MISS_EVICT and the L2 cache policy to STREAM. 
- if (Op == SIMemOp::STORE) - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableDLCBit(MI); - return Changed; - } - - return Changed; -} - bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const { MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); @@ -2637,6 +2066,13 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + if (ST.requiresWaitXCntForSingleAccessInstructions() && + SIInstrInfo::isVMEM(*MI)) { + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); + Changed = true; + } + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2655,9 +2091,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { const bool IsRMW = (MI.mayLoad() && MI.mayStore()); bool Changed = false; - // GFX12.5 only: xcnt wait is needed before flat and global atomics - // stores/rmw. - if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { + if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() && + SIInstrInfo::isVMEM(MI)) { MachineBasicBlock &MBB = *MI.getParent(); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); Changed = true; diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bfac639..acc4b3f 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -118,7 +118,7 @@ public: MachineInstr *getParentInst() const { return Target->getParent(); } MachineRegisterInfo *getMRI() const { - return &getParentInst()->getParent()->getParent()->getRegInfo(); + return &getParentInst()->getMF()->getRegInfo(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1284,7 +1284,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, // Clone the instruction to allow revoking changes // made to MI during the processing of the operands // if the conversion fails. 
- SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI); + SDWAInst = MI.getMF()->CloneMachineInstr(&MI); MI.getParent()->insert(MI.getIterator(), SDWAInst); } else { SDWAInst = createSDWAVersion(MI); @@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; for (MachineOperand &Op : MI.explicit_uses()) { - if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) - continue; - - unsigned I = Op.getOperandNo(); + if (Op.isReg()) { + if (TRI->isVGPR(*MRI, Op.getReg())) + continue; - int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); - if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) + if (ST.hasSDWAScalar() && ConstantBusCount == 0) { + ++ConstantBusCount; + continue; + } + } else if (!Op.isImm()) continue; - if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && - TRI->isSGPRReg(*MRI, Op.getReg())) { - ++ConstantBusCount; + unsigned I = Op.getOperandNo(); + const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I); + if (!OpRC || !TRI->isVSSuperClass(OpRC)) continue; - } Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp index 5720b97..b537e44 100644 --- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -110,7 +110,7 @@ void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI, "subregister indexes should not be present after RA"); for (MCRegUnit Unit : TRI->regunits(Reg)) - UsedRegUnits.set(Unit); + UsedRegUnits.set(static_cast<unsigned>(Unit)); } } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e11..8785968 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: @@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a6c1af2..66586e8 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -340,10 +340,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) "getNumCoveredRegs() will not work with generated subreg masks!"); RegPressureIgnoredUnits.resize(getNumRegUnits()); - RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin()); + RegPressureIgnoredUnits.set( + static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin())); for (auto Reg : AMDGPU::VGPR_16RegClass) { if (AMDGPU::isHi16Reg(Reg, *this)) - RegPressureIgnoredUnits.set(*regunits(Reg).begin()); + RegPressureIgnoredUnits.set( + static_cast<unsigned>(*regunits(Reg).begin())); } // HACK: Until this is fully 
tablegen'd. @@ -1949,7 +1951,7 @@ void SIRegisterInfo::buildSpillLoadStore( void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const { - const MachineFunction *MF = MIB->getParent()->getParent(); + const MachineFunction *MF = MIB->getMF(); const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg); Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0); @@ -2319,7 +2321,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { - MachineFunction *MF = MI->getParent()->getParent(); + MachineFunction *MF = MI->getMF(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); @@ -2981,10 +2983,36 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI, false, 0, !UseSGPR); - // TODO: for flat scratch another attempt can be made with a VGPR index - // if no SGPRs can be scavenged. - if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) + if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) { + int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode()); + if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) { + Register TmpVGPR = RS->scavengeRegisterBackwards( + AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true); + + // Materialize the frame register. + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR); + if (FrameReg) + MIB.addReg(FrameReg); + else + MIB.addImm(Offset); + + // Add the offset to the frame register. + if (FrameReg && Offset) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg) + .addReg(FrameReg, RegState::Kill) + .addImm(Offset); + + BuildMI(*MBB, MI, DL, TII->get(SVOpcode)) + .add(MI->getOperand(0)) // $vdata + .addReg(TmpVGPR) // $vaddr + .addImm(0) // Offset + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol)); + MI->eraseFromParent(); + return true; + } report_fatal_error("Cannot scavenge register in FI elimination!"); + } if (!TmpSReg) { // Use frame register and restore it after. @@ -3046,7 +3074,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (!IsMUBUF && !MFI->isBottomOfStack()) { // Convert to a swizzled stack address by scaling by the wave size. // In an entry function/kernel the offset is already swizzled. - bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); + bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum)); bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); const TargetRegisterClass *RC = IsSALU && !LiveSCC @@ -3558,6 +3586,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { } const TargetRegisterClass * +SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const { + // TODO: In principle this should use AV classes for gfx908 too. This is + // limited to 90a+ to avoid regressing special case copy optimizations which + // need new handling. The core issue is that it's not possible to directly + // copy between AGPRs on gfx908, and the current optimizations around that + // expect to see copies to VGPR. + return ST.hasGFX90AInsts() ? 
getVectorSuperClassForBitWidth(BitWidth) + : getVGPRClassForBitWidth(BitWidth); +} + +const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth == 16 || BitWidth == 32) return &AMDGPU::SReg_32RegClass; @@ -3628,6 +3667,14 @@ SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { } const TargetRegisterClass * +SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const { + unsigned Size = getRegSizeInBits(*SRC); + const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size); + assert(ARC && "Invalid register class size"); + return ARC; +} + +const TargetRegisterClass * SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { unsigned Size = getRegSizeInBits(*VRC); if (Size == 32) @@ -3734,27 +3781,6 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, return RC && isAGPRClass(RC); } -bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, - const TargetRegisterClass *SrcRC, - unsigned SubReg, - const TargetRegisterClass *DstRC, - unsigned DstSubReg, - const TargetRegisterClass *NewRC, - LiveIntervals &LIS) const { - unsigned SrcSize = getRegSizeInBits(*SrcRC); - unsigned DstSize = getRegSizeInBits(*DstRC); - unsigned NewSize = getRegSizeInBits(*NewRC); - - // Do not increase size of registers beyond dword, we would need to allocate - // adjacent registers and constraint regalloc more than needed. - - // Always allow dword coalescing. - if (SrcSize <= 32 || DstSize <= 32) - return true; - - return NewSize <= DstSize || NewSize <= SrcSize; -} - unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; @@ -3788,10 +3814,10 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, llvm_unreachable("Unexpected register pressure set!"); } -const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { +const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const { static const int Empty[] = { -1 }; - if (RegPressureIgnoredUnits[RegUnit]) + if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)]) return Empty; return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); @@ -3915,20 +3941,6 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } -const TargetRegisterClass * -SIRegisterInfo::getRegClass(unsigned RCID) const { - switch ((int)RCID) { - case AMDGPU::SReg_1RegClassID: - return getBoolRC(); - case AMDGPU::SReg_1_XEXECRegClassID: - return getWaveMaskRegClass(); - case -1: - return nullptr; - default: - return AMDGPUGenRegisterInfo::getRegClass(RCID); - } -} - // Find reaching register definition MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, @@ -4017,28 +4029,6 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { return true; } -const TargetRegisterClass * -SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { - if (!RC || !ST.needsAlignedVGPRs()) - return RC; - - unsigned Size = getRegSizeInBits(*RC); - if (Size <= 32) - return RC; - - if (RC == &AMDGPU::VS_64RegClass) - return &AMDGPU::VS_64_Align2RegClass; - - if (isVGPRClass(RC)) - return getAlignedVGPRClassForBitWidth(Size); - if (isAGPRClass(RC)) - return getAlignedAGPRClassForBitWidth(Size); - if (isVectorSuperClass(RC)) - return getAlignedVectorSuperClassForBitWidth(Size); - - return RC; -} - 
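Aside: the getDefaultVectorSuperClassForBitWidth hook introduced above only gates whether the combined AV (VGPR+AGPR) superclass is handed out by default, falling back to the plain VGPR class before gfx90a because gfx908 cannot copy directly between AGPRs (see the TODO in the hunk). A minimal, self-contained sketch of that selection logic follows; RegClassKind and pickDefaultVectorClass are illustrative stand-ins, not part of SIRegisterInfo.

// Illustrative sketch only: models the gfx90a+ gating in
// getDefaultVectorSuperClassForBitWidth with stand-in types.
#include <cstdio>

enum class RegClassKind { VGPR, AV /* VGPR+AGPR superclass */ };

// Stand-in for the subtarget query (GCNSubtarget::hasGFX90AInsts() in-tree).
static RegClassKind pickDefaultVectorClass(bool HasGFX90AInsts) {
  // gfx908 cannot copy AGPR<->AGPR directly, so the AV superclass is only
  // the default from gfx90a onwards.
  return HasGFX90AInsts ? RegClassKind::AV : RegClassKind::VGPR;
}

int main() {
  std::printf("gfx908:  %s\n",
              pickDefaultVectorClass(false) == RegClassKind::AV ? "AV" : "VGPR");
  std::printf("gfx90a+: %s\n",
              pickDefaultVectorClass(true) == RegClassKind::AV ? "AV" : "VGPR");
  return 0;
}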
ArrayRef<MCPhysReg> SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7b91ba7..2e2916f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -216,6 +216,10 @@ public: getVectorSuperClassForBitWidth(unsigned BitWidth) const; LLVM_READONLY + const TargetRegisterClass * + getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); /// \returns true if this class contains only SGPR registers @@ -285,6 +289,10 @@ public: const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const; + /// \returns An AGPR+VGPR super reg class with the same width as \p SRC + const TargetRegisterClass * + getEquivalentAVClass(const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const; @@ -338,14 +346,6 @@ public: ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const; - bool shouldCoalesce(MachineInstr *MI, - const TargetRegisterClass *SrcRC, - unsigned SubReg, - const TargetRegisterClass *DstRC, - unsigned DstSubReg, - const TargetRegisterClass *NewRC, - LiveIntervals &LIS) const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -357,7 +357,7 @@ public: const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; - const int *getRegUnitPressureSets(unsigned RegUnit) const override; + const int *getRegUnitPressureSets(MCRegUnit RegUnit) const override; MCRegister getReturnAddressReg(const MachineFunction &MF) const; @@ -391,8 +391,6 @@ public: MCRegister getExec() const; - const TargetRegisterClass *getRegClass(unsigned RCID) const; - // Find reaching register definition MachineInstr *findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, @@ -433,11 +431,6 @@ public: // the subtarget. bool isProperlyAlignedRC(const TargetRegisterClass &RC) const; - // Given \p RC returns corresponding aligned register class if required - // by the subtarget. - const TargetRegisterClass * - getProperlyAlignedRC(const TargetRegisterClass *RC) const; - /// Return all SGPR128 which satisfy the waves per execution unit requirement /// of the subtarget. ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const; @@ -495,6 +488,17 @@ public: SmallVector<StringLiteral> getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override; + + float + getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override { + // Prioritize VGPR_32_Lo256 over other classes which may occupy registers + // beyond v256. + return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) * + ((RC == &AMDGPU::VGPR_32_Lo256RegClass || + RC == &AMDGPU::VReg_64_Lo256_Align2RegClass) + ? 
2.0 + : 1.0); + } }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index fc8f46a..272d4b5 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1 // Identical to VGPR_32 except it only contains the low 256 (Lo256) registers. def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v let Size = 64; } -def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_64_XEXEC, SReg_32_XEXEC)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, @@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; } +def SReg_1_XEXEC : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0? + SReg_32_XM0_XEXEC] +>; + +def SReg_1 : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64, + SReg_64, + SReg_64, + SReg_32, + SReg_32] +>; + //===----------------------------------------------------------------------===// // // AlignTarget classes. Artifical classes to swap between @@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 // //===----------------------------------------------------------------------===// +// We have 3 orthogonal properties to consider. Unfortunately we need +// to define the cross product of these states, minus unused +// combinations. 
+ def AV_LdSt_32_Target : RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, + VGPR_32, + AV_32, + VGPR_32, + VGPR_32]>, + SIRegisterClassLike<32, true, true> { let DecoderMethod = "decodeAVLdSt"; } foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; @@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [DefaultMode_Wave64, /*unused combination*/ AVAlign2LoadStoreMode, /*Unused combination*/ /*Unused combination*/], [!cast<RegisterClass>("AReg_"#RegSize), + /*unused combination*/ !cast<RegisterClass>("AReg_"#RegSize#_Align2) + /*Unused combination*/ /*Unused combination*/]> { let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; } def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave32, + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; } def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + 
[DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("VReg_"#RegSize)]> { let DecoderMethod = "decodeAVLdSt"; } @@ -1276,11 +1323,22 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VS_64, VS_64_Align2, VS_64_Align2]> { + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> { let DecoderMethod = "decodeSrcRegOrImm9"; } + +// Special case for DS_GWS instructions. The register input is really +// 32-bit, but it needs to be even aligned on targets with a VGPR +// alignment requirement. +def AV_LdSt_32_Align2 : SIRegisterClassLike</*Bitwidth=*/32, /*VGPR=*/true, /*AGPR=*/true>, + RegClassByHwMode< + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, VGPR_32, AV_64_Align2, VReg_64_Align2, VReg_64_Align2]> { + let DecoderMethod = "decodeAVLdSt<32>"; +} + class RegImmMatcher<string name> : AsmOperandClass { let Name = name; let RenderMethod = "addRegOrImmOperands"; @@ -1533,6 +1591,17 @@ foreach size = ["64", "96", "128", "160", "256", "1024" ] in { def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>; } +def AV_LdSt_32_Align2_RegMatcher : AsmOperandClass { + let Name = "AV_LdSt_32_Align2_RegOp"; + let RenderMethod = "addRegOperands"; +} + +def AV_LdSt_32_Align2_RegOp : RegisterOperand<AV_LdSt_32_Align2> { + let ParserMatchClass = AV_LdSt_32_Align2_RegMatcher; + let PrintMethod = "printAVLdSt32Align2RegOp"; + let EncoderMethod = "getAVOperandEncoding"; +} + //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 179ecba..1b78f67 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -550,7 +550,8 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { uint32_t NewImm = 0; if (Opc == AMDGPU::S_AND_B32) { - if (isPowerOf2_32(~Imm)) { + if (isPowerOf2_32(~Imm) && + MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) { NewImm = llvm::countr_one(Imm); Opc = AMDGPU::S_BITSET0_B32; } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { @@ -558,7 +559,8 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { Opc = AMDGPU::S_ANDN2_B32; } } else if (Opc == AMDGPU::S_OR_B32) { - if (isPowerOf2_32(Imm)) { + if (isPowerOf2_32(Imm) && + MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) { NewImm = llvm::countr_zero(Imm); Opc = AMDGPU::S_BITSET1_B32; } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { @@ -584,7 +586,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { const bool IsUndef = 
SrcReg->isUndef(); const bool IsKill = SrcReg->isKill(); - MI.setDesc(TII->get(Opc)); + TII->mutateAndCleanupImplicit(MI, TII->get(Opc)); if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) { Src0->ChangeToImmediate(NewImm); diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 6611e1e..10762ed 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -188,8 +188,9 @@ private: void markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist); - void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, - unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); + void markDefs(const MachineInstr &UseMI, LiveRange &LR, + VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag, + std::vector<WorkItem> &Worklist); void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, std::vector<WorkItem> &Worklist); void markInstructionUses(const MachineInstr &MI, char Flag, @@ -318,8 +319,8 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, /// Mark all relevant definitions of register \p Reg in usage \p UseMI. void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, - Register Reg, unsigned SubReg, char Flag, - std::vector<WorkItem> &Worklist) { + VirtRegOrUnit VRegOrUnit, unsigned SubReg, + char Flag, std::vector<WorkItem> &Worklist) { LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI)); @@ -331,8 +332,9 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, // cover registers. const LaneBitmask UseLanes = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) - : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) - : LaneBitmask::getNone()); + : (VRegOrUnit.isVirtualReg() + ? MRI->getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg()) + : LaneBitmask::getNone()); // Perform a depth-first iteration of the LiveRange graph marking defs. // Stop processing of a given branch when all use lanes have been defined. 
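Aside on the markDefs change above: the walk over defining instructions only has to cover the lanes the use actually reads, which is why UseLanes is narrowed by the subregister index, or widened to the register's maximum lane mask when there is no subregister. The sketch below models that overlap test with a plain integer mask; LaneMask, useLanes, and the lane values are illustrative stand-ins for LLVM's LaneBitmask machinery, not code from the pass.

// Illustrative model of the "defined lanes cover the used lanes" test,
// using a plain uint64_t in place of llvm::LaneBitmask.
#include <cassert>
#include <cstdint>

using LaneMask = uint64_t; // stand-in for llvm::LaneBitmask

// Lanes read by the use: a subregister narrows the mask; otherwise a
// virtual register contributes its full (maximum) lane mask.
static LaneMask useLanes(LaneMask SubRegMask, LaneMask MaxVRegMask,
                         bool HasSubReg, bool IsVirtual) {
  if (HasSubReg)
    return SubRegMask;
  return IsVirtual ? MaxVRegMask : 0; // physical reg units carry no lanes here
}

int main() {
  const LaneMask Lo16 = 0x1, Hi16 = 0x2, Full = Lo16 | Hi16;

  // A use of the lo16 subregister only cares about defs touching lane 0.
  LaneMask Used = useLanes(Lo16, Full, /*HasSubReg=*/true, /*IsVirtual=*/true);
  LaneMask DefinedSoFar = 0;

  DefinedSoFar |= Hi16;                // a def of hi16 does not satisfy the use
  assert((Used & ~DefinedSoFar) != 0); // still missing lanes: keep walking defs

  DefinedSoFar |= Lo16;                // once lane 0 is defined, the walk stops
  assert((Used & ~DefinedSoFar) == 0);
  return 0;
}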
@@ -382,11 +384,11 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, MachineInstr *MI = LIS->getInstructionFromIndex(Value->def); assert(MI && "Def has no defining instruction"); - if (Reg.isVirtual()) { + if (VRegOrUnit.isVirtualReg()) { // Iterate over all operands to find relevant definitions bool HasDef = false; for (const MachineOperand &Op : MI->all_defs()) { - if (Op.getReg() != Reg) + if (Op.getReg() != VRegOrUnit.asVirtualReg()) continue; // Compute lanes defined and overlap with use @@ -453,7 +455,7 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI, << " for " << MI); if (Reg.isVirtual()) { LiveRange &LR = LIS->getInterval(Reg); - markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist); + markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist); } else { // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, @@ -462,7 +464,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI, LiveRange &LR = LIS->getRegUnit(Unit); const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); if (Value) - markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist); + markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag, + Worklist); } } } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 6489e63..ce782b0 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -211,6 +211,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, + {{"HW_REG_WAVE_SCHED_MODE"}, ID_SCHED_MODE, isGFX12Plus}, {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3e1b058..c6e061f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -729,6 +729,8 @@ bool isGenericAtomic(unsigned Opc) { Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 || Opc == AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG; } @@ -897,7 +899,7 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { } std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR, bool VOPD3) const { @@ -914,12 +916,13 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( BaseX = X; if (!BaseY) BaseY = Y; - if ((BaseX & BanksMask) == (BaseY & BanksMask)) + if ((BaseX.id() & BanksMask) == (BaseY.id() & BanksMask)) return true; if (BaseX != X /* This is 64-bit register */ && - ((BaseX + 1) & BanksMask) == (BaseY & BanksMask)) + ((BaseX.id() + 1) & BanksMask) == (BaseY.id() & BanksMask)) return true; - if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask)) + if (BaseY != Y && + (BaseX.id() & BanksMask) == 
((BaseY.id() + 1) & BanksMask)) return true; // If both are 64-bit bank conflict will be detected yet while checking @@ -968,7 +971,7 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( // if the operand is not a register or not a VGPR. InstInfo::RegIndices InstInfo::getRegIndices(unsigned CompIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const { assert(CompIdx < COMPONENTS_NUM); @@ -983,7 +986,7 @@ InstInfo::getRegIndices(unsigned CompIdx, Comp.hasRegSrcOperand(CompSrcIdx) ? GetRegIdx(CompIdx, Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3)) - : 0; + : MCRegister(); } return RegIndices; } @@ -2051,56 +2054,63 @@ unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) { return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); } -unsigned encodeFieldVmVsrc(unsigned VmVsrc) { - return encodeFieldVmVsrc(0xffff, VmVsrc); +unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVmVsrc(Encoded, VmVsrc); } unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) { return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth()); } -unsigned encodeFieldVaVdst(unsigned VaVdst) { - return encodeFieldVaVdst(0xffff, VaVdst); +unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaVdst(Encoded, VaVdst); } unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) { return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth()); } -unsigned encodeFieldSaSdst(unsigned SaSdst) { - return encodeFieldSaSdst(0xffff, SaSdst); +unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldSaSdst(Encoded, SaSdst); } unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) { return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth()); } -unsigned encodeFieldVaSdst(unsigned VaSdst) { - return encodeFieldVaSdst(0xffff, VaSdst); +unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaSdst(Encoded, VaSdst); } unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) { return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth()); } -unsigned encodeFieldVaVcc(unsigned VaVcc) { - return encodeFieldVaVcc(0xffff, VaVcc); +unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaVcc(Encoded, VaVcc); } unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) { return packBits(VaSsrc, Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth()); } -unsigned encodeFieldVaSsrc(unsigned VaSsrc) { - return encodeFieldVaSsrc(0xffff, VaSsrc); +unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaSsrc(Encoded, VaSsrc); } unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) { return packBits(HoldCnt, Encoded, getHoldCntBitShift(), getHoldCntWidth()); } -unsigned encodeFieldHoldCnt(unsigned HoldCnt) { - return encodeFieldHoldCnt(0xffff, HoldCnt); +unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return 
encodeFieldHoldCnt(Encoded, HoldCnt); } } // namespace DepCtr @@ -2697,8 +2707,8 @@ MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) { MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG } -bool isInlineValue(unsigned Reg) { - switch (Reg) { +bool isInlineValue(MCRegister Reg) { + switch (Reg.id()) { case AMDGPU::SRC_SHARED_BASE_LO: case AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_SHARED_LIMIT_LO: @@ -3361,7 +3371,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI) { const unsigned VGPRClasses[] = { AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID, @@ -3382,22 +3392,22 @@ const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, return nullptr; } -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) { +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; return Idx >> 8; } -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI) { +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx >= 0x100) - return AMDGPU::NoRegister; + return MCRegister(); const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); if (!RC) - return AMDGPU::NoRegister; + return MCRegister(); Idx |= MSBs << 8; if (RC->getID() == AMDGPU::VGPR_16RegClassID) { @@ -3438,17 +3448,42 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) { AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y, AMDGPU::OpName::vdstY}; + // VOP2 MADMK instructions use src0, imm, src1 scheme. + static const AMDGPU::OpName VOP2MADMKOps[4] = { + AMDGPU::OpName::src0, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::src1, AMDGPU::OpName::vdst}; + static const AMDGPU::OpName VOPDFMAMKOpsX[4] = { + AMDGPU::OpName::src0X, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vdstX}; + static const AMDGPU::OpName VOPDFMAMKOpsY[4] = { + AMDGPU::OpName::src0Y, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vdstY}; + unsigned TSFlags = Desc.TSFlags; if (TSFlags & (SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) { + switch (Desc.getOpcode()) { // LD_SCALE operands ignore MSB. 
-    if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 ||
-        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 ||
-        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 ||
-        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250)
+    case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32:
+    case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250:
+    case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64:
+    case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250:
       return {};
+    case AMDGPU::V_FMAMK_F16:
+    case AMDGPU::V_FMAMK_F16_t16:
+    case AMDGPU::V_FMAMK_F16_t16_gfx12:
+    case AMDGPU::V_FMAMK_F16_fake16:
+    case AMDGPU::V_FMAMK_F16_fake16_gfx12:
+    case AMDGPU::V_FMAMK_F32:
+    case AMDGPU::V_FMAMK_F32_gfx12:
+    case AMDGPU::V_FMAMK_F64:
+    case AMDGPU::V_FMAMK_F64_gfx1250:
+      return {VOP2MADMKOps, nullptr};
+    default:
+      break;
+    }
     return {VOPOps, nullptr};
   }
@@ -3464,8 +3499,11 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
   if (TSFlags & SIInstrFlags::VIMAGE)
     return {VIMGOps, nullptr};

-  if (AMDGPU::isVOPD(Desc.getOpcode()))
-    return {VOPDOpsX, VOPDOpsY};
+  if (AMDGPU::isVOPD(Desc.getOpcode())) {
+    auto [OpX, OpY] = getVOPDComponents(Desc.getOpcode());
+    return {(OpX == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsX : VOPDOpsX,
+            (OpY == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsY : VOPDOpsY};
+  }

   assert(!(TSFlags & SIInstrFlags::MIMG));
@@ -3545,8 +3583,15 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII,
 }

 unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
-  return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
-                                                                        : 128;
+  if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize32768))
+    return 64;
+  if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
+    return 128;
+  if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
+    return 320;
+  if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+    return 512;
+  return 64; // In sync with getAddressableLocalMemorySize
 }

 bool isPackedFP32Inst(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 5e3195b..3a35200 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -909,7 +909,7 @@ private:
   const ComponentInfo CompInfo[COMPONENTS_NUM];

 public:
-  using RegIndices = std::array<unsigned, Component::MAX_OPR_NUM>;
+  using RegIndices = std::array<MCRegister, Component::MAX_OPR_NUM>;

   InstInfo(const MCInstrDesc &OpX, const MCInstrDesc &OpY)
       : CompInfo{OpX, OpY} {}
@@ -932,9 +932,10 @@ public:
   // even though it violates requirement to be from different banks.
   // If \p VOPD3 is set to true both dst registers allowed to be either odd
   // or even and instruction may have real src2 as opposed to tied accumulator.
-  bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx,
-                         const MCRegisterInfo &MRI, bool SkipSrc = false,
-                         bool AllowSameVGPR = false, bool VOPD3 = false) const {
+  bool
+  hasInvalidOperand(std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
+                    const MCRegisterInfo &MRI, bool SkipSrc = false,
+                    bool AllowSameVGPR = false, bool VOPD3 = false) const {
     return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR,
                                       VOPD3)
         .has_value();
@@ -949,14 +950,14 @@ public:
   // If \p VOPD3 is set to true both dst registers allowed to be either odd
   // or even and instruction may have real src2 as opposed to tied accumulator.
   std::optional<unsigned> getInvalidCompOperandIndex(
-      std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+      std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
       const MCRegisterInfo &MRI, bool SkipSrc = false,
       bool AllowSameVGPR = false, bool VOPD3 = false) const;

 private:
   RegIndices
   getRegIndices(unsigned ComponentIdx,
-                std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+                std::function<MCRegister(unsigned, unsigned)> GetRegIdx,
                 bool VOPD3) const;
 };

@@ -1300,43 +1301,43 @@ unsigned decodeFieldVaSsrc(unsigned Encoded);
 unsigned decodeFieldHoldCnt(unsigned Encoded);

 /// \returns \p VmVsrc as an encoded Depctr immediate.
-unsigned encodeFieldVmVsrc(unsigned VmVsrc);
+unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI);

 /// \returns \p Encoded combined with encoded \p VmVsrc.
 unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc);

 /// \returns \p VaVdst as an encoded Depctr immediate.
-unsigned encodeFieldVaVdst(unsigned VaVdst);
+unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI);

 /// \returns \p Encoded combined with encoded \p VaVdst.
 unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst);

 /// \returns \p SaSdst as an encoded Depctr immediate.
-unsigned encodeFieldSaSdst(unsigned SaSdst);
+unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI);

 /// \returns \p Encoded combined with encoded \p SaSdst.
 unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst);

 /// \returns \p VaSdst as an encoded Depctr immediate.
-unsigned encodeFieldVaSdst(unsigned VaSdst);
+unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI);

 /// \returns \p Encoded combined with encoded \p VaSdst.
 unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst);

 /// \returns \p VaVcc as an encoded Depctr immediate.
-unsigned encodeFieldVaVcc(unsigned VaVcc);
+unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI);

 /// \returns \p Encoded combined with encoded \p VaVcc.
 unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc);

 /// \returns \p HoldCnt as an encoded Depctr immediate.
-unsigned encodeFieldHoldCnt(unsigned HoldCnt);
+unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI);

 /// \returns \p Encoded combined with encoded \p HoldCnt.
-unsigned encodeFieldHoldCnt(unsigned HoldCnt, unsigned Encoded);
+unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt);

 /// \returns \p VaSsrc as an encoded Depctr immediate.
-unsigned encodeFieldVaSsrc(unsigned VaSsrc);
+unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI);

 /// \returns \p Encoded combined with encoded \p VaSsrc.
 unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc);

@@ -1513,6 +1514,8 @@ constexpr inline bool isKernel(CallingConv::ID CC) {
   }
 }

+inline bool isKernel(const Function &F) { return isKernel(F.getCallingConv()); }
+
 LLVM_READNONE
 constexpr bool canGuaranteeTCO(CallingConv::ID CC) {
   return CC == CallingConv::Fast;
@@ -1599,7 +1602,7 @@ LLVM_READNONE
 MCRegister mc2PseudoReg(MCRegister Reg);

 LLVM_READNONE
-bool isInlineValue(unsigned Reg);
+bool isInlineValue(MCRegister Reg);

 /// Is this an AMDGPU specific source operand? These include registers,
 /// inline constants, literals and mandatory literals (KImm).
@@ -1798,16 +1801,16 @@ bool isIntrinsicAlwaysUniform(unsigned IntrID);

 /// \returns a register class for the physical register \p Reg if it is a VGPR
 /// or nullptr otherwise.
-const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg,
                                            const MCRegisterInfo &MRI);

 /// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the
 /// physical register \p Reg.
-unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI);
+unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI);

 /// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set.
-MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
-                          const MCRegisterInfo &MRI);
+MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs,
+                           const MCRegisterInfo &MRI);

 // Returns a table for the opcode with a given \p Desc to map the VGPR MSB
 // set by the S_SET_VGPR_MSB to one of 4 sources. In case of VOPD returns 2
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 54f57e0..1d1e959 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -513,6 +513,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16",
 defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16",
   VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16,
   VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>;
+
+let HasClamp = 0, HasOMod = 0 in {
+def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>;
+def V_TRANS_BF16_t16_Profile : VOPProfile_True16 <VOP_BF16_BF16>;
+def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 <VOP_BF16_BF16>;
+}
+
 let TRANS = 1, SchedRW = [WriteTrans32] in {
 defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
 defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
@@ -527,14 +534,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>;
 }

 let SubtargetPredicate = HasBF16TransInsts in {
-defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
-defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
-defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
-defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
-defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
-defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
-defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>;
-defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>;
+defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile,
+                                              V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                              int_amdgcn_tanh>;
+defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile,
+                                             V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                             AMDGPUrcp>;
+defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile,
+                                              V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                              any_amdgcn_sqrt>;
+defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile,
+                                             V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                             AMDGPUrsq>;
+defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile,
+                                             V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                             AMDGPUlogf16>;
+defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile,
+                                             V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                             AMDGPUexpf16>;
+defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile,
+                                             V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                             AMDGPUsin>;
+defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile,
+                                             V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                             AMDGPUcos>;
 }
 } // End TRANS = 1, SchedRW = [WriteTrans32]
 defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -593,15 +616,15 @@ let SubtargetPredicate = isGFX9Plus in {
   let isReMaterializable = 1 in
   defm V_SAT_PK_U8_I16 : VOP1Inst_t16<"v_sat_pk_u8_i16", VOP_I16_I32>;
-
-  let mayRaiseFPException = 0 in {
-    defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16",
-      VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
-    defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16",
-      VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
-  } // End mayRaiseFPException = 0
 } // End SubtargetPredicate = isGFX9Plus

+let mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts in {
+defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16",
+  VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
+defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16",
+  VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
+} // End mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts
+
 let SubtargetPredicate = isGFX9Only in {
 defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
 } // End SubtargetPredicate = isGFX9Only
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index d87d250..11ce102 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -971,7 +971,7 @@ defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_a
 } // End IsNeverUniform = 1

 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>;
-let ReadsModeReg = 0, mayRaiseFPException = 0 in {
+let ReadsModeReg = 0, mayRaiseFPException = 0, SubtargetPredicate = HasCvtPkNormVOP2Insts in {
 defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>;
 defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>;
 }
@@ -1266,14 +1266,14 @@ let Constraints = "$vdst = $src2",
   defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">;
 } // End SubtargetPredicate = HasDLInsts

-let SubtargetPredicate = HasFmaLegacy32 in {
+let SubtargetPredicate = HasFmacLegacy32 in {

 let Constraints = "$vdst = $src2",
     isConvertibleToThreeAddress = 1,
     isCommutable = 1 in
 defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;

-} // End SubtargetPredicate = HasFmaLegacy32
+} // End SubtargetPredicate = HasFmacLegacy32

 let SubtargetPredicate = HasFmacF64Inst,
     Constraints = "$vdst = $src2",
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 05ba76a..faab9f3 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -151,7 +151,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
 class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> {
   let IsSingle = 1;

-  let HasOMod = !ne(DstVT.Value, f16.Value);
+  let HasOMod = !ne(DstVT, f16);
   let HasHigh = 1;

   let HasOpSel = OpSel;
@@ -185,7 +185,8 @@ defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32",
 defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
 defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
 defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">;
-defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+let SubtargetPredicate = HasLerpInst in
+  defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
 let SchedRW = [WriteIntMul] in {
   let SubtargetPredicate = HasMadU32Inst in
@@ -258,12 +259,12 @@ defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>;
 } // End isCommutable = 1

 let isReMaterializable = 1 in {
-let mayRaiseFPException = 0 in {
+let mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts in {
 defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
 defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
 defm V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
 defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
-} // End mayRaiseFPException
+} // mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts

 defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
 defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
@@ -306,12 +307,12 @@ let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in {
   defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>;
 } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0

-let isCommutable = 1 in {
+let isCommutable = 1, SubtargetPredicate = HasSadInsts in {
   defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
   defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
   defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
   defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-} // End isCommutable = 1
+} // End isCommutable = 1, SubtargetPredicate = HasSadInsts

 defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
 defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>;
@@ -424,7 +425,8 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> {
 let SubtargetPredicate = isGFX7Plus in {
 let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
-defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
+let SubtargetPredicate = HasQsadInsts in
+  defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
 defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>;
 } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
 } // End SubtargetPredicate = isGFX7Plus
@@ -789,9 +791,6 @@ let isCommutable = 1 in {
   defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>;
 } // End isCommutable = 1

-defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
-defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
-
 defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>;

 let isReMaterializable = 1 in {
@@ -996,6 +995,11 @@ def : GCNPat<(DivergentBinFrag<or> (or_oneuse i64:$src0, i64:$src1), i64:$src2),
 } // End SubtargetPredicate = isGFX9Plus

+let SubtargetPredicate = HasCvtPkNormVOP3Insts in {
+  defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
+  defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
+} // end SubtargetPredicate = HasCvtPkNormVOP3Insts
+
 // FIXME: Probably should hardcode clamp bit in pseudo and avoid this.
 class OpSelBinOpClampPat<SDPatternOperator node,
                          Instruction inst> : GCNPat<
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 4ae2c1e..2dfa905 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1364,16 +1364,10 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> {
   field bit is_wmma_xdl;
 }

-def WMMAOpcode : GenericEnum {
-  let FilterClass = "VOP3P_Pseudo";
-}
-
 class WMMAMappingTable : GenericTable {
   let FilterClass = "WMMAOpcodeMapping";
   let CppTypeName = "WMMAOpcodeMappingInfo";
   let Fields = ["Opcode2Addr", "Opcode3Addr"];
-  string TypeOf_Opcode2Addr = "WMMAOpcode";
-  string TypeOf_Opcode3Addr = "WMMAOpcode";
 }

 def WMMAOpcode2AddrMappingTable : WMMAMappingTable {
@@ -1707,7 +1701,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
   defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
   defvar WMMAConstraints3Addr = "@earlyclobber $vdst";

-  let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+  let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in {
     let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
     def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
       let PseudoInstr = Instr#PseudoInstrSuffix;
@@ -1734,7 +1728,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P
     let mayRaiseFPException = 0;
     let ReadsModeReg = 0;
     let AsmMatchConverter = "cvtSWMMAC";
-
+    let isConvergent = 1;
     let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
   }
 }
@@ -1906,8 +1900,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16
 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
 } // End is_wmma_xdl = 1.
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+let isConvergent = 1 in {
+  defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
+  defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
+}

 } // End SubtargetPredicate = isGFX125xOnly
 } // End WaveSizePredicate = isWave32
@@ -2216,7 +2212,7 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
   let Inst{23-16} = LdScaleOp;
   let Inst{40-32} = scale_src0;
   let Inst{49-41} = scale_src1;
-  let Inst{58-50} = 0; // scale src2
+  let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy)
   let Inst{59} = matrix_b_scale{0};               // scale_op_sel_hi(0)
   let Inst{60} = 0;                               // scale_op_sel_hi(1)
   let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo)
@@ -2431,6 +2427,15 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
                                       string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
   VOP3P_Real_with_name<GFX12Gen, op, backing_ps_name, asmName>;

+multiclass VOP3P_Real_LD_SCALE_gfx1250<bits<8> op> {
+  defvar ps = !cast<VOP3P_Pseudo>(NAME);
+  def _gfx1250 :
+    VOP3P_Real_Gen<ps, GFX1250Gen, ps.Mnemonic>,
+    VOP3Pe_gfx11_gfx12<op, ps.Pfl> {
+    let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy)
+  }
+}
+
 defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
 defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
@@ -2460,8 +2465,8 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
 defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
 defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>;

-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>;
+defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_LD_SCALE_gfx1250<0x35>;
+defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_LD_SCALE_gfx1250<0x3a>;

 let AssemblerPredicate = isGFX1250Plus in
 def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 2730ec5..a829b80 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1233,18 +1233,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
 // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith()
 // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place.
 multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt,
                          dag dstInst = (inst $src0, $src1)> {
-  let WaveSizePredicate = isWave64 in
   def : GCNPat <
-    (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
-    (i64 (COPY_TO_REGCLASS dstInst, SReg_64))
+    (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+    dstInst
   >;

   let WaveSizePredicate = isWave32 in {
-    def : GCNPat <
-      (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
-      (i32 (COPY_TO_REGCLASS dstInst, SReg_32))
-    >;
-
     // Support codegen of i64 setcc in wave32 mode.
     def : GCNPat <
       (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8325c62..ea3edb8 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1357,8 +1357,12 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> :

 class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
   dag src0 = !if(P.HasOMod,
-    (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
-    (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
+    !if(P.HasClamp,
+        (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+        (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)),
+    !if(P.HasClamp,
+        (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+        (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers)));

   list<dag> ret3 = [(set P.DstVT:$vdst,
     (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
@@ -2204,12 +2208,12 @@ include "VOP3PInstructions.td"
 include "VOPDInstructions.td"

 class ClassPat<Instruction inst, ValueType vt> : GCNPat <
-  (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)),
+  (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))),
   (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask))
 >;

 class ClassPat_t16<Instruction inst, ValueType vt> : GCNPat <
-  (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)),
+  (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))),
   (inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask))
 >;
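Usage sketch (not part of the patch): the DepCtr hunks in AMDGPUBaseInfo above change the single-field encode helpers so the starting "no wait" value comes from getDefaultDepCtrEncoding(STI) instead of the old hardcoded 0xffff. A minimal caller could look like the following; buildDepCtrImm and the chosen field values are hypothetical, only the AMDGPU::DepCtr helpers shown in the diff are assumed.

#include "Utils/AMDGPUBaseInfo.h"

using namespace llvm;

// Hypothetical helper: build an s_waitcnt_depctr immediate that waits for
// VA_VDST == 0, leaving every other field at its subtarget default.
static unsigned buildDepCtrImm(const MCSubtargetInfo &STI) {
  // Single-field overload: the default encoding is now derived from the
  // subtarget via getDefaultDepCtrEncoding(STI).
  unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(/*VaVdst=*/0, STI);
  // The two-argument overloads still layer further fields onto an
  // existing encoding.
  Enc = AMDGPU::DepCtr::encodeFieldSaSdst(Enc, /*SaSdst=*/0);
  return Enc;
}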
