Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPU.td                  |   3
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp       |   4
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp |  14
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp    |   9
-rw-r--r--   llvm/lib/Target/AMDGPU/DSInstructions.td          |  20
-rw-r--r--   llvm/lib/Target/AMDGPU/FLATInstructions.td        | 130
-rw-r--r--   llvm/lib/Target/AMDGPU/SIISelLowering.cpp         |  76
-rw-r--r--   llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp  |  22
-rw-r--r--   llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h    |  10
-rw-r--r--   llvm/lib/Target/AMDGPU/VOP3PInstructions.td       |  70
10 files changed, 222 insertions, 136 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1a697f7..502a8e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2592,6 +2592,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
 // FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
 // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;

+def UseTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && "
+                                                "!Subtarget->d16PreservesUnusedBits()">;
+
 def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">,
   AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>;
 def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index cb49936..65d049e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1504,7 +1504,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
     A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
-    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
     CallingConv::ID CC = F->getCallingConv();
     if (!AMDGPU::isEntryFunctionCC(CC)) {
       A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
@@ -1515,6 +1514,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     if (!F->isDeclaration() && ST.hasClusters())
       A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));

+    if (ST.hasGFX90AInsts())
+      A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
+
     for (auto &I : instructions(F)) {
       Value *Ptr = nullptr;
       if (auto *LI = dyn_cast<LoadInst>(&I))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 848d9a5..557d87f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5043,6 +5043,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
   case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
   case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: {
+    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    unsigned MinNumRegsRequired = DstSize / 32;
+
     // Default for MAI intrinsics.
     // srcC can also be an immediate which can be folded later.
     // FIXME: Should we eventually add an alternative mapping with AGPR src
@@ -5051,29 +5054,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     // vdst, srcA, srcB, srcC
     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
     OpdsMapping[0] =
-        Info->mayNeedAGPRs()
+        Info->getMinNumAGPRs() >= MinNumRegsRequired
            ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
            : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
     OpdsMapping[4] =
-        Info->mayNeedAGPRs()
+        Info->getMinNumAGPRs() >= MinNumRegsRequired
            ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
            : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
     break;
   }
   case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
   case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
+    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    unsigned MinNumRegsRequired = DstSize / 32;
+
     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
     OpdsMapping[0] =
-        Info->mayNeedAGPRs()
+        Info->getMinNumAGPRs() >= MinNumRegsRequired
            ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
            : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
     OpdsMapping[4] =
-        Info->mayNeedAGPRs()
+        Info->getMinNumAGPRs() >= MinNumRegsRequired
            ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
            : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 280fbe2..723d07e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -929,8 +929,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
             ThinOrFullLTOPhase Phase) {
        if (Level != OptimizationLevel::O0) {
          if (!isLTOPreLink(Phase)) {
-            AMDGPUAttributorOptions Opts;
-            MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+            if (getTargetTriple().isAMDGCN()) {
+              AMDGPUAttributorOptions Opts;
+              MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+            }
          }
        }
      });
@@ -1296,7 +1298,8 @@ void AMDGPUPassConfig::addIRPasses() {
   if (LowerCtorDtor)
     addPass(createAMDGPUCtorDtorLoweringLegacyPass());

-  if (isPassEnabled(EnableImageIntrinsicOptimizer))
+  if (TM.getTargetTriple().isAMDGCN() &&
+      isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

   // This can be disabled by passing ::Disable here or on the command line
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index b2ff5a1..18582ed 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -951,6 +951,11 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat
   (inst $ptr, Offset:$offset, (i1 gds))
 >;

+class DSReadPat_t16 <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
+  (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
+  (EXTRACT_SUBREG (inst $ptr, Offset:$offset, (i1 gds)), lo16)
+>;
+
 multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {

   let OtherPredicates = [LDSRequiresM0Init] in {
@@ -968,13 +973,14 @@ multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
     def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
   }

-  let OtherPredicates = [NotLDSRequiresM0Init] in {
-    let True16Predicate = NotUseRealTrue16Insts in {
-      def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
-    }
-    let True16Predicate = UseRealTrue16Insts in {
-      def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
-    }
+  let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = NotUseRealTrue16Insts in {
+    def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+  }
+  let OtherPredicates = [NotLDSRequiresM0Init, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
+    def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
+  }
+  let OtherPredicates = [NotLDSRequiresM0Init], True16Predicate = UseTrue16WithSramECC in {
+    def : DSReadPat_t16<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
   }
 }

diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 5a22b23..e86816d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1383,6 +1383,11 @@ class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
   (inst $vaddr, $offset, (i32 0))
 >;

+class FlatLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (FlatOffset i64:$vaddr, i32:$offset))),
+  (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
 class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
   (inst $vaddr, $offset, 0, $in)
@@ -1393,6 +1398,11 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value
   (inst $vaddr, $offset, (i32 0))
 >;

+class FlatSignedLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
+  (EXTRACT_SUBREG (inst $vaddr, $offset, (i32 0)), lo16)
+>;
+
 class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)),
   (inst $saddr, $voffset, $offset, $cpol, $in)
@@ -1408,6 +1418,11 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
   (inst $saddr, $voffset, $offset, $cpol)
 >;

+class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+  (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
+>;
+
 class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
   (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0),
   (inst $dsaddr, $vaddr, $offset, $cpol)
@@ -1443,6 +1458,11 @@ class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Valu
   (inst $saddr, $voffset, $offset, $cpol)
 >;

+class GlobalLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+  (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
+>;
+
 class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
   (inst $vaddr, $offset)
@@ -1625,6 +1645,11 @@ class ScratchLoadSignedPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
   (inst $vaddr, $offset, 0)
 >;

+class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
+  (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
 class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)),
   (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
@@ -1645,6 +1670,11 @@ class ScratchLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Val
   (inst $saddr, $offset, 0)
 >;

+class ScratchLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))),
+  (EXTRACT_SUBREG (inst $saddr, $offset), lo16)
+>;
+
 class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset)),
@@ -1672,6 +1702,11 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
   (inst $vaddr, $saddr, $offset, $cpol)
 >;

+class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+  (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16)
+>;
+
 multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> {
   def : FlatLoadLDSSignedPat_M0 <inst, node> {
     let AddedComplexity = 10;
@@ -1764,6 +1799,16 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value
   }
 }

+multiclass GlobalFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : FlatSignedLoadPat_t16<inst, node, vt> {
+    let AddedComplexity = 10;
+  }
+
+  def : GlobalLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 11;
+  }
+}
+
 multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
                                ValueType vt> {
   def : FlatStoreSignedPat <inst, node, vt> {
@@ -1872,8 +1917,8 @@ multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
   }
 }

-multiclass ScratchFLATStorePats_t16<string inst, SDPatternOperator node,
-                                    ValueType vt> {
+multiclass ScratchFLATStorePats_D16_t16<string inst, SDPatternOperator node,
+                                        ValueType vt> {
   def : ScratchStoreSignedPat <!cast<FLAT_Pseudo>(inst#"_t16"), node, vt> {
     let AddedComplexity = 25;
   }
@@ -1918,6 +1963,21 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
   }
 }

+multiclass ScratchFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchLoadSignedPat_t16 <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+
+  def : ScratchLoadSVaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+    let SubtargetPredicate = HasFlatScratchSVSMode;
+    let AddedComplexity = 27;
+  }
+}
+
 multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
   def : FlatLoadPat <inst, node, vt> {
     let OtherPredicates = [HasFlatAddressSpace];
@@ -1947,6 +2007,17 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT
   }
 }

+multiclass FlatLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : FlatLoadPat_t16 <inst, node, vt> {
+    let OtherPredicates = [HasFlatAddressSpace];
+  }
+
+  def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 9;
+    let SubtargetPredicate = HasFlatGVSMode;
+  }
+}
+
 multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
   def : FlatStorePat <inst, node, vt> {
     let OtherPredicates = [HasFlatAddressSpace];
@@ -1997,6 +2068,17 @@ let True16Predicate = NotUseRealTrue16Insts in {
  defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
 }

+let True16Predicate = UseTrue16WithSramECC in {
+  defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, load_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+  defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+}
+
 let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
  defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
  defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
@@ -2006,11 +2088,14 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
  defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
  defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
  defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
  defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
  defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
  defm : FlatStorePats_t16 <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
  defm : FlatStorePats_t16 <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
-} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+}

 defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
 defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
@@ -2140,6 +2225,20 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
 }

+let True16Predicate = UseTrue16WithSramECC in {
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SSHORT, atomic_load_sext_16_global, i32>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, load_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
+defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
+}
+
 let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
 defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
 defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
@@ -2192,6 +2291,13 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
 }

+let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = UseRealTrue16Insts in {
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", store_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
+defm : GlobalFLATStorePats_D16_t16 <"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
+}
+
 let OtherPredicates = [HasD16LoadStore] in {
 defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
@@ -2362,14 +2468,24 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
 defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
 }

-let True16Predicate = UseRealTrue16Insts in {
+let True16Predicate = UseTrue16WithSramECC in {
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_USHORT, load_private, i16>;
+}
+
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
 defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
 defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
 defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
 defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
-} // End True16Predicate = UseRealTrue16Insts
+} // End OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
+defm : ScratchFLATStorePats_D16_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
+}

 foreach vt = Reg32Types.types in {
 defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e233457..1a686a9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17346,74 +17346,24 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   MachineFunction *MF = MI.getParent()->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
-  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

   if (TII->isVOP3(MI.getOpcode())) {
     // Make sure constant bus requirements are respected.
     TII->legalizeOperandsVOP3(MRI, MI);

-    // Prefer VGPRs over AGPRs in mAI instructions where possible.
-    // This saves a chain-copy of registers and better balance register
-    // use between vgpr and agpr as agpr tuples tend to be big.
-    if (!MI.getDesc().operands().empty()) {
-      unsigned Opc = MI.getOpcode();
-      bool HasAGPRs = Info->mayNeedAGPRs();
-      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
-      int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
-      for (auto I :
-           {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
-            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
-        if (I == -1)
-          break;
-        if ((I == Src2Idx) && (HasAGPRs))
-          break;
-        MachineOperand &Op = MI.getOperand(I);
-        if (!Op.isReg() || !Op.getReg().isVirtual())
-          continue;
-        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
-        if (!TRI->hasAGPRs(RC))
-          continue;
-        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
-        if (!Src || !Src->isCopy() ||
-            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
-          continue;
-        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
-        // All uses of agpr64 and agpr32 can also accept vgpr except for
-        // v_accvgpr_read, but we do not produce agpr reads during selection,
-        // so no use checks are needed.
-        MRI.setRegClass(Op.getReg(), NewRC);
-      }
-
-      if (TII->isMAI(MI)) {
-        // The ordinary src0, src1, src2 were legalized above.
-        //
-        // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
-        // as a separate instruction.
-        int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                                 AMDGPU::OpName::scale_src0);
-        if (Src0Idx != -1) {
-          int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                                   AMDGPU::OpName::scale_src1);
-          if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
-              TII->usesConstantBus(MRI, MI, Src1Idx))
-            TII->legalizeOpWithMove(MI, Src1Idx);
-        }
-      }
-
-      if (!HasAGPRs)
-        return;
-
-      // Resolve the rest of AV operands to AGPRs.
-      if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
-        if (Src2->isReg() && Src2->getReg().isVirtual()) {
-          auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
-          if (TRI->isVectorSuperClass(RC)) {
-            auto *NewRC = TRI->getEquivalentAGPRClass(RC);
-            MRI.setRegClass(Src2->getReg(), NewRC);
-            if (Src2->isTied())
-              MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
-          }
-        }
+    if (TII->isMAI(MI)) {
+      // The ordinary src0, src1, src2 were legalized above.
+      //
+      // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
+      // as a separate instruction.
+      int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                               AMDGPU::OpName::scale_src0);
+      if (Src0Idx != -1) {
+        int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                 AMDGPU::OpName::scale_src1);
+        if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
+            TII->usesConstantBus(MRI, MI, Src1Idx))
+          TII->legalizeOpWithMove(MI, Src1Idx);
       }
     }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 908d856..b398db4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -33,17 +33,20 @@ using namespace llvm;
 // optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
 // where it is better to produce the VGPR form (e.g. if there are VGPR users
 // of the MFMA result).
-static cl::opt<bool> MFMAVGPRForm(
-    "amdgpu-mfma-vgpr-form", cl::Hidden,
+static cl::opt<bool, true> MFMAVGPRFormOpt(
+    "amdgpu-mfma-vgpr-form",
     cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
              "unspecified, default to compiler heuristics"),
-    cl::init(false));
+    cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false),
+    cl::Hidden);

 const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
   const SITargetLowering *TLI = STI->getTargetLowering();
   return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
 }

+bool SIMachineFunctionInfo::MFMAVGPRForm = false;
+
 SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                              const GCNSubtarget *STI)
     : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
@@ -81,14 +84,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
     PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
   }

-  MayNeedAGPRs = ST.hasMAIInsts();
   if (ST.hasGFX90AInsts()) {
-    // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
-    // should be separated from availability of AGPRs
-    if (MFMAVGPRForm ||
-        (ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
-         !mayUseAGPRs(F)))
-      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+    // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
+    // allocation granule and clamping.
+    auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
+        AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
+                                        /*OnlyFirstRequired=*/true);
+    MinNumAGPRs = MinNumAGPRAttr;
   }

   if (AMDGPU::isChainCC(CC)) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 4560615..b7dbb59 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -509,7 +509,9 @@ private:
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;

-  bool MayNeedAGPRs : 1;
+  /// Minimum number of AGPRs required to allocate in the function. Only
+  /// relevant for gfx90a-gfx950. For gfx908, this should be infinite.
+  unsigned MinNumAGPRs = ~0u;

   // The hard-wired high half of the address of the global information table
   // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
@@ -537,6 +539,8 @@ private:
   void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;

 public:
+  static bool MFMAVGPRForm;
+
   struct VGPRSpillToAGPR {
     SmallVector<MCPhysReg, 32> Lanes;
     bool FullyAllocated = false;
@@ -1196,9 +1200,7 @@ public:

   unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; }

-  bool mayNeedAGPRs() const {
-    return MayNeedAGPRs;
-  }
+  unsigned getMinNumAGPRs() const { return MinNumAGPRs; }

   // \returns true if a function has a use of AGPRs via inline asm or
   // has a call which may use it.
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 5daf860..3a0cc35 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -67,7 +67,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
 class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR>
     : VOP3P_Mix_Profile<P, Features, 0> {
   let IsTrue16 = 1;
-  let IsRealTrue16 = 1; 
+  let IsRealTrue16 = 1;
   let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
 }

@@ -950,7 +950,7 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> :
 }

 // Currently assumes scaled instructions never have abid
-class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag <
+class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : PatFrag <
   !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
                    node:$src0_modifiers, node:$scale_src0,
                    node:$src1_modifiers, node:$scale_src1),
@@ -959,37 +959,30 @@ class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled =
                             (ops node:$blgp))),
   !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1),
               !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
-                           (Op $src0, $src1, $src2, $cbsz, $blgp))),
-  pred
->;
-
-defvar MayNeedAGPRs = [{
-  return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
-}];
-
-defvar MayNeedAGPRs_gisel = [{
-  return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
-}];
+                           (Op $src0, $src1, $src2, $cbsz, $blgp)))>;

-defvar MayNotNeedAGPRs = [{
-  return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
-}];
+class CanUseAGPR_MAI<ValueType vt> {
+  code PredicateCode = [{
+    return !Subtarget->hasGFX90AInsts() ||
+           (!SIMachineFunctionInfo::MFMAVGPRForm &&
+            MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
+  }] # !srl(vt.Size, 5) # ");";

-defvar MayNotNeedAGPRs_gisel = [{
-  return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
-}];
+  code GISelPredicateCode = [{
+    return !Subtarget->hasGFX90AInsts() ||
+           (!SIMachineFunctionInfo::MFMAVGPRForm &&
+            MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
+  }] # !srl(vt.Size, 5) # ");";
+}

-class AgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
+class AgprMAIFrag<SDPatternOperator Op, ValueType vt, bit HasAbid = true,
                   bit Scaled = false> :
-    MAIFrag<Op, MayNeedAGPRs, HasAbid, Scaled> {
-  let GISelPredicateCode = MayNeedAGPRs_gisel;
-}
+    MAIFrag<Op, HasAbid, Scaled>,
+    CanUseAGPR_MAI<vt>;

 class VgprMAIFrag<SDPatternOperator Op, bit HasAbid = true,
-                   bit Scaled = false> :
-    MAIFrag<Op, MayNotNeedAGPRs, HasAbid, Scaled> {
-  let GISelPredicateCode = MayNotNeedAGPRs_gisel;
-}
+                  bit Scaled = false> :
+    MAIFrag<Op, HasAbid, Scaled>;

 let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
   defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
@@ -1037,16 +1030,19 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
                    bit HasAbid = true, bit Scaled = false> {
   defvar NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap;

+  defvar ProfileAGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P);
+  defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD");
+
   let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
     let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
-      def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
-                         !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
+      def _e64 : MAIInst<OpName, ProfileAGPR,
+                         !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>,
                  MFMATable<0, "AGPR", NAME # "_e64">;

       let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in
-      def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
+      def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", ProfileVGPR,
                                 !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
                         MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">;
     }
@@ -1055,12 +1051,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
     let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
         isConvertibleToThreeAddress = NoDstOverlap,
         Mnemonic = OpName in {
-      def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P),
-                               !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
+      def "_mac_e64" : MAIInst<OpName # "_mac", ProfileAGPR,
+                               !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>,
                        MFMATable<1, "AGPR", NAME # "_e64", NAME # "_mac_e64">;

       let OtherPredicates = [isGFX90APlus] in
-      def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"),
+      def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", ProfileVGPR,
                                     !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>,
                             MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">;
     }
@@ -1074,11 +1070,11 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper
   defvar UnscaledOpName = UnscaledOpName_#VariantSuffix;

   defvar HasAbid = false;
-
-  defvar NoDstOverlap = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl).NoDstOverlap;
+  defvar Profile = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl);
+  defvar NoDstOverlap = Profile.NoDstOverlap;

   def _e64 : ScaledMAIInst<OpName,
-    !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, HasAbid, true>)>,
+    !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, Profile.DstVT, HasAbid, true>)>,
     MFMATable<0, "AGPR", NAME # "_e64">;

   def _vgprcd_e64 : ScaledMAIInst<OpName # "_vgprcd",
@@ -1090,7 +1086,7 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper
       isConvertibleToThreeAddress = NoDstOverlap,
       Mnemonic = UnscaledOpName_ in {
     def _mac_e64 : ScaledMAIInst<OpName # "_mac",
-      !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, HasAbid, true>>,
+      !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, Profile.DstVT, HasAbid, true>>,
      MFMATable<1, "AGPR", NAME # "_e64">;

     def _mac_vgprcd_e64 : ScaledMAIInst<OpName # " _mac_vgprcd",