diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 38 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 63 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11 |
8 files changed, 28 insertions, 108 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index f01d5f6..6efa78e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -608,6 +608,8 @@ public: ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] : EmptySet; + const size_t HybridModuleRootKernelsSize = HybridModuleRootKernels.size(); + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { // Each iteration of this loop assigns exactly one global variable to // exactly one of the implementation strategies. @@ -647,7 +649,8 @@ public: ModuleScopeVariables.insert(GV); } else if (K.second.size() == 1) { KernelAccessVariables.insert(GV); - } else if (set_is_subset(K.second, HybridModuleRootKernels)) { + } else if (K.second.size() == HybridModuleRootKernelsSize && + set_is_subset(K.second, HybridModuleRootKernels)) { ModuleScopeVariables.insert(GV); } else { TableLookupVariables.insert(GV); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 92a587b..280fbe2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(createLoadStoreVectorizerPass()); + if (TM->getTargetTriple().isAMDGCN()) { // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the @@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(new DummyCGSCCPass()); } - TargetPassConfig::addCodeGenPrepare(); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(createLoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by @@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { if (EnableLowerKernelArguments) addPass(AMDGPULowerKernelArgumentsPass(TM)); + Base::addCodeGenPrepare(addPass); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(LoadStoreVectorizerPass()); + // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). // It could be placed anywhere before uniformity annotations (an analysis @@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(AMDGPULowerBufferFatPointersPass(TM)); addPass.requireCGSCCOrder(); addPass(AMDGPULowerIntrinsicsPass(TM)); - Base::addCodeGenPrepare(addPass); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fab78a9..bdc0810 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -29,6 +29,7 @@ #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" @@ -1633,64 +1634,6 @@ void GCNSchedStage::revertScheduling() { DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); } -bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, - SlotIndex OriginalIdx, - SlotIndex RematIdx) const { - - LiveIntervals *LIS = DAG.LIS; - MachineRegisterInfo &MRI = DAG.MRI; - OriginalIdx = OriginalIdx.getRegSlot(true); - RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true)); - for (const MachineOperand &MO : InstToRemat->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) - continue; - - if (!MO.getReg().isVirtual()) { - // Do not attempt to reason about PhysRegs - // TODO: better analysis of PhysReg livness - if (!DAG.MRI.isConstantPhysReg(MO.getReg()) && - !DAG.TII->isIgnorableUse(MO)) - return false; - - // Constant PhysRegs and IgnorableUses are okay - continue; - } - - LiveInterval &LI = LIS->getInterval(MO.getReg()); - const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx); - assert(OVNI); - - // Don't allow rematerialization immediately after the original def. - // It would be incorrect if InstToRemat redefines the register. - // See PR14098. - if (SlotIndex::isSameInstr(OriginalIdx, RematIdx)) - return false; - - if (OVNI != LI.getVNInfoAt(RematIdx)) - return false; - - // Check that subrange is live at RematIdx. - if (LI.hasSubRanges()) { - const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); - unsigned SubReg = MO.getSubReg(); - LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) - : MRI.getMaxLaneMaskForVReg(MO.getReg()); - for (LiveInterval::SubRange &SR : LI.subranges()) { - if ((SR.LaneMask & LM).none()) - continue; - if (!SR.liveAt(RematIdx)) - return false; - - // Early exit if all used lanes are checked. No need to continue. - LM &= ~SR.LaneMask; - if (LM.none()) - break; - } - } - } - return true; -} - bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { const Function &F = MF.getFunction(); @@ -1812,9 +1755,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { // Do not rematerialize an instruction it it uses registers that aren't // available at its use. This ensures that we are not extending any live // range while rematerializing. - SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI); SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true); - if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx)) + if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI, + *DAG.TII)) continue; REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 06b9b64..8ea4267 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -496,12 +496,6 @@ private: /// stage to their pre-stage values. void finalizeGCNSchedStage() override; - /// \p Returns true if all the uses in \p InstToRemat defined at \p - /// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual - /// reg uses. - bool allUsesAvailableAt(const MachineInstr *InstToRemat, - SlotIndex OriginalIdx, SlotIndex RematIdx) const; - public: bool initGCNSchedStage() override; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 7c5d4fc..e4b3528 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -924,6 +924,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::SGPRSpill: return true; case TargetStackID::ScalableVector: + case TargetStackID::ScalablePredicateVector: case TargetStackID::WasmLocal: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1653008..f7265c5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -64,14 +64,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -// TODO: This option should be removed once we switch to always using PTRADD in -// the SelectionDAG. -static cl::opt<bool> UseSelectionDAGPTRADD( - "amdgpu-use-sdag-ptradd", cl::Hidden, - cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " - "SelectionDAG ISel"), - cl::init(false)); - static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); @@ -11466,7 +11458,7 @@ static bool isNoUnsignedWrap(SDValue Addr) { bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { - return UseSelectionDAGPTRADD && PtrVT == MVT::i64; + return PtrVT == MVT::i64; } bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F, diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index b3fd8c7..84287b6 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -352,10 +352,12 @@ def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">; } // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] +let Defs = [SCC] in { def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32", [(set i32:$sdst, (int_amdgcn_s_quadmask i32:$src0))]>; def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64", [(set i64:$sdst, (int_amdgcn_s_quadmask i64:$src0))]>; +} let Uses = [M0] in { def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 77df721..54f57e0 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>; } let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in { - defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16, - VOPProfile_CVT_F32_BF16_gfx1250_t16, - VOPProfile_CVT_F32_BF16_gfx1250_fake16>; + let True16Predicate = UseRealTrue16Insts in + defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>; + let True16Predicate = UseFakeTrue16Insts in + defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>; } let ReadsModeReg = 0, mayRaiseFPException = 0 in { @@ -899,6 +900,7 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = let DecoderNamespace = Gen.DecoderNamespace; let OtherPredicates = !listconcat(ps.OtherPredicates, !if(p.HasExt64BitDPP, [HasDPALU_DPP], [])); + let True16Predicate = ps.True16Predicate; } class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -921,6 +923,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf VOP1_DPP8<op, ps, p> { let AssemblerPredicate = Gen.AssemblerPredicate; let DecoderNamespace = Gen.DecoderNamespace; + let True16Predicate = ps.True16Predicate; } //===----------------------------------------------------------------------===// @@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>; -defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; +defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">; defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; |