diff options
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 38
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11
-rw-r--r-- | llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 4
-rw-r--r-- | llvm/lib/Target/TargetMachine.cpp | 2
4 files changed, 20 insertions, 35 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 92a587b..280fbe2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(createLoadStoreVectorizerPass()); + if (TM->getTargetTriple().isAMDGCN()) { // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). 
addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the @@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(new DummyCGSCCPass()); } - TargetPassConfig::addCodeGenPrepare(); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(createLoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by @@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { if (EnableLowerKernelArguments) addPass(AMDGPULowerKernelArgumentsPass(TM)); + Base::addCodeGenPrepare(addPass); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(LoadStoreVectorizerPass()); + // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). // It could be placed anywhere before uniformity annotations (an analysis @@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). 
addPass(AMDGPULowerBufferFatPointersPass(TM)); addPass.requireCGSCCOrder(); addPass(AMDGPULowerIntrinsicsPass(TM)); - Base::addCodeGenPrepare(addPass); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 77df721..54f57e0 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>; } let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in { - defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16, - VOPProfile_CVT_F32_BF16_gfx1250_t16, - VOPProfile_CVT_F32_BF16_gfx1250_fake16>; + let True16Predicate = UseRealTrue16Insts in + defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>; + let True16Predicate = UseFakeTrue16Insts in + defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>; } let ReadsModeReg = 0, mayRaiseFPException = 0 in { @@ -899,6 +900,7 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = let DecoderNamespace = Gen.DecoderNamespace; let OtherPredicates = !listconcat(ps.OtherPredicates, !if(p.HasExt64BitDPP, [HasDPALU_DPP], [])); + let True16Predicate = ps.True16Predicate; } class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -921,6 +923,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf VOP1_DPP8<op, 
ps, p> { let AssemblerPredicate = Gen.AssemblerPredicate; let DecoderNamespace = Gen.DecoderNamespace; + let True16Predicate = ps.True16Predicate; } //===----------------------------------------------------------------------===// @@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>; -defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; +defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">; defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index d4124ae..ee25f69 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -3139,8 +3139,8 @@ bool RISCVTTIImpl::isProfitableToSinkOperands( bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>( m_Value(), m_Value(), m_Value())); if (!IsVPSplat && - !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_ZeroMask()))) + !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()), + m_Value(), m_ZeroMask()))) continue; // Don't sink i1 splats. 
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index ad7e503..cf85691 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -27,7 +27,7 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 using namespace llvm;
 
-cl::opt<bool> NoKernelInfoEndLTO(
+cl::opt<bool> llvm::NoKernelInfoEndLTO(
     "no-kernel-info-end-lto",
     cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"),
     cl::init(false), cl::Hidden);