Diffstat (limited to 'llvm/lib/Target'):

 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp     | 38 ++++++++++----------------------------
 llvm/lib/Target/AMDGPU/VOP1Instructions.td         | 11 +++++++----
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |  4 ++--
 llvm/lib/Target/TargetMachine.cpp                  |  2 +-
 4 files changed, 20 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b..280fbe2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
+ TargetPassConfig::addCodeGenPrepare();
+
+ if (isPassEnabled(EnableLoadStoreVectorizer))
+ addPass(createLoadStoreVectorizerPass());
+
if (TM->getTargetTriple().isAMDGCN()) {
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
@@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
- //
- // FIXME: This should ideally be put after the LoadStoreVectorizer.
- // However, due to some annoying facts about ResourceUsageAnalysis,
- // (especially as exercised in the resource-usage-dead-function test),
- // we need all the function passes codegenprepare all the way through
- // said resource usage analysis to run on the call graph produced
- // before codegenprepare runs (because codegenprepare will knock some
- // nodes out of the graph, which leads to function-level passes not
- // being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
addPass(createAMDGPULowerIntrinsicsLegacyPass());
// In accordance with the above FIXME, manually force all the
@@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
addPass(new DummyCGSCCPass());
}
- TargetPassConfig::addCodeGenPrepare();
-
- if (isPassEnabled(EnableLoadStoreVectorizer))
- addPass(createLoadStoreVectorizerPass());
-
// LowerSwitch pass may introduce unreachable blocks that can
// cause unexpected behavior for subsequent passes. Placing it
// here seems better that these blocks would get cleaned up by
@@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
if (EnableLowerKernelArguments)
addPass(AMDGPULowerKernelArgumentsPass(TM));
+ Base::addCodeGenPrepare(addPass);
+
+ if (isPassEnabled(EnableLoadStoreVectorizer))
+ addPass(LoadStoreVectorizerPass());
+
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
// It could be placed anywhere before uniformity annotations (an analysis
@@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
- //
- // FIXME: This should ideally be put after the LoadStoreVectorizer.
- // However, due to some annoying facts about ResourceUsageAnalysis,
- // (especially as exercised in the resource-usage-dead-function test),
- // we need all the function passes codegenprepare all the way through
- // said resource usage analysis to run on the call graph produced
- // before codegenprepare runs (because codegenprepare will knock some
- // nodes out of the graph, which leads to function-level passes not
- // being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));
addPass.requireCGSCCOrder();
addPass(AMDGPULowerIntrinsicsPass(TM));
- Base::addCodeGenPrepare(addPass);
-
- if (isPassEnabled(EnableLoadStoreVectorizer))
- addPass(LoadStoreVectorizerPass());
-
// LowerSwitch pass may introduce unreachable blocks that can cause unexpected
// behavior for subsequent passes. Placing it here seems better that these
// blocks would get cleaned up by UnreachableBlockElim inserted next in the
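The reordering above works because TargetPassConfig::addCodeGenPrepare() is a template-method hook: the target override chooses where the generic codegen-prepare passes land relative to its own lowerings, so calling the base hook earlier moves the LoadStoreVectorizer ahead of the buffer-fat-pointer lowering. A minimal, LLVM-free sketch of that pattern (all class and pass names below are stand-ins, not the real API):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Stand-in for the pipeline under construction.
    struct PassPipeline {
      std::vector<std::string> Passes;
      void addPass(std::string Name) { Passes.push_back(std::move(Name)); }
    };

    struct BaseConfig {
      virtual ~BaseConfig() = default;
      // Base hook: contributes the generic CodeGenPrepare pass.
      virtual void addCodeGenPrepare(PassPipeline &PP) {
        PP.addPass("codegenprepare");
      }
    };

    struct GPUConfig : BaseConfig {
      void addCodeGenPrepare(PassPipeline &PP) override {
        PP.addPass("lower-kernel-arguments");
        BaseConfig::addCodeGenPrepare(PP);       // generic CGP runs here,
        PP.addPass("load-store-vectorizer");     // then the vectorizer,
        PP.addPass("lower-buffer-fat-pointers"); // then the moved lowering.
      }
    };

    int main() {
      PassPipeline PP;
      GPUConfig().addCodeGenPrepare(PP);
      for (const std::string &P : PP.Passes)
        std::cout << P << '\n'; // prints the resulting pass order
    }

With the base call moved up, everything added after it (here the vectorizer and the fat-pointer lowering) runs on post-CGP IR, which is the ordering both the legacy and new-PM hunks establish.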
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 77df721..54f57e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns
defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>;
}
let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in {
- defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16,
- VOPProfile_CVT_F32_BF16_gfx1250_t16,
- VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
+ let True16Predicate = UseRealTrue16Insts in
+ defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>;
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>;
}
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
@@ -899,6 +900,7 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p =
let DecoderNamespace = Gen.DecoderNamespace;
let OtherPredicates = !listconcat(ps.OtherPredicates,
!if(p.HasExt64BitDPP, [HasDPALU_DPP], []));
+ let True16Predicate = ps.True16Predicate;
}
class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
@@ -921,6 +923,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf
VOP1_DPP8<op, ps, p> {
let AssemblerPredicate = Gen.AssemblerPredicate;
let DecoderNamespace = Gen.DecoderNamespace;
+ let True16Predicate = ps.True16Predicate;
}
//===----------------------------------------------------------------------===//
@@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
-defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">;
defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index d4124ae..ee25f69 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -3139,8 +3139,8 @@ bool RISCVTTIImpl::isProfitableToSinkOperands(
bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
m_Value(), m_Value(), m_Value()));
if (!IsVPSplat &&
- !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
- m_Undef(), m_ZeroMask())))
+ !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+ m_Value(), m_ZeroMask())))
continue;
// Don't sink i1 splats.
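The RISCV change relaxes the splat recognition: a zero-mask shuffle is still a splat of the inserted scalar even when the insertelement base vector and the shuffle's second operand are not undef, because the zero mask only ever reads lane 0 of the first operand. A hedged, stand-alone sketch of that idiom against LLVM's PatternMatch API (the test IR and function name are made up; compile and link against LLVM):

    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/PatternMatch.h"
    #include "llvm/Support/SourceMgr.h"
    #include <iostream>
    #include <memory>

    using namespace llvm;
    using namespace llvm::PatternMatch;

    int main() {
      LLVMContext Ctx;
      SMDiagnostic Err;
      // %base is NOT undef/poison: the old m_Undef()-based match rejected
      // this, the relaxed m_Value()-based match accepts it. The result is
      // still a splat of %x, since the zero mask broadcasts lane 0 of the
      // first shuffle operand, and lane 0 was just overwritten with %x.
      std::unique_ptr<Module> M = parseAssemblyString(R"(
        define <4 x i32> @splat(i32 %x, <4 x i32> %base) {
          %ins = insertelement <4 x i32> %base, i32 %x, i64 0
          %sp = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
          ret <4 x i32> %sp
        }
      )", Err, Ctx);
      auto *Ret = cast<ReturnInst>(
          M->getFunction("splat")->getEntryBlock().getTerminator());
      Value *Op = Ret->getReturnValue();
      bool IsSplat =
          match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                              m_Value(), m_ZeroMask()));
      std::cout << (IsSplat ? "splat of %x" : "no match") << '\n';
    }

The second shuffle operand is never read by an all-zero mask, so requiring it (or the insert base) to be undef only caused otherwise-sinkable splats to be missed.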
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index ad7e503..cf85691 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -27,7 +27,7 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
-cl::opt<bool> NoKernelInfoEndLTO(
+cl::opt<bool> llvm::NoKernelInfoEndLTO(
"no-kernel-info-end-lto",
cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"),
cl::init(false), cl::Hidden);
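The TargetMachine.cpp hunk is the usual fix for defining a variable that is declared extern inside namespace llvm in a header: in a file that merely opens the namespace with a using-directive, an unqualified definition introduces a distinct global in the global namespace instead of defining llvm::NoKernelInfoEndLTO. A stand-alone illustration, with cl::opt<bool> simplified to bool:

    namespace llvm {
    // As if declared in a header that the .cpp includes:
    extern bool NoKernelInfoEndLTO;
    }

    using namespace llvm;

    // Wrong: despite the using-directive, this would define a brand-new
    // global ::NoKernelInfoEndLTO, leaving llvm::NoKernelInfoEndLTO
    // undefined and making later unqualified uses of the name ambiguous.
    // bool NoKernelInfoEndLTO = false;

    // Right: the qualifier ties the definition to the namespace-scope
    // declaration above.
    bool llvm::NoKernelInfoEndLTO = false;

    int main() { return llvm::NoKernelInfoEndLTO ? 1 : 0; }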