aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp81
1 files changed, 69 insertions, 12 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae537b1..b39fbdc 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
case AMDGPU::S_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return S_BUFFER_LOAD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,6 +727,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
Result.SOffset = true;
[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
@@ -710,6 +738,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
return New;
}
+static bool needsConstrainedOpcode(const GCNSubtarget &STM,
+ ArrayRef<MachineMemOperand *> MMOs,
+ unsigned Width) {
+ // Conservatively returns true if not found the MMO.
+ return STM.isXNACKEnabled() &&
+ (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
+}
+
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
const CombineInfo &Paired) {
const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1736,55 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
- case S_BUFFER_LOAD_IMM:
+ case S_BUFFER_LOAD_IMM: {
+ // If XNACK is enabled, use the constrained opcodes when the first load is
+ // under-aligned.
+ bool NeedsConstrainedOpc =
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
- case S_BUFFER_LOAD_SGPR_IMM:
+ }
+ case S_BUFFER_LOAD_SGPR_IMM: {
+ // If XNACK is enabled, use the constrained opcodes when the first load is
+ // under-aligned.
+ bool NeedsConstrainedOpc =
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;
case 2:
- return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
case 3:
- return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
case 4:
- return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
- return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+ return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
+ : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
+ }
case S_LOAD_IMM: {
// If XNACK is enabled, use the constrained opcodes when the first load is
// under-aligned.
- const MachineMemOperand *MMO = *CI.I->memoperands_begin();
bool NeedsConstrainedOpc =
- STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+ needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
switch (Width) {
default:
return 0;