diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 81 |
1 file changed, 69 insertions, 12 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ae537b1..b39fbdc 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: case AMDGPU::S_LOAD_DWORDX2_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX2: @@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { return 2; case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX3_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX3: @@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX4: @@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: case AMDGPU::S_LOAD_DWORDX8_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return 8; @@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case 
AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: return S_BUFFER_LOAD_IMM; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: return S_BUFFER_LOAD_SGPR_IMM; case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: @@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: @@ -703,6 +727,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec: + 
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec: Result.SOffset = true; [[fallthrough]]; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: @@ -710,6 +738,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec: case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: case AMDGPU::S_LOAD_DWORDX3_IMM: @@ -1679,6 +1711,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( return New; } +static bool needsConstrainedOpcode(const GCNSubtarget &STM, + ArrayRef<MachineMemOperand *> MMOs, + unsigned Width) { + // Conservatively returns true if not found the MMO. + return STM.isXNACKEnabled() && + (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4); +} + unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; @@ -1696,38 +1736,55 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case UNKNOWN: llvm_unreachable("Unknown instruction class"); - case S_BUFFER_LOAD_IMM: + case S_BUFFER_LOAD_IMM: { + // If XNACK is enabled, use the constrained opcodes when the first load is + // under-aligned. + bool NeedsConstrainedOpc = + needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); switch (Width) { default: return 0; case 2: - return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; case 3: - return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; + return NeedsConstrainedOpc ? 
AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; case 4: - return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; case 8: - return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } - case S_BUFFER_LOAD_SGPR_IMM: + } + case S_BUFFER_LOAD_SGPR_IMM: { + // If XNACK is enabled, use the constrained opcodes when the first load is + // under-aligned. + bool NeedsConstrainedOpc = + needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); switch (Width) { default: return 0; case 2: - return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; case 3: - return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; case 4: - return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; case 8: - return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; + return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec + : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; } + } case S_LOAD_IMM: { // If XNACK is enabled, use the constrained opcodes when the first load is // under-aligned. - const MachineMemOperand *MMO = *CI.I->memoperands_begin(); bool NeedsConstrainedOpc = - STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4; + needsConstrainedOpcode(*STM, CI.I->memoperands(), Width); switch (Width) { default: return 0; |