aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Symalla <thomas.symalla@amd.com>2022-04-08 09:48:23 +0200
committerThomas Symalla <thomas.symalla@amd.com>2022-04-08 12:47:24 +0200
commit6d97ca690c4d9f59226d06e30233e247a3308e9b (patch)
tree65325af07f57d214756a99f4fa91a3637d188c87
parent08920cc04343e69ee8a56168a5911acddf40e6ba (diff)
downloadllvm-6d97ca690c4d9f59226d06e30233e247a3308e9b.zip
llvm-6d97ca690c4d9f59226d06e30233e247a3308e9b.tar.gz
llvm-6d97ca690c4d9f59226d06e30233e247a3308e9b.tar.bz2
[AMDGPU] Increase detection range for s_mov, v_cmpx transformation.
We found that it might be beneficial to have the SIOptimizeExecMasking pass detect more cases where v_cmp, s_and_saveexec patterns can be transformed to s_mov, v_cmpx patterns. Currently, the search range for finding a fitting v_cmp instruction is 5; this change increases it to 20. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D123367
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp7
-rw-r--r--llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir27
2 files changed, 31 insertions, 3 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 4c61892..36f9ab6f 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -302,12 +302,15 @@ static MachineInstr *
findInstrBackwards(MachineInstr &Origin,
std::function<bool(MachineInstr *)> Pred,
ArrayRef<MCRegister> NonModifiableRegs,
- const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
+ const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
E = Origin.getParent()->rend();
unsigned CurrentIteration = 0;
for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+ if (A->isDebugInstr())
+ continue;
+
if (Pred(&*A))
return &*A;
@@ -315,7 +318,7 @@ findInstrBackwards(MachineInstr &Origin,
if (A->modifiesRegister(Reg, TRI))
return nullptr;
}
-
+
++CurrentIteration;
}
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
index fceca41..7407cdf 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
@@ -1,6 +1,5 @@
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1010 %s
# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck --check-prefixes=GCN,GFX1030 %s
-
---
# After the Optimize exec masking (post-RA) pass, there's a chance of having v_cmpx instructions
@@ -62,3 +61,29 @@ body: |
$sgpr2_sgpr3 = COPY $exec, implicit-def $exec
$sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
$exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
+...
+
+---
+
+# Check if the sequence will be optimized even with more than 5 (unrelated) instructions in between the v_cmp and s_and_saveexec.
+
+# GCN-LABEL: name: vcmp_saveexec_to_mov_vcmpx_check_many_instrs
+# GFX1010: V_CMP_LT_F32_e64
+# GFX1010: S_AND_SAVEEXEC_B64
+# GFX1030: S_MOV_B64
+# GFX1030: V_CMPX_LT_F32_nosdst_e64 0, 953267991, 2
+name: vcmp_saveexec_to_mov_vcmpx_check_many_instrs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr2, $vgpr1
+ renamable $sgpr0_sgpr1 = V_CMP_LT_F32_e64 0, 953267991, 2, $vgpr0, 0, implicit $mode, implicit $exec
+ $vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
+ $vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
+ $vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
+ $vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
+ $vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
+ $vgpr1 = V_WRITELANE_B32 0, $sgpr2, $vgpr1
+ $sgpr2_sgpr3 = COPY $exec, implicit-def $exec
+ $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3