[AMDGPU] Run unmerge combines post regbankselect

RegBankSelect can insert G_UNMERGE_VALUES in a lot of places, which left us with a lot of unmerge/merge pairs that could be simplified. These often got in the way of pattern matching and made codegen worse.

This patch:
- Makes the necessary changes to the merge/unmerge combines so they can run post RegBankSelect
- Adds relevant unmerge combines to the list of RegBankSelect combines for AMDGPU
- Updates some tablegen patterns that were missing explicit cross-regbank copies (V_BFI patterns were causing constant bus violations with this change).

This seems to be mostly beneficial for code quality.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D142192
author: Pierre van Houtryve <pierre.vanhoutryve@amd.com> 2023-01-24 10:33:29 +0100
committer: pvanhout <pierre.vanhoutryve@amd.com> 2023-02-10 08:34:23 +0100
commit: d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e (patch)
tree: 0e645a2535d14b97d78b3caec02a320edd8aceb6 /llvm/lib
parent: 90f5176ab2c6f46449c9a7050f7269a7356f7a41 (diff)
download: llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.zip
llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.tar.gz
llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.tar.bz2
3 files changed, 21 insertions, 4 deletions
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 7b85f2e..5fc2674 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1763,6 +1763,15 @@ void CombinerHelper::applyCombineUnmergeMergeToPlainValues(
   for (unsigned Idx = 0; Idx < NumElems; ++Idx) {
     Register DstReg = MI.getOperand(Idx).getReg();
     Register SrcReg = Operands[Idx];
+
+    // This combine may run after RegBankSelect, so we need to be aware of
+    // register banks.
+    const auto &DstCB = MRI.getRegClassOrRegBank(DstReg);
+    if (!DstCB.isNull() && DstCB != MRI.getRegClassOrRegBank(SrcReg)) {
+      SrcReg = Builder.buildCopy(MRI.getType(SrcReg), SrcReg).getReg(0);
+      MRI.setRegClassOrRegBank(SrcReg, DstCB);
+    }
+
     if (CanReuseInputDirectly)
       replaceRegWith(MRI, DstReg, SrcReg);
     else
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index c11d465..539b566 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -127,7 +127,8 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
 
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
   "AMDGPUGenRegBankCombinerHelper",
-  [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
+  [unmerge_merge, unmerge_cst, unmerge_undef,
+   zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
    fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
   let StateClass = "AMDGPURegBankCombinerHelperState";
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2c10cdc..012459f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2055,7 +2055,9 @@ def BFIImm32 : PatFrag<
 // (y & x) | (z & ~x)
 def : AMDGPUPat <
   (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
-  (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
+  (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
+                 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32),
+                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
 >;
 
 // (y & C) | (z & ~C)
@@ -2080,7 +2082,9 @@ def : AMDGPUPat <
 // z ^ (x & (y ^ z))
 def : AMDGPUPat <
   (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
-  (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
+  (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
+                 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32),
+                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
 >;
 
 // 64-bit version
@@ -3196,7 +3200,10 @@ def : AMDGPUPat <
 def : AMDGPUPat <
   (DivergentBinFrag<or> (and i32:$x, i32:$z),
                         (and i32:$y, (or i32:$x, i32:$z))),
-  (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y)
+  (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
+                                (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)),
+                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32),
+                 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32))
 >;
 
 def : AMDGPUPat <
author	Pierre van Houtryve <pierre.vanhoutryve@amd.com>	2023-01-24 10:33:29 +0100
committer	pvanhout <pierre.vanhoutryve@amd.com>	2023-02-10 08:34:23 +0100
commit	d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e (patch)
tree	0e645a2535d14b97d78b3caec02a320edd8aceb6 /llvm/lib
parent	90f5176ab2c6f46449c9a7050f7269a7356f7a41 (diff)
download	llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.zip llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.tar.gz llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.tar.bz2