diff options
author | Pierre van Houtryve <pierre.vanhoutryve@amd.com> | 2023-01-24 10:33:29 +0100 |
---|---|---|
committer | pvanhout <pierre.vanhoutryve@amd.com> | 2023-02-10 08:34:23 +0100 |
commit | d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e (patch) | |
tree | 0e645a2535d14b97d78b3caec02a320edd8aceb6 /llvm/lib | |
parent | 90f5176ab2c6f46449c9a7050f7269a7356f7a41 (diff) | |
download | llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.zip llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.tar.gz llvm-d9a6fc82f56f1e4ebb6ba053a57a5839c3907a7e.tar.bz2 |
[AMDGPU] Run unmerge combines post regbankselect
RegBankSelect can insert G_UNMERGE_VALUES in many places, which
leaves us with many unmerge/merge pairs that could be simplified.
These pairs often get in the way of pattern matching and make codegen
worse.
This patch:
- Makes the necessary changes to the merge/unmerge combines so they can run post RegBankSelect
- Adds relevant unmerge combines to the list of RegBankSelect combines for AMDGPU
- Updates some TableGen patterns that were missing explicit cross-regbank copies (without them, the V_BFI patterns caused constant bus violations once this change was applied).
This seems to be mostly beneficial for code quality.
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D142192
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 9 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 13 |
3 files changed, 21 insertions, 4 deletions
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 7b85f2e..5fc2674 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1763,6 +1763,15 @@ void CombinerHelper::applyCombineUnmergeMergeToPlainValues( for (unsigned Idx = 0; Idx < NumElems; ++Idx) { Register DstReg = MI.getOperand(Idx).getReg(); Register SrcReg = Operands[Idx]; + + // This combine may run after RegBankSelect, so we need to be aware of + // register banks. + const auto &DstCB = MRI.getRegClassOrRegBank(DstReg); + if (!DstCB.isNull() && DstCB != MRI.getRegClassOrRegBank(SrcReg)) { + SrcReg = Builder.buildCopy(MRI.getType(SrcReg), SrcReg).getReg(0); + MRI.setRegClassOrRegBank(SrcReg, DstCB); + } + if (CanReuseInputDirectly) replaceRegWith(MRI, DstReg, SrcReg); else diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index c11d465..539b566 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -127,7 +127,8 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< def AMDGPURegBankCombinerHelper : GICombinerHelper< "AMDGPUGenRegBankCombinerHelper", - [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, + [unmerge_merge, unmerge_cst, unmerge_undef, + zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 2c10cdc..012459f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2055,7 +2055,9 @@ def BFIImm32 : PatFrag< // (y & x) | (z & ~x) def : AMDGPUPat < (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), - (V_BFI_B32_e64 
VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) + (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) >; // (y & C) | (z & ~C) @@ -2080,7 +2082,9 @@ def : AMDGPUPat < // z ^ (x & (y ^ z)) def : AMDGPUPat < (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), - (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) + (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) >; // 64-bit version @@ -3196,7 +3200,10 @@ def : AMDGPUPat < def : AMDGPUPat < (DivergentBinFrag<or> (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y) + (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)), + (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)) >; def : AMDGPUPat < |