about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPierre van Houtryve <pierre.vanhoutryve@amd.com>2024-06-20 10:59:00 +0200
committerGitHub <noreply@github.com>2024-06-20 10:59:00 +0200
commitf0897ea4bb2c8cbebdf9d90dc4a0c1effdfe20e9 (patch)
treea4851e27680a915d76ebea9308841bbe40858e7e
parent46beeaa3940ef8380ee99b9f666cd27b884f63e4 (diff)
downloadllvm-f0897ea4bb2c8cbebdf9d90dc4a0c1effdfe20e9.zip
llvm-f0897ea4bb2c8cbebdf9d90dc4a0c1effdfe20e9.tar.gz
llvm-f0897ea4bb2c8cbebdf9d90dc4a0c1effdfe20e9.tar.bz2
[MachineLICM] Work-around Incomplete RegUnits (#95926)
Reverts the behavior introduced by 770393b while keeping the refactored code. Fixes a miscompile on AArch64, at the cost of a small regression on AMDGPU. #96146 has been opened to investigate the issue.
-rw-r--r--llvm/lib/CodeGen/MachineLICM.cpp42
-rw-r--r--llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir49
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-call.ll4
3 files changed, 85 insertions, 10 deletions
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d81fe54f..287bd00 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -426,28 +426,54 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
BitVector &RUs,
const uint32_t *Mask) {
- BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
+ // FIXME: This intentionally works in reverse due to some issues with the
+ // Register Units infrastructure.
+ //
+ // This is used to apply callee-saved-register masks to the clobbered regunits
+ // mask.
+ //
+ // The right way to approach this is to start with a BitVector full of ones,
+ // then reset all the bits of the regunits of each register that is set in the
+ // mask (registers preserved), then OR the resulting bits with the Clobbers
+ // mask. This correctly prioritizes the saved registers, so if a RU is shared
+ // between a register that is preserved, and one that is NOT preserved, that
+ // RU will not be set in the output vector (the clobbers).
+ //
+ // What we have to do for now is the opposite: we have to assume that the
+ // regunits of all registers that are NOT preserved are clobbered, even if
+ // those regunits are preserved by another register. So if a RU is shared
+ // like described previously, that RU will be set.
+ //
+ // This is to work around an issue which appears in AArch64, but isn't
+ // exclusive to that target: AArch64's Qn registers (128 bits) have Dn
+ // register (lower 64 bits). A few Dn registers are preserved by some calling
+ // conventions, but Qn and Dn share exactly the same reg units.
+ //
+ // If we do this the right way, Qn will be marked as NOT clobbered even though
+ // its upper 64 bits are NOT preserved. The conservative approach handles this
+ // correctly at the cost of some missed optimizations on other targets.
+ //
+ // This is caused by how RegUnits are handled within TableGen. Ideally, Qn
+ // should have an extra RegUnit to model the "unknown" bits not covered by the
+ // subregs.
+ BitVector RUsFromRegsNotInMask(TRI.getNumRegUnits());
const unsigned NumRegs = TRI.getNumRegs();
const unsigned MaskWords = (NumRegs + 31) / 32;
for (unsigned K = 0; K < MaskWords; ++K) {
const uint32_t Word = Mask[K];
- if (!Word)
- continue;
-
for (unsigned Bit = 0; Bit < 32; ++Bit) {
const unsigned PhysReg = (K * 32) + Bit;
if (PhysReg == NumRegs)
break;
- // Check if we have a valid PhysReg that is set in the mask.
- if ((Word >> Bit) & 1) {
+ if (PhysReg && !((Word >> Bit) & 1)) {
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
- ClobberedRUs.reset(*RUI);
+ RUsFromRegsNotInMask.set(*RUI);
}
}
}
- RUs |= ClobberedRUs;
+ RUs |= RUsFromRegsNotInMask;
}
/// Examine the instruction for potential LICM candidate. Also
diff --git a/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir b/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir
new file mode 100644
index 0000000..f6a0abf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/mlicm-csr-mask.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-unknown-linux-gnu -run-pass=greedy,machinelicm -verify-machineinstrs -debug -o - %s | FileCheck %s
+
+# FIXME: Running RA is needed otherwise it runs pre-RA LICM.
+---
+name: test
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $x0, $w1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: B %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $x0, $w1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q11 = MOVIv4i32 2, 8
+ ; CHECK-NEXT: BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
+ ; CHECK-NEXT: renamable $q10 = MVNIv4i32 4, 0
+ ; CHECK-NEXT: $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
+ ; CHECK-NEXT: Bcc 11, %bb.1, implicit $nzcv
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $q10, $q11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $q0 = COPY $q10
+ ; CHECK-NEXT: $q1 = COPY $q11
+ bb.0:
+ liveins: $x0, $w1, $x2
+ B %bb.1
+
+ bb.1:
+ liveins: $x0, $w1, $x2
+ renamable $q11 = MOVIv4i32 2, 8
+ BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
+ renamable $q10 = MVNIv4i32 4, 0
+ $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
+ Bcc 11, %bb.1, implicit $nzcv
+ B %bb.2
+
+ bb.2:
+ liveins: $q10, $q11
+ $q0 = COPY $q10
+ $q1 = COPY $q11
+...
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b95..da8aa54 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: v_writelane_b32 v40, s62, 30
; GCN-NEXT: v_writelane_b32 v40, s63, 31
; GCN-NEXT: s_mov_b64 s[6:7], exec
-; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: v_readfirstlane_b32 s9, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
; GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GISEL-NEXT: s_movk_i32 s4, 0x7b
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT: s_movk_i32 s4, 0x7b
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]