author     Benjamin Maxwell <benjamin.maxwell@arm.com>   2025-09-05 12:15:28 +0000
committer  Benjamin Maxwell <benjamin.maxwell@arm.com>   2025-09-12 11:21:41 +0000
commit     b666a28a0581e3a1f7838eca5f241d2550c05e4b (patch)
tree       41473d9785a63148661490cb95234d4d6f304258 /llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
parent     77596b78e5664fff8d272599c0420fc9b87e2c2d (diff)
[AArch64][SME] Avoid ZA save state changes in loops in MachineSMEABIPass
This patch uses MachineLoopInfo to give blocks within loops a higher
weight when choosing the bundle ZA state. MachineLoopInfo does not
compute loop trip counts, so an arbitrary weight is used (default 10),
configurable with the `-aarch64-sme-abi-loop-edge-weight` flag.

This makes MachineSMEABIPass more likely to pick a bundle state that
matches the loop's entry/exit state, which avoids state changes within
the loop body (assumed to execute more than once).

Since this requires some extra analysis, it is only enabled at -O1 and
above.
Change-Id: If318c809d2f7cc1fca144fbe424ba2a2ca7fb19f
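As a rough illustration of the heuristic described above (and not the
actual MachineSMEABIPass code), here is a minimal C++ sketch of loop-aware
edge weighting. The flag name and its default of 10 come from the commit
message; the `getEdgeWeight` helper, its signature, and the use of
`getLoopDepth` are hypothetical assumptions for illustration only:

    // Sketch only: assumes LLVM's CodeGen headers; getEdgeWeight is a
    // hypothetical helper, not part of the actual pass.
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineLoopInfo.h"
    #include "llvm/Support/CommandLine.h"

    using namespace llvm;

    // Flag name and default (10) are taken from the commit message above.
    static cl::opt<unsigned> LoopEdgeWeight(
        "aarch64-sme-abi-loop-edge-weight", cl::Hidden, cl::init(10),
        cl::desc("Weight given to edges inside loops when choosing the "
                 "bundle ZA state (MachineLoopInfo has no trip counts, "
                 "so this is a fixed heuristic)"));

    // Hypothetical helper: edges whose endpoints both sit inside a loop
    // count LoopEdgeWeight times as much as other edges. This biases the
    // bundle's chosen ZA state towards the state required inside the
    // loop, so any state change lands on the (less frequently executed)
    // loop entry/exit edges instead of inside the loop body.
    static unsigned getEdgeWeight(const MachineBasicBlock &Pred,
                                  const MachineBasicBlock &Succ,
                                  const MachineLoopInfo &MLI) {
      if (MLI.getLoopDepth(&Pred) > 0 && MLI.getLoopDepth(&Succ) > 0)
        return LoopEdgeWeight;
      return 1; // Edges outside any loop keep unit weight.
    }

A real implementation would live inside the pass's bundle-state
selection; the sketch only shows the shape of the weighting.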
Diffstat (limited to 'llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll')
-rw-r--r--   llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll   115
1 file changed, 115 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
new file mode 100644
index 0000000..200280f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0
+; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1
+
+declare void @private_za_call()
+declare void @shared_za_call() "aarch64_inout_za"
+
+; This test checks that at -O0 we don't attempt to optimize lazy save state
+; changes in loops, and that -O1 (and above) we attempt to push state changes
+; out of loops.
+
+define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
+; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-O0:       // %bb.0: // %entry
+; CHECK-O0-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-O0-NEXT:    mov x29, sp
+; CHECK-O0-NEXT:    sub sp, sp, #32
+; CHECK-O0-NEXT:    rdsvl x9, #1
+; CHECK-O0-NEXT:    mov x8, sp
+; CHECK-O0-NEXT:    msub x8, x9, x9, x8
+; CHECK-O0-NEXT:    mov sp, x8
+; CHECK-O0-NEXT:    stp x8, x9, [x29, #-16]
+; CHECK-O0-NEXT:    stur w0, [x29, #-24] // 4-byte Folded Spill
+; CHECK-O0-NEXT:    bl shared_za_call
+; CHECK-O0-NEXT:    ldur w0, [x29, #-24] // 4-byte Folded Reload
+; CHECK-O0-NEXT:    mov w8, wzr
+; CHECK-O0-NEXT:    subs w9, w0, #1
+; CHECK-O0-NEXT:    stur w8, [x29, #-20] // 4-byte Folded Spill
+; CHECK-O0-NEXT:    b.lt .LBB0_4
+; CHECK-O0-NEXT:    b .LBB0_1
+; CHECK-O0-NEXT:  .LBB0_1: // %loop
+; CHECK-O0-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-O0-NEXT:    ldur w8, [x29, #-20] // 4-byte Folded Reload
+; CHECK-O0-NEXT:    stur w8, [x29, #-28] // 4-byte Folded Spill
+; CHECK-O0-NEXT:    sub x8, x29, #16
+; CHECK-O0-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-O0-NEXT:    bl private_za_call
+; CHECK-O0-NEXT:    ldur w8, [x29, #-28] // 4-byte Folded Reload
+; CHECK-O0-NEXT:    ldur w10, [x29, #-24] // 4-byte Folded Reload
+; CHECK-O0-NEXT:    add w9, w8, #1
+; CHECK-O0-NEXT:    mov w8, w9
+; CHECK-O0-NEXT:    subs w9, w9, w10
+; CHECK-O0-NEXT:    mrs x9, NZCV
+; CHECK-O0-NEXT:    smstart za
+; CHECK-O0-NEXT:    mrs x10, TPIDR2_EL0
+; CHECK-O0-NEXT:    sub x0, x29, #16
+; CHECK-O0-NEXT:    cbz x10, .LBB0_2
+; CHECK-O0-NEXT:    b .LBB0_3
+; CHECK-O0-NEXT:  .LBB0_2: // %loop
+; CHECK-O0-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; CHECK-O0-NEXT:    bl __arm_tpidr2_restore
+; CHECK-O0-NEXT:    b .LBB0_3
+; CHECK-O0-NEXT:  .LBB0_3: // %loop
+; CHECK-O0-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; CHECK-O0-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-O0-NEXT:    msr NZCV, x9
+; CHECK-O0-NEXT:    stur w8, [x29, #-20] // 4-byte Folded Spill
+; CHECK-O0-NEXT:    b.ne .LBB0_1
+; CHECK-O0-NEXT:    b .LBB0_4
+; CHECK-O0-NEXT:  .LBB0_4: // %exit
+; CHECK-O0-NEXT:    mov sp, x29
+; CHECK-O0-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-O0-NEXT:    b shared_za_call
+;
+; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-O1:       // %bb.0: // %entry
+; CHECK-O1-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-O1-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-O1-NEXT:    mov x29, sp
+; CHECK-O1-NEXT:    sub sp, sp, #16
+; CHECK-O1-NEXT:    rdsvl x8, #1
+; CHECK-O1-NEXT:    mov x9, sp
+; CHECK-O1-NEXT:    msub x9, x8, x8, x9
+; CHECK-O1-NEXT:    mov sp, x9
+; CHECK-O1-NEXT:    mov w19, w0
+; CHECK-O1-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-O1-NEXT:    bl shared_za_call
+; CHECK-O1-NEXT:    cmp w19, #1
+; CHECK-O1-NEXT:    sub x8, x29, #16
+; CHECK-O1-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-O1-NEXT:    b.lt .LBB0_2
+; CHECK-O1-NEXT:  .LBB0_1: // %loop
+; CHECK-O1-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-O1-NEXT:    bl private_za_call
+; CHECK-O1-NEXT:    subs w19, w19, #1
+; CHECK-O1-NEXT:    b.ne .LBB0_1
+; CHECK-O1-NEXT:  .LBB0_2: // %exit
+; CHECK-O1-NEXT:    smstart za
+; CHECK-O1-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-O1-NEXT:    sub x0, x29, #16
+; CHECK-O1-NEXT:    cbnz x8, .LBB0_4
+; CHECK-O1-NEXT:  // %bb.3: // %exit
+; CHECK-O1-NEXT:    bl __arm_tpidr2_restore
+; CHECK-O1-NEXT:  .LBB0_4: // %exit
+; CHECK-O1-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-O1-NEXT:    mov sp, x29
+; CHECK-O1-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-O1-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-O1-NEXT:    b shared_za_call
+entry:
+  %cmpgt = icmp sgt i32 %n, 0
+  tail call void @shared_za_call()
+  br i1 %cmpgt, label %loop, label %exit
+
+loop:
+  %iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
+  tail call void @private_za_call()
+  %next_iv = add nuw nsw i32 %iv, 1
+  %cmpeq = icmp eq i32 %next_iv, %n
+  br i1 %cmpeq, label %exit, label %loop
+
+exit:
+  tail call void @shared_za_call()
+  ret void
+}