; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s

declare void @callee()
declare void @callee_sm() "aarch64_pstate_sm_enabled"
declare void @callee_farg(float)
declare float @callee_farg_fret(float)

; normal caller -> streaming callees
define void @test0(ptr %callee) nounwind {
; CHECK-LABEL: test0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    bl callee_sm
; CHECK-NEXT:    bl callee_sm
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @callee_sm()
  call void @callee_sm()
  ret void
}

; streaming caller -> normal callees
define void @test1() nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: test1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @callee()
  call void @callee()
  ret void
}

; streaming-compatible caller -> normal callees
; these conditional smstart/smstop are not yet optimized away.
define void @test2() nounwind "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT:    bl __arm_sme_state
; CHECK-NEXT:    and x19, x0, #0x1
; CHECK-NEXT:    tbz w19, #0, .LBB2_2
; CHECK-NEXT:  // %bb.1:
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:  .LBB2_2:
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    tbz w19, #0, .LBB2_4
; CHECK-NEXT:  // %bb.3:
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:  .LBB2_4:
; CHECK-NEXT:    bl __arm_sme_state
; CHECK-NEXT:    and x19, x0, #0x1
; CHECK-NEXT:    tbz w19, #0, .LBB2_6
; CHECK-NEXT:  // %bb.5:
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:  .LBB2_6:
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    tbz w19, #0, .LBB2_8
; CHECK-NEXT:  // %bb.7:
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:  .LBB2_8:
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @callee()
  call void @callee()
  ret void
}

; streaming-compatible caller -> mixed callees
define void @test3() nounwind "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT:    bl __arm_sme_state
; CHECK-NEXT:    and x19, x0, #0x1
; CHECK-NEXT:    tbnz w19, #0, .LBB3_2
; CHECK-NEXT:  // %bb.1:
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:  .LBB3_2:
; CHECK-NEXT:    bl callee_sm
; CHECK-NEXT:    tbnz w19, #0, .LBB3_4
; CHECK-NEXT:  // %bb.3:
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:  .LBB3_4:
; CHECK-NEXT:    bl __arm_sme_state
; CHECK-NEXT:    and x19, x0, #0x1
; CHECK-NEXT:    tbz w19, #0, .LBB3_6
; CHECK-NEXT:  // %bb.5:
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:  .LBB3_6:
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    tbz w19, #0, .LBB3_8
; CHECK-NEXT:  // %bb.7:
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:  .LBB3_8:
; CHECK-NEXT:    bl __arm_sme_state
; CHECK-NEXT:    and x19, x0, #0x1
; CHECK-NEXT:    tbnz w19, #0, .LBB3_10
; CHECK-NEXT:  // %bb.9:
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:  .LBB3_10:
; CHECK-NEXT:    bl callee_sm
; CHECK-NEXT:    tbnz w19, #0, .LBB3_12
; CHECK-NEXT:  // %bb.11:
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:  .LBB3_12:
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @callee_sm()
  call void @callee()
  call void @callee_sm()
  ret void
}

; streaming caller -> normal callees (pass 0.0f)
define void @test4() nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: test4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    fmov s0, wzr
; CHECK-NEXT:    bl callee_farg
; CHECK-NEXT:    fmov s0, wzr
; CHECK-NEXT:    bl callee_farg
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @callee_farg(float zeroinitializer)
  call void @callee_farg(float zeroinitializer)
  ret void
}

; streaming caller -> normal callees (pass fp arg)
define void @test5(float %f) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: test5:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub sp, sp, #96
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT:    bl callee_farg
; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT:    bl callee_farg
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #96
; CHECK-NEXT:    ret
  call void @callee_farg(float %f)
  call void @callee_farg(float %f)
  ret void
}

define float @test6(float %f) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: test6:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub sp, sp, #96
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT:    bl callee_farg_fret
; CHECK-NEXT:    bl callee_farg_fret
; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #96
; CHECK-NEXT:    ret
  %res0 = call float @callee_farg_fret(float %f)
  %res1 = call float @callee_farg_fret(float %res0)
  ret float %res1
}

; save/restore zt0 to stack is not yet optimised away by the pass,
; because of the ldr/str of zt0, which will need some further analysis
; to determine whether the redundant str can be removed.
define void @test7() nounwind "aarch64_inout_zt0" {
; CHECK-LABEL: test7:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub sp, sp, #144
; CHECK-NEXT:    stp x30, x19, [sp, #128] // 16-byte Folded Spill
; CHECK-NEXT:    add x19, sp, #64
; CHECK-NEXT:    str zt0, [x19]
; CHECK-NEXT:    smstop za
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    smstart za
; CHECK-NEXT:    ldr zt0, [x19]
; CHECK-NEXT:    mov x19, sp
; CHECK-NEXT:    str zt0, [x19]
; CHECK-NEXT:    smstop za
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    smstart za
; CHECK-NEXT:    ldr zt0, [x19]
; CHECK-NEXT:    ldp x30, x19, [sp, #128] // 16-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #144
; CHECK-NEXT:    ret
  call void @callee()
  call void @callee()
  ret void
}

; test that 'smstop za' is not cancelled out with 'smstart sm'.
define void @test8() nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: test8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    smstop za
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @callee()
  call void @llvm.aarch64.sme.za.disable()
  ret void
}

; test that the 'smstart' and 'smstop' are entirely removed,
; along with any code to read 'vg' for the CFI.
define void @test9() "aarch64_pstate_sm_body" {
; CHECK-LABEL: test9:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @callee()
  ret void
}

; similar to above, but in this case only the first
; 'smstart, smstop' pair can be removed and the code required
; for the CFI is still needed.
define void @test10() "aarch64_pstate_sm_body" {
; CHECK-LABEL: test10:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 96
; CHECK-NEXT:    rdsvl x9, #1
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    lsr x9, x9, #3
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_offset vg, -16
; CHECK-NEXT:    .cfi_offset w30, -32
; CHECK-NEXT:    .cfi_offset b8, -40
; CHECK-NEXT:    .cfi_offset b9, -48
; CHECK-NEXT:    .cfi_offset b10, -56
; CHECK-NEXT:    .cfi_offset b11, -64
; CHECK-NEXT:    .cfi_offset b12, -72
; CHECK-NEXT:    .cfi_offset b13, -80
; CHECK-NEXT:    .cfi_offset b14, -88
; CHECK-NEXT:    .cfi_offset b15, -96
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    .cfi_restore vg
; CHECK-NEXT:    bl callee_sm
; CHECK-NEXT:    .cfi_offset vg, -24
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w30
; CHECK-NEXT:    .cfi_restore b8
; CHECK-NEXT:    .cfi_restore b9
; CHECK-NEXT:    .cfi_restore b10
; CHECK-NEXT:    .cfi_restore b11
; CHECK-NEXT:    .cfi_restore b12
; CHECK-NEXT:    .cfi_restore b13
; CHECK-NEXT:    .cfi_restore b14
; CHECK-NEXT:    .cfi_restore b15
; CHECK-NEXT:    ret
  call void @callee()
  call void @callee_sm()
  call void @callee()
  ret void
}

; test that an operation like a store is executed in the right
; streaming mode and blocks the optimization.
define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: test11:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT:    mov x19, x0
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    mov z0.b, #0 // =0x0
; CHECK-NEXT:    str z0, [x19]
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @callee()
  store <vscale x 16 x i8> zeroinitializer, ptr %p
  call void @callee()
  ret void
}

; test that 'smstart sm' and 'smstop za' don't get folded away together.
; we can further optimize this test by considering streaming mode
; separately from ZA.
define void @test12() "aarch64_pstate_sm_body" {
; CHECK-LABEL: test12:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 96
; CHECK-NEXT:    rdsvl x9, #1
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    lsr x9, x9, #3
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_offset vg, -16
; CHECK-NEXT:    .cfi_offset w30, -32
; CHECK-NEXT:    .cfi_offset b8, -40
; CHECK-NEXT:    .cfi_offset b9, -48
; CHECK-NEXT:    .cfi_offset b10, -56
; CHECK-NEXT:    .cfi_offset b11, -64
; CHECK-NEXT:    .cfi_offset b12, -72
; CHECK-NEXT:    .cfi_offset b13, -80
; CHECK-NEXT:    .cfi_offset b14, -88
; CHECK-NEXT:    .cfi_offset b15, -96
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    smstop za
; CHECK-NEXT:    .cfi_offset vg, -24
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    .cfi_restore vg
; CHECK-NEXT:    smstart za
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w30
; CHECK-NEXT:    .cfi_restore b8
; CHECK-NEXT:    .cfi_restore b9
; CHECK-NEXT:    .cfi_restore b10
; CHECK-NEXT:    .cfi_restore b11
; CHECK-NEXT:    .cfi_restore b12
; CHECK-NEXT:    .cfi_restore b13
; CHECK-NEXT:    .cfi_restore b14
; CHECK-NEXT:    .cfi_restore b15
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.za.disable()
  call void @callee()
  call void @llvm.aarch64.sme.za.enable()
  ret void
}

; We conservatively don't remove the smstart/smstop pair yet when there are COPY
; instructions that copy SVE registers, because we can't yet conclusively prove
; that this is safe (although for this example, it would be).
define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: test13:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z0.s, #0 // =0x0
; CHECK-NEXT:    mov x19, x0
; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    bl callee_farg_fret
; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    bl callee_farg_fret
; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    str z0, [x19]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  %res0 = call <vscale x 4 x float> @callee_farg_fret(<vscale x 4 x float> zeroinitializer)
  %res1 = call <vscale x 4 x float> @callee_farg_fret(<vscale x 4 x float> %res0)
  store <vscale x 4 x float> %res1, ptr %ptr
  ret void
}