1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0
; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1
; External callee declared with no ZA-related attribute. In the expected
; output below, calls to it are surrounded by TPIDR2_EL0 setup and a
; conditional __arm_tpidr2_restore.
declare void @private_za_call()
; External callee declared with the "aarch64_inout_za" attribute. In the
; expected output below, calls to it are emitted as a plain bl/b with no
; TPIDR2_EL0 handling around them.
declare void @shared_za_call() "aarch64_inout_za"
; This test checks that at -O0 we don't attempt to optimize lazy save state
; changes in loops, and that at -O1 (and above) we attempt to push state
; changes out of loops.
; Function under test: calls @shared_za_call once, then calls
; @private_za_call in a loop that runs %n times when %n > 0 (the loop is
; skipped otherwise), then calls @shared_za_call again. The function itself
; carries the "aarch64_inout_za" and nounwind attributes.
;
; In the -O0 expectations below, the TPIDR2_EL0 write before the call and the
; "smstart za" / conditional __arm_tpidr2_restore / TPIDR2_EL0 clear after it
; all sit inside the loop blocks (.LBB0_1 to .LBB0_3). In the -O1 expectations
; the TPIDR2_EL0 write is emitted once before the loop and the restore
; sequence is emitted once in %exit.
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
; CHECK-O0: // %bb.0: // %entry
; CHECK-O0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-O0-NEXT: mov x29, sp
; CHECK-O0-NEXT: sub sp, sp, #32
; CHECK-O0-NEXT: rdsvl x9, #1
; CHECK-O0-NEXT: mov x8, sp
; CHECK-O0-NEXT: msub x8, x9, x9, x8
; CHECK-O0-NEXT: mov sp, x8
; CHECK-O0-NEXT: stp x8, x9, [x29, #-16]
; CHECK-O0-NEXT: stur w0, [x29, #-24] // 4-byte Folded Spill
; CHECK-O0-NEXT: bl shared_za_call
; CHECK-O0-NEXT: ldur w0, [x29, #-24] // 4-byte Folded Reload
; CHECK-O0-NEXT: mov w8, wzr
; CHECK-O0-NEXT: subs w9, w0, #1
; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
; CHECK-O0-NEXT: b.lt .LBB0_4
; CHECK-O0-NEXT: b .LBB0_1
; CHECK-O0-NEXT: .LBB0_1: // %loop
; CHECK-O0-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-O0-NEXT: ldur w8, [x29, #-20] // 4-byte Folded Reload
; CHECK-O0-NEXT: stur w8, [x29, #-28] // 4-byte Folded Spill
; CHECK-O0-NEXT: sub x8, x29, #16
; CHECK-O0-NEXT: msr TPIDR2_EL0, x8
; CHECK-O0-NEXT: bl private_za_call
; CHECK-O0-NEXT: ldur w8, [x29, #-28] // 4-byte Folded Reload
; CHECK-O0-NEXT: ldur w10, [x29, #-24] // 4-byte Folded Reload
; CHECK-O0-NEXT: add w9, w8, #1
; CHECK-O0-NEXT: mov w8, w9
; CHECK-O0-NEXT: subs w9, w9, w10
; CHECK-O0-NEXT: mrs x9, NZCV
; CHECK-O0-NEXT: smstart za
; CHECK-O0-NEXT: mrs x10, TPIDR2_EL0
; CHECK-O0-NEXT: sub x0, x29, #16
; CHECK-O0-NEXT: cbz x10, .LBB0_2
; CHECK-O0-NEXT: b .LBB0_3
; CHECK-O0-NEXT: .LBB0_2: // %loop
; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-O0-NEXT: bl __arm_tpidr2_restore
; CHECK-O0-NEXT: b .LBB0_3
; CHECK-O0-NEXT: .LBB0_3: // %loop
; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-O0-NEXT: msr TPIDR2_EL0, xzr
; CHECK-O0-NEXT: msr NZCV, x9
; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
; CHECK-O0-NEXT: b.ne .LBB0_1
; CHECK-O0-NEXT: b .LBB0_4
; CHECK-O0-NEXT: .LBB0_4: // %exit
; CHECK-O0-NEXT: mov sp, x29
; CHECK-O0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-O0-NEXT: b shared_za_call
;
; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
; CHECK-O1: // %bb.0: // %entry
; CHECK-O1-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-O1-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-O1-NEXT: mov x29, sp
; CHECK-O1-NEXT: sub sp, sp, #16
; CHECK-O1-NEXT: rdsvl x8, #1
; CHECK-O1-NEXT: mov x9, sp
; CHECK-O1-NEXT: msub x9, x8, x8, x9
; CHECK-O1-NEXT: mov sp, x9
; CHECK-O1-NEXT: mov w19, w0
; CHECK-O1-NEXT: stp x9, x8, [x29, #-16]
; CHECK-O1-NEXT: bl shared_za_call
; CHECK-O1-NEXT: cmp w19, #1
; CHECK-O1-NEXT: sub x8, x29, #16
; CHECK-O1-NEXT: msr TPIDR2_EL0, x8
; CHECK-O1-NEXT: b.lt .LBB0_2
; CHECK-O1-NEXT: .LBB0_1: // %loop
; CHECK-O1-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-O1-NEXT: bl private_za_call
; CHECK-O1-NEXT: subs w19, w19, #1
; CHECK-O1-NEXT: b.ne .LBB0_1
; CHECK-O1-NEXT: .LBB0_2: // %exit
; CHECK-O1-NEXT: smstart za
; CHECK-O1-NEXT: mrs x8, TPIDR2_EL0
; CHECK-O1-NEXT: sub x0, x29, #16
; CHECK-O1-NEXT: cbnz x8, .LBB0_4
; CHECK-O1-NEXT: // %bb.3: // %exit
; CHECK-O1-NEXT: bl __arm_tpidr2_restore
; CHECK-O1-NEXT: .LBB0_4: // %exit
; CHECK-O1-NEXT: msr TPIDR2_EL0, xzr
; CHECK-O1-NEXT: mov sp, x29
; CHECK-O1-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-O1-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-O1-NEXT: b shared_za_call
entry:
; %cmpgt is true when %n is strictly positive (signed compare). It is computed
; before the first @shared_za_call but only consumed by the branch after it.
%cmpgt = icmp sgt i32 %n, 0
tail call void @shared_za_call()
br i1 %cmpgt, label %loop, label %exit
loop:
; %iv is 0 when arriving from %entry and %next_iv when arriving on the back
; edge, so it counts completed iterations.
%iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
; The only call in the loop body is to the callee declared without a ZA
; attribute.
tail call void @private_za_call()
%next_iv = add nuw nsw i32 %iv, 1
; Leave the loop once %next_iv reaches %n; otherwise take the back edge.
%cmpeq = icmp eq i32 %next_iv, %n
br i1 %cmpeq, label %exit, label %loop
exit:
; Reached either straight from %entry (when %n <= 0) or after the loop ends.
; Both sets of expectations above show this final call emitted as a branch
; ("b shared_za_call") after the frame is torn down.
tail call void @shared_za_call()
ret void
}
|