1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck %s -check-prefix=GFX950
; External helper called once from the entry block of @main. The mangled name
; demangles (Itanium ABI) to __hip_builtin_threadIdx_t::__get_x(). No body is
; provided in this file, and the expected output for @main contains an actual
; call sequence (s_getpc_b64 / s_swappc_b64) for it.
declare hidden i32 @_ZN25__hip_builtin_threadIdx_t7__get_xEv()
; Before #177352 this test showed poor scheduling due to register pressure
; problems. The symptom was that two global_load instructions were immediately
; followed by s_waitcnt vmcnt(0).
;
; Shape of the kernel:
;   %bb  - entry block: loads a <4 x i32> through %ptr5, calls the external
;          __get_x helper once, then enters the loop.
;   %bb4 - single-block loop: four <4 x i32> loads through %ptr1 per
;          iteration, OR-ed into running values; one store through %ptr5 and
;          one i32 load through the generic pointer %ptr.
;   %bb2 - exit block: ORs the two accumulators and stores them through %ptr5.
; The loop exit condition is the kernel argument %arg, which is not modified
; anywhere in the function.
;
; In the expected output as currently checked, two global_load_dwordx4
; instructions are issued before the loop label and two inside the loop, and
; the waits inside the loop that precede the last load use are vmcnt(1)
; rather than vmcnt(0).
;
; The assembly check lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with the script instead of editing them by hand.
define amdgpu_kernel void @main(i1 %arg, ptr %ptr, ptr addrspace(1) %ptr1, ptr addrspace(5) %ptr5) {
; GFX950-LABEL: main:
; GFX950: ; %bb.0: ; %bb
; GFX950-NEXT: s_load_dword s33, s[4:5], 0x3c
; GFX950-NEXT: s_mov_b32 s14, s10
; GFX950-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX950-NEXT: s_load_dword s6, s[4:5], 0x24
; GFX950-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x2c
; GFX950-NEXT: s_mov_b32 s12, s8
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: scratch_load_dwordx4 v[40:43], off, s33
; GFX950-NEXT: s_mov_b32 s13, s9
; GFX950-NEXT: s_bitcmp1_b32 s6, 0
; GFX950-NEXT: s_cselect_b64 s[34:35], -1, 0
; GFX950-NEXT: s_add_u32 s8, s4, 64
; GFX950-NEXT: s_addc_u32 s9, s5, 0
; GFX950-NEXT: s_getpc_b64 s[16:17]
; GFX950-NEXT: s_add_u32 s16, s16, _ZN25__hip_builtin_threadIdx_t7__get_xEv@rel32@lo+4
; GFX950-NEXT: s_addc_u32 s17, s17, _ZN25__hip_builtin_threadIdx_t7__get_xEv@rel32@hi+12
; GFX950-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX950-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v31, v0
; GFX950-NEXT: s_mov_b32 s32, 0
; GFX950-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX950-NEXT: v_mov_b32_e32 v1, 0
; GFX950-NEXT: v_lshl_add_u64 v[10:11], v[0:1], 3, s[38:39]
; GFX950-NEXT: global_load_dwordx4 v[2:5], v[10:11], off
; GFX950-NEXT: global_load_dwordx4 v[6:9], v1, s[38:39]
; GFX950-NEXT: v_mov_b32_e32 v0, 0
; GFX950-NEXT: v_mov_b64_e32 v[10:11], s[36:37]
; GFX950-NEXT: s_and_b64 vcc, exec, s[34:35]
; GFX950-NEXT: v_mov_b32_e32 v12, v1
; GFX950-NEXT: v_mov_b32_e32 v14, v1
; GFX950-NEXT: v_mov_b32_e32 v15, v1
; GFX950-NEXT: v_mov_b32_e32 v16, v1
; GFX950-NEXT: v_mov_b32_e32 v18, v1
; GFX950-NEXT: v_mov_b32_e32 v17, v1
; GFX950-NEXT: v_mov_b32_e32 v19, v1
; GFX950-NEXT: v_mov_b32_e32 v20, v1
; GFX950-NEXT: v_mov_b32_e32 v21, v1
; GFX950-NEXT: .LBB0_1: ; %bb4
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: v_mov_b32_e32 v13, v1
; GFX950-NEXT: v_lshlrev_b64 v[22:23], 3, v[12:13]
; GFX950-NEXT: v_lshl_add_u64 v[22:23], s[38:39], 0, v[22:23]
; GFX950-NEXT: global_load_dwordx4 v[22:25], v[22:23], off
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[26:27], v[0:1], 3, s[38:39]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: v_or_b32_e32 v0, v40, v6
; GFX950-NEXT: v_or_b32_e32 v13, v41, v7
; GFX950-NEXT: v_or_b32_e32 v30, v42, v8
; GFX950-NEXT: v_or_b32_e32 v31, v43, v9
; GFX950-NEXT: global_load_dwordx4 v[26:29], v[26:27], off
; GFX950-NEXT: v_or_b32_e32 v18, v5, v18
; GFX950-NEXT: v_or_b32_e32 v16, v4, v16
; GFX950-NEXT: v_or_b32_e32 v15, v3, v15
; GFX950-NEXT: v_or_b32_e32 v14, v2, v14
; GFX950-NEXT: v_or_b32_e32 v12, 1, v12
; GFX950-NEXT: v_mov_b32_e32 v40, 0
; GFX950-NEXT: v_mov_b32_e32 v41, 0
; GFX950-NEXT: v_mov_b32_e32 v42, 0
; GFX950-NEXT: v_mov_b32_e32 v43, 0
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: v_or_b32_e32 v25, v25, v31
; GFX950-NEXT: v_or_b32_e32 v24, v24, v30
; GFX950-NEXT: v_or_b32_e32 v23, v23, v13
; GFX950-NEXT: v_or_b32_e32 v22, v22, v0
; GFX950-NEXT: scratch_store_dwordx4 off, v[22:25], s33
; GFX950-NEXT: flat_load_dword v0, v[10:11]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_or_b32_e32 v21, v29, v21
; GFX950-NEXT: v_or_b32_e32 v20, v28, v20
; GFX950-NEXT: v_or_b32_e32 v19, v27, v19
; GFX950-NEXT: v_or_b32_e32 v17, v26, v17
; GFX950-NEXT: s_mov_b64 vcc, vcc
; GFX950-NEXT: s_cbranch_vccz .LBB0_1
; GFX950-NEXT: ; %bb.2: ; %bb2
; GFX950-NEXT: v_or_b32_e32 v3, v21, v18
; GFX950-NEXT: v_or_b32_e32 v2, v20, v16
; GFX950-NEXT: v_or_b32_e32 v1, v19, v15
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: v_or_b32_e32 v0, v17, v14
; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s33
; GFX950-NEXT: s_endpgm
;
; ---- Entry block -----------------------------------------------------------
bb:
; This value seeds the %i5 phi for the first loop iteration only.
%i = load <4 x i32>, ptr addrspace(5) %ptr5
; The call result is the element index used for the %i11 address in the loop.
%i1 = tail call i32 @_ZN25__hip_builtin_threadIdx_t7__get_xEv()
br label %bb4
; ---- Exit block ------------------------------------------------------------
bb2: ; preds = %bb4
; Combine the two loop accumulators and store them through %ptr5, the same
; pointer the loop body stores %i23 through.
%i3 = or <4 x i32> %i17, %i13
store <4 x i32> %i3, ptr addrspace(5) %ptr5
ret void
; ---- Loop block (its own latch) --------------------------------------------
bb4: ; preds = %bb4, %bb
; %i5: the entry-block load on the first iteration, zero on every later one.
; %i6: 0 on the first iteration, afterwards the i32 loaded through %ptr at the
;      end of the previous iteration (%i24).
; %i7: 0 on the first iteration, afterwards the previous %i7 OR-ed with 1.
; %i8: running OR of the %i16 loads.
; %i9: running OR of the %i12 loads.
%i5 = phi <4 x i32> [ %i, %bb ], [ zeroinitializer, %bb4 ]
%i6 = phi i32 [ 0, %bb ], [ %i24, %bb4 ]
%i7 = phi i32 [ 0, %bb ], [ %i25, %bb4 ]
%i8 = phi <4 x i32> [ zeroinitializer, %bb ], [ %i17, %bb4 ]
%i9 = phi <4 x i32> [ zeroinitializer, %bb ], [ %i13, %bb4 ]
; Load 1: index %i1. Every getelementptr here is typed i64, so indices are
; scaled by 8 bytes even though each load reads a 16-byte <4 x i32>. The
; operands of this address (%i1, %ptr1) are defined outside the loop.
%i10 = zext i32 %i1 to i64
%i11 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i10
%i12 = load <4 x i32>, ptr addrspace(1) %i11
%i13 = or <4 x i32> %i12, %i9
; Load 2: index %i6, i.e. an index that itself comes from memory.
%i14 = zext i32 %i6 to i64
%i15 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i14
%i16 = load <4 x i32>, ptr addrspace(1) %i15
%i17 = or <4 x i32> %i16, %i8
; Load 3: index %i7.
%i18 = zext i32 %i7 to i64
%i19 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i18
%i20 = load <4 x i32>, ptr addrspace(1) %i19
; Load 4: directly through %ptr1 with no offset.
%i21 = load <4 x i32>, ptr addrspace(1) %ptr1
; Value stored through %ptr5 this iteration: %i20 | %i5 | %i21.
%i22 = or <4 x i32> %i5, %i21
%i23 = or <4 x i32> %i20, %i22
store <4 x i32> %i23, ptr addrspace(5) %ptr5
; Generic-pointer load that feeds %i6 on the next iteration.
%i24 = load i32, ptr %ptr
%i25 = or i32 %i7, 1
; %arg true leaves the loop for %bb2; %arg false runs %bb4 again.
br i1 %arg, label %bb2, label %bb4
}
|