; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mcpu=gfx900 < %s | FileCheck %s

; Check that we do not assert when trying to optimize the
; s_and_b64 + s_cbranch_vccz pattern.
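;
; The folded sequence looks roughly like this (illustrative sketch only,
; not checked by this test):
;   s_and_b64 vcc, exec, <cond>
;   s_cbranch_vccz <target>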
target triple = "amdgcn-amd-amdhsa"
declare hidden void @func(<4 x i8>, <2 x i8>)
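; @issue176578 loops in %bb1, computing a condition from llvm.amdgcn.msad.u8
; and llvm.amdgcn.perm; it either calls @func in %bb3 and continues looping or
; exits through %bb4, returning the selected byte in element 0 of a <4 x i8>.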
define <4 x i8> @issue176578() #0 {
; CHECK-LABEL: issue176578:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v41, s16, 15
; CHECK-NEXT: v_writelane_b32 v41, s30, 0
; CHECK-NEXT: v_writelane_b32 v41, s31, 1
; CHECK-NEXT: v_writelane_b32 v41, s34, 2
; CHECK-NEXT: v_writelane_b32 v41, s35, 3
; CHECK-NEXT: v_writelane_b32 v41, s36, 4
; CHECK-NEXT: v_writelane_b32 v41, s37, 5
; CHECK-NEXT: v_writelane_b32 v41, s38, 6
; CHECK-NEXT: v_writelane_b32 v41, s39, 7
; CHECK-NEXT: v_writelane_b32 v41, s48, 8
; CHECK-NEXT: v_writelane_b32 v41, s49, 9
; CHECK-NEXT: v_writelane_b32 v41, s50, 10
; CHECK-NEXT: v_writelane_b32 v41, s51, 11
; CHECK-NEXT: v_writelane_b32 v41, s52, 12
; CHECK-NEXT: v_writelane_b32 v41, s53, 13
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v41, s54, 14
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
; CHECK-NEXT: s_mov_b32 s52, s13
; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_mov_b32 s54, 0
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: s_branch .LBB0_2
; CHECK-NEXT: .LBB0_1: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; CHECK-NEXT: s_cbranch_vccz .LBB0_4
; CHECK-NEXT: .LBB0_2: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_and_b64 vcc, exec, -1
; CHECK-NEXT: s_cselect_b32 s54, 0, s54
; CHECK-NEXT: s_mov_b64 s[4:5], -1
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.3: ; %bb3
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, func@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, func@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s53
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s51
; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_branch .LBB0_1
; CHECK-NEXT: .LBB0_4: ; %bb4
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: v_mov_b32_e32 v0, s54
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_readlane_b32 s54, v41, 14
; CHECK-NEXT: v_readlane_b32 s53, v41, 13
; CHECK-NEXT: v_readlane_b32 s52, v41, 12
; CHECK-NEXT: v_readlane_b32 s51, v41, 11
; CHECK-NEXT: v_readlane_b32 s50, v41, 10
; CHECK-NEXT: v_readlane_b32 s49, v41, 9
; CHECK-NEXT: v_readlane_b32 s48, v41, 8
; CHECK-NEXT: v_readlane_b32 s39, v41, 7
; CHECK-NEXT: v_readlane_b32 s38, v41, 6
; CHECK-NEXT: v_readlane_b32 s37, v41, 5
; CHECK-NEXT: v_readlane_b32 s36, v41, 4
; CHECK-NEXT: v_readlane_b32 s35, v41, 3
; CHECK-NEXT: v_readlane_b32 s34, v41, 2
; CHECK-NEXT: v_readlane_b32 s31, v41, 1
; CHECK-NEXT: v_readlane_b32 s30, v41, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v41, 15
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
  br label %bb1

bb1:                                              ; preds = %bb3, %bb
  %phi = phi i8 [ 0, %bb ], [ %select, %bb3 ]
  %call = tail call i32 @llvm.amdgcn.msad.u8(i32 0, i32 0, i32 0)
  %call2 = tail call i32 @llvm.amdgcn.perm(i32 %call, i32 0, i32 0)
  %and = and i32 %call2, 1
  %icmp = icmp ne i32 %and, 1
  %select = select i1 %icmp, i8 0, i8 %phi
  br i1 %icmp, label %bb4, label %bb3

bb3:                                              ; preds = %bb1
  tail call void @func(<4 x i8> zeroinitializer, <2 x i8> zeroinitializer)
  br label %bb1

bb4:                                              ; preds = %bb1
  %insertelement = insertelement <4 x i8> zeroinitializer, i8 %select, i64 0
  ret <4 x i8> %insertelement
}
declare i32 @llvm.amdgcn.perm(i32, i32, i32) #1
declare i32 @llvm.amdgcn.msad.u8(i32, i32, i32) #2

attributes #0 = { nounwind }
attributes #1 = { nocallback nofree nounwind speculatable willreturn memory(none) }
attributes #2 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }