; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
; For gfx1010, overestimate the branch size in case we need to insert
; a nop for the buggy offset.
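; Rough size bookkeeping (a sketch, not checked by FileCheck; the byte counts
; come from the inline comments in the function body below): bb2 is about 240
; bytes of inline asm plus s_sleep (+4) and the compare (+4), roughly 248
; bytes total. With -amdgpu-s-branch-bits=7 a short s_cbranch can only reach
; about 63 * 4 = 252 bytes forward, so bb2 sits just under the limit.
; gfx1030 keeps the short branch over bb2; gfx1010 also budgets for the s_nop
; that the 0x3f-offset hardware workaround may require, which pushes the size
; estimate past the limit and forces the branch to be relaxed into the
; s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64 sequence checked below.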
; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
; GFX1030: s_cmp_lg_u32
; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
; GFX1010: s_cmp_lg_u32
; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
; GFX1010: s_getpc_b64
; GFX1010-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
; GFX1010-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295
; GFX1010-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB]]-[[POST_GETPC]])>>32
; GFX1010: [[RELAX_BB]]:
; GCN: v_nop
; GCN: s_sleep
; GCN: s_cbranch_scc1
; GCN: [[ENDBB]]:
; GCN: global_store_{{dword|b32}}
define amdgpu_kernel void @long_forward_scc_branch_3f_offset_bug(ptr addrspace(1) %arg, i32 %cnd0) #0 {
bb0:
  %cmp0 = icmp eq i32 %cnd0, 0
  br i1 %cmp0, label %bb2, label %bb3

bb2:
  %val = call i32 asm sideeffect
   "s_mov_b32 $0, 0
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", "=s"() ; 20 * 12 = 240
  call void @llvm.amdgcn.s.sleep(i32 0) ; +4 = 244
  %cmp1 = icmp eq i32 %val, 0 ; +4 = 248
  br i1 %cmp1, label %bb2, label %bb3 ; +4 (gfx1030), +8 with workaround (gfx1010)

bb3:
  store volatile i32 %cnd0, ptr addrspace(1) %arg
  ret void
}
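
; The variant below is the same pattern with the loop-carried value produced
; in a VGPR ("=v" constraint), so the inner compare updates the exec mask
; (v_cmpx on gfx1030, v_cmp plus s_and_saveexec on gfx1010) and the branch
; that must be relaxed is the execz/execnz branch over bb2 rather than an SCC
; branch. Both targets are expected to emit the long-branch s_getpc_b64
; sequence here, as the shared checks below require.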
; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
; GFX1030: s_mov_b32
; GFX1030: v_cmpx_eq_u32
; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
; GFX1010: v_cmp_eq_u32
; GFX1010: s_and_saveexec_b32
; GFX1010-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
; GCN: s_getpc_b64
; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295
; GCN-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB]]-[[POST_GETPC]])>>32
; GCN: [[RELAX_BB]]:
; GCN: v_nop
; GCN: s_sleep
; GCN: s_cbranch_execz
; GCN: [[ENDBB]]:
; GCN: global_store_{{dword|b32}}
define void @long_forward_exec_branch_3f_offset_bug(ptr addrspace(1) %arg, i32 %cnd0) #0 {
bb0:
  %cmp0 = icmp eq i32 %cnd0, 0
  br i1 %cmp0, label %bb2, label %bb3

bb2:
  %val = call i32 asm sideeffect
   "v_mov_b32 $0, 0
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", "=v"() ; 20 * 12 = 240
  call void @llvm.amdgcn.s.sleep(i32 0) ; +4 = 244
  %cmp1 = icmp eq i32 %val, 0 ; +4 = 248
  br i1 %cmp1, label %bb2, label %bb3 ; +4 (gfx1030), +8 with workaround (gfx1010)

bb3:
  store volatile i32 %cnd0, ptr addrspace(1) %arg
  ret void
}
declare void @llvm.amdgcn.s.sleep(i32 immarg)

; Attribute group #0 is referenced by the functions above; nounwind assumed.
attributes #0 = { nounwind }