; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck %s -check-prefix=GFX950

; External callee, only declared in this file. The mangled name suggests the
; HIP threadIdx.x getter; its i32 result is used below as an index into %ptr1.
declare hidden i32 @_ZN25__hip_builtin_threadIdx_t7__get_xEv()

; Before #177352 this test showed poor scheduling due to register pressure
; problems. The symptom was that two global_load instructions were immediately
; followed by s_waitcnt vmcnt(0).
;
; The assertions below pin the expected schedule for the loop block (%bb4):
; each of its two global_load_dwordx4 instructions is followed by other work,
; and the loop's s_waitcnt vmcnt(0) is reached only after further VALU
; instructions, a scratch store and a flat load.
;
; Kernel arguments as used by the IR:
;   %arg  - loop exit condition; %bb4 branches to %bb2 when it is true.
;   %ptr  - generic pointer; an i32 is loaded from it once per iteration.
;   %ptr1 - addrspace(1) pointer; source of every <4 x i32> load in the loop.
;   %ptr5 - addrspace(5) pointer; read once in %bb, stored to in %bb4 and %bb2.
;
; The assertion lines are regenerated by the script named in the NOTE line at
; the top of the file and should not be edited by hand.
define amdgpu_kernel void @main(i1 %arg, ptr %ptr, ptr addrspace(1) %ptr1, ptr addrspace(5) %ptr5) {
; GFX950-LABEL: main:
; GFX950: ; %bb.0: ; %bb
; GFX950-NEXT: s_load_dword s33, s[4:5], 0x3c
; GFX950-NEXT: s_mov_b32 s14, s10
; GFX950-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX950-NEXT: s_load_dword s6, s[4:5], 0x24
; GFX950-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x2c
; GFX950-NEXT: s_mov_b32 s12, s8
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: scratch_load_dwordx4 v[40:43], off, s33
; GFX950-NEXT: s_mov_b32 s13, s9
; GFX950-NEXT: s_bitcmp1_b32 s6, 0
; GFX950-NEXT: s_cselect_b64 s[34:35], -1, 0
; GFX950-NEXT: s_add_u32 s8, s4, 64
; GFX950-NEXT: s_addc_u32 s9, s5, 0
; GFX950-NEXT: s_getpc_b64 s[16:17]
; GFX950-NEXT: s_add_u32 s16, s16, _ZN25__hip_builtin_threadIdx_t7__get_xEv@rel32@lo+4
; GFX950-NEXT: s_addc_u32 s17, s17, _ZN25__hip_builtin_threadIdx_t7__get_xEv@rel32@hi+12
; GFX950-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX950-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v31, v0
; GFX950-NEXT: s_mov_b32 s32, 0
; GFX950-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX950-NEXT: v_mov_b32_e32 v1, 0
; GFX950-NEXT: v_lshl_add_u64 v[10:11], v[0:1], 3, s[38:39]
; GFX950-NEXT: global_load_dwordx4 v[2:5], v[10:11], off
; GFX950-NEXT: global_load_dwordx4 v[6:9], v1, s[38:39]
; GFX950-NEXT: v_mov_b32_e32 v0, 0
; GFX950-NEXT: v_mov_b64_e32 v[10:11], s[36:37]
; GFX950-NEXT: s_and_b64 vcc, exec, s[34:35]
; GFX950-NEXT: v_mov_b32_e32 v12, v1
; GFX950-NEXT: v_mov_b32_e32 v14, v1
; GFX950-NEXT: v_mov_b32_e32 v15, v1
; GFX950-NEXT: v_mov_b32_e32 v16, v1
; GFX950-NEXT: v_mov_b32_e32 v18, v1
; GFX950-NEXT: v_mov_b32_e32 v17, v1
; GFX950-NEXT: v_mov_b32_e32 v19, v1
; GFX950-NEXT: v_mov_b32_e32 v20, v1
; GFX950-NEXT: v_mov_b32_e32 v21, v1
; GFX950-NEXT: .LBB0_1: ; %bb4
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: v_mov_b32_e32 v13, v1
; GFX950-NEXT: v_lshlrev_b64 v[22:23], 3, v[12:13]
; GFX950-NEXT: v_lshl_add_u64 v[22:23], s[38:39], 0, v[22:23]
; GFX950-NEXT: global_load_dwordx4 v[22:25], v[22:23], off
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: v_lshl_add_u64 v[26:27], v[0:1], 3, s[38:39]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: v_or_b32_e32 v0, v40, v6
; GFX950-NEXT: v_or_b32_e32 v13, v41, v7
; GFX950-NEXT: v_or_b32_e32 v30, v42, v8
; GFX950-NEXT: v_or_b32_e32 v31, v43, v9
; GFX950-NEXT: global_load_dwordx4 v[26:29], v[26:27], off
; GFX950-NEXT: v_or_b32_e32 v18, v5, v18
; GFX950-NEXT: v_or_b32_e32 v16, v4, v16
; GFX950-NEXT: v_or_b32_e32 v15, v3, v15
; GFX950-NEXT: v_or_b32_e32 v14, v2, v14
; GFX950-NEXT: v_or_b32_e32 v12, 1, v12
; GFX950-NEXT: v_mov_b32_e32 v40, 0
; GFX950-NEXT: v_mov_b32_e32 v41, 0
; GFX950-NEXT: v_mov_b32_e32 v42, 0
; GFX950-NEXT: v_mov_b32_e32 v43, 0
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: v_or_b32_e32 v25, v25, v31
; GFX950-NEXT: v_or_b32_e32 v24, v24, v30
; GFX950-NEXT: v_or_b32_e32 v23, v23, v13
; GFX950-NEXT: v_or_b32_e32 v22, v22, v0
; GFX950-NEXT: scratch_store_dwordx4 off, v[22:25], s33
; GFX950-NEXT: flat_load_dword v0, v[10:11]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_or_b32_e32 v21, v29, v21
; GFX950-NEXT: v_or_b32_e32 v20, v28, v20
; GFX950-NEXT: v_or_b32_e32 v19, v27, v19
; GFX950-NEXT: v_or_b32_e32 v17, v26, v17
; GFX950-NEXT: s_mov_b64 vcc, vcc
; GFX950-NEXT: s_cbranch_vccz .LBB0_1
; GFX950-NEXT: ; %bb.2: ; %bb2
; GFX950-NEXT: v_or_b32_e32 v3, v21, v18
; GFX950-NEXT: v_or_b32_e32 v2, v20, v16
; GFX950-NEXT: v_or_b32_e32 v1, v19, v15
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: v_or_b32_e32 v0, v17, v14
; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s33
; GFX950-NEXT: s_endpgm
bb:
  ; Entry block: read the initial value of the %i5 accumulator from %ptr5 and
  ; obtain the index %i1 that the loop uses to address %ptr1.
  %i = load <4 x i32>, ptr addrspace(5) %ptr5
  %i1 = tail call i32 @_ZN25__hip_builtin_threadIdx_t7__get_xEv()
  br label %bb4

bb2: ; preds = %bb4
  ; Exit block: OR the two running accumulators from the loop together and
  ; store the combined vector to %ptr5.
  %i3 = or <4 x i32> %i17, %i13
  store <4 x i32> %i3, ptr addrspace(5) %ptr5
  ret void

bb4: ; preds = %bb4, %bb
  ; Self-looping block; it branches back to itself until %arg is true.
  ; %i5: value loaded in %bb on the first iteration, zero on every later one.
  %i5 = phi <4 x i32> [ %i, %bb ], [ zeroinitializer, %bb4 ]
  ; %i6: 0 on entry, afterwards the i32 loaded from %ptr in the previous
  ; iteration (%i24).
  %i6 = phi i32 [ 0, %bb ], [ %i24, %bb4 ]
  ; %i7: 0 on entry, afterwards the previous value OR'ed with 1 (%i25).
  %i7 = phi i32 [ 0, %bb ], [ %i25, %bb4 ]
  ; %i8 / %i9: running OR accumulators for the %i16 and %i12 loads; both are
  ; consumed in %bb2 through %i17 and %i13.
  %i8 = phi <4 x i32> [ zeroinitializer, %bb ], [ %i17, %bb4 ]
  %i9 = phi <4 x i32> [ zeroinitializer, %bb ], [ %i13, %bb4 ]
  ; Load from %ptr1 indexed by %i1 (i64-sized elements). The address depends
  ; only on values defined outside the loop.
  %i10 = zext i32 %i1 to i64
  %i11 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i10
  %i12 = load <4 x i32>, ptr addrspace(1) %i11
  %i13 = or <4 x i32> %i12, %i9
  ; Load from %ptr1 indexed by %i6, i.e. by the value read from %ptr in the
  ; previous iteration.
  %i14 = zext i32 %i6 to i64
  %i15 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i14
  %i16 = load <4 x i32>, ptr addrspace(1) %i15
  %i17 = or <4 x i32> %i16, %i8
  ; Load from %ptr1 indexed by %i7.
  %i18 = zext i32 %i7 to i64
  %i19 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i18
  %i20 = load <4 x i32>, ptr addrspace(1) %i19
  ; Load from %ptr1 itself, with no index applied.
  %i21 = load <4 x i32>, ptr addrspace(1) %ptr1
  ; OR %i5, %i21 and %i20 together and store the result to %ptr5 on every
  ; iteration.
  %i22 = or <4 x i32> %i5, %i21
  %i23 = or <4 x i32> %i20, %i22
  store <4 x i32> %i23, ptr addrspace(5) %ptr5
  ; Values carried into the next iteration through the %i6 and %i7 phis.
  %i24 = load i32, ptr %ptr
  %i25 = or i32 %i7, 1
  br i1 %arg, label %bb2, label %bb4
}