; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-EXPAND %s ; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-NOEXPAND %s ; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-EXPAND %s ; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-NOEXPAND %s ; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-EXPAND %s ; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-NOEXPAND %s ; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-EXPAND %s ; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-NOEXPAND %s ; When "amdgpu-expand-waitcnt-profiling" is enabled (the EXPAND runs above substitute it for the ATTRS placeholder; llc itself is given no extra flag) and there are N outstanding ; operations, instead of emitting a single waitcnt(target), we emit: ; waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target) ; ; This allows PC-sampling profilers to identify which specific operation ; is causing a stall by observing where the program counter is stuck. 
; Three scalar (SMEM) loads feeding two adds. The EXPAND and NOEXPAND checks below are identical on every target (a single lgkmcnt(0) / s_wait_kmcnt 0x0 before the adds), so no staged waits are emitted here. NOTE(review): presumably because SMEM results may return out of order -- confirm against the waitcnt insertion pass.
define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) #0 { ; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads: ; GFX9-EXPAND: ; %bb.0: ; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-EXPAND-NEXT: s_endpgm ; ; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: ; GFX9-NOEXPAND: ; %bb.0: ; GFX9-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-NOEXPAND-NEXT: s_endpgm ; ; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads: ; GFX10-EXPAND: ; %bb.0: ; GFX10-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-EXPAND-NEXT: global_store_dword v0, v1, 
s[14:15] ; GFX10-EXPAND-NEXT: s_endpgm ; ; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: ; GFX10-NOEXPAND: ; %bb.0: ; GFX10-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] ; GFX10-NOEXPAND-NEXT: s_endpgm ; ; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads: ; GFX11-EXPAND: ; %bb.0: ; GFX11-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s2 ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] ; GFX11-EXPAND-NEXT: s_endpgm ; ; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: ; GFX11-NOEXPAND: ; %bb.0: ; GFX11-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, s0 ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] ; GFX11-NOEXPAND-NEXT: s_endpgm ; ; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads: ; GFX12-EXPAND: ; %bb.0: ; GFX12-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 ; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s2 ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] ; GFX12-EXPAND-NEXT: s_endpgm ; ; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: ; GFX12-NOEXPAND: ; %bb.0: ; GFX12-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s2 ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] ; GFX12-NOEXPAND-NEXT: s_endpgm %val_a = load i32, ptr addrspace(4) %ptr_a, align 4 %val_b = load i32, ptr addrspace(4) %ptr_b, align 4 %val_c = load i32, ptr addrspace(4) %ptr_c, align 4 %sum1 = add i32 %val_a, %val_b %sum2 = add i32 %sum1, %val_c store i32 %sum2, ptr addrspace(1) %out, align 4 ret void } ; Three global loads at thread-varying addresses feeding one add. With expansion the single vmcnt(0) (s_wait_loadcnt 0x0 on GFX12) before the add becomes a descending 2, 1, 0 sequence of waits; the NOEXPAND output keeps the single wait. define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) #0 { ; GFX9-EXPAND-LABEL: test_vmcnt_global_loads: ; GFX9-EXPAND: ; %bb.0: ; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 ; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 ; GFX9-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 ; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(2) ; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(1) ; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) ; GFX9-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-EXPAND-NEXT: s_endpgm ; ; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads: ; GFX9-NOEXPAND: ; %bb.0: ; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 ; GFX9-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 ; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) ; GFX9-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NOEXPAND-NEXT: s_endpgm ; ; GFX10-EXPAND-LABEL: test_vmcnt_global_loads: ; GFX10-EXPAND: ; %bb.0: ; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: s_clause 0x2 ; GFX10-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 ; GFX10-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 ; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(2) ; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(1) ; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(0) ; GFX10-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-EXPAND-NEXT: s_endpgm ; ; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads: ; GFX10-NOEXPAND: ; %bb.0: ; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: s_clause 0x2 ; GFX10-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 ; GFX10-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 ; GFX10-NOEXPAND-NEXT: s_waitcnt vmcnt(0) ; GFX10-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NOEXPAND-NEXT: s_endpgm ; ; GFX11-EXPAND-LABEL: test_vmcnt_global_loads: ; GFX11-EXPAND: ; %bb.0: ; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_clause 0x2 ; GFX11-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 ; GFX11-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 ; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(2) ; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(1) ; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(0) ; GFX11-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-EXPAND-NEXT: s_endpgm ; ; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads: ; GFX11-NOEXPAND: ; %bb.0: ; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: s_clause 0x2 ; GFX11-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 ; GFX11-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 ; GFX11-NOEXPAND-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NOEXPAND-NEXT: s_endpgm ; ; GFX12-EXPAND-LABEL: 
test_vmcnt_global_loads: ; GFX12-EXPAND: ; %bb.0: ; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_clause 0x2 ; GFX12-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX12-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 ; GFX12-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 ; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x2 ; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x1 ; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x0 ; GFX12-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-EXPAND-NEXT: s_endpgm ; ; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads: ; GFX12-NOEXPAND: ; %bb.0: ; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOEXPAND-NEXT: s_clause 0x2 ; GFX12-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX12-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 ; GFX12-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 ; GFX12-NOEXPAND-NEXT: s_wait_loadcnt 0x0 ; GFX12-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-NOEXPAND-NEXT: s_endpgm ; Use thread ID to create thread-varying addresses -> forces vector loads %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid64 = zext i32 %tid to i64 ; Three separate global loads with thread-varying addresses ; Non-volatile loads allow multiple operations to be in-flight %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 %val0 = load i32, ptr addrspace(1) %ptr0, align 4 %offset1 = add i64 %tid64, 64 %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 %val1 = load i32, ptr addrspace(1) 
%ptr1, align 4 %offset2 = add i64 %tid64, 128 %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 %val2 = load i32, ptr addrspace(1) %ptr2, align 4 %sum1 = add i32 %val0, %val1 %sum2 = add i32 %sum1, %val2 %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64 store i32 %sum2, ptr addrspace(1) %out_ptr, align 4 ret void } declare i32 @llvm.amdgcn.workitem.id.x() ; Three LDS loads, selected as one two-address read (ds_read2_b32 / ds_load_2addr_b32) plus one single read. The checks show the same lgkmcnt(1), lgkmcnt(0) waits (s_wait_dscnt 0x1, 0x0 on GFX12) with and without expansion, so expansion adds no extra waits here. define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) #0 { ; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations: ; GFX9-EXPAND: ; %bb.0: ; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-EXPAND-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-EXPAND-NEXT: s_endpgm ; ; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: ; GFX9-NOEXPAND: ; %bb.0: ; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NOEXPAND-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NOEXPAND-NEXT: s_endpgm ; ; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations: ; GFX10-EXPAND: ; %bb.0: ; GFX10-EXPAND-NEXT: s_clause 0x1 ; GFX10-EXPAND-NEXT: s_load_dword s2, 
s[4:5], 0x24 ; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX10-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-EXPAND-NEXT: s_endpgm ; ; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: ; GFX10-NOEXPAND: ; %bb.0: ; GFX10-NOEXPAND-NEXT: s_clause 0x1 ; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX10-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NOEXPAND-NEXT: s_endpgm ; ; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations: ; GFX11-EXPAND: ; %bb.0: ; GFX11-EXPAND-NEXT: s_clause 0x1 ; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 ; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2 ; 
GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-EXPAND-NEXT: s_endpgm ; ; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: ; GFX11-NOEXPAND: ; %bb.0: ; GFX11-NOEXPAND-NEXT: s_clause 0x1 ; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 ; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) ; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2 ; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NOEXPAND-NEXT: s_endpgm ; ; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations: ; GFX12-EXPAND: ; %bb.0: ; GFX12-EXPAND-NEXT: s_clause 0x1 ; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 ; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 ; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1 ; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 ; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2 ; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-EXPAND-NEXT: s_endpgm ; ; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: ; GFX12-NOEXPAND: ; %bb.0: ; GFX12-NOEXPAND-NEXT: s_clause 0x1 ; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 ; GFX12-NOEXPAND-NEXT: 
ds_load_b32 v2, v2 offset:8 ; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1 ; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2 ; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NOEXPAND-NEXT: s_endpgm %ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0 %ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 %ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2 %val0 = load i32, ptr addrspace(3) %ptr0, align 4 %val1 = load i32, ptr addrspace(3) %ptr1, align 4 %val2 = load i32, ptr addrspace(3) %ptr2, align 4 %sum1 = add i32 %val0, %val1 %sum2 = add i32 %sum1, %val2 store i32 %sum2, ptr addrspace(1) %out, align 4 ret void } ; NOTE(review): despite its name, this kernel only issues two scalar loads; none of its checks contains a vmcnt/loadcnt wait, and the EXPAND output matches the NOEXPAND output. Add a vector-memory load (and regenerate the checks) if combined vmcnt+lgkmcnt coverage is intended. define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) #0 { ; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: ; GFX9-EXPAND: ; %bb.0: ; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-EXPAND-NEXT: s_endpgm ; ; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: ; GFX9-NOEXPAND: ; %bb.0: ; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: s_add_i32 
s0, s4, s5 ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NOEXPAND-NEXT: s_endpgm ; ; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: ; GFX10-EXPAND: ; %bb.0: ; GFX10-EXPAND-NEXT: s_clause 0x1 ; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-EXPAND-NEXT: s_endpgm ; ; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: ; GFX10-NOEXPAND: ; %bb.0: ; GFX10-NOEXPAND-NEXT: s_clause 0x1 ; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NOEXPAND-NEXT: s_endpgm ; ; GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: ; GFX11-EXPAND: ; %bb.0: ; GFX11-EXPAND-NEXT: s_clause 0x1 ; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-EXPAND-NEXT: 
global_store_b32 v0, v1, s[4:5] ; GFX11-EXPAND-NEXT: s_endpgm ; ; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: ; GFX11-NOEXPAND: ; %bb.0: ; GFX11-NOEXPAND-NEXT: s_clause 0x1 ; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NOEXPAND-NEXT: s_endpgm ; ; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: ; GFX12-EXPAND: ; %bb.0: ; GFX12-EXPAND-NEXT: s_clause 0x1 ; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 ; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-EXPAND-NEXT: s_endpgm ; ; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: ; GFX12-NOEXPAND: ; %bb.0: ; GFX12-NOEXPAND-NEXT: s_clause 0x1 ; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX12-NOEXPAND-NEXT: global_store_b32 
v0, v1, s[4:5] ; GFX12-NOEXPAND-NEXT: s_endpgm %scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4 %scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4 %result = add i32 %scalar_val1, %scalar_val2 store i32 %result, ptr addrspace(1) %out, align 4 ret void } ; Test that expansion is NOT applied when counters are out-of-order (mixed event types). ; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete ; out-of-order relative to each other. When both are in-flight, we should NOT expand ; because the expansion would be misleading. On GFX12 the checks instead show the LDS and SMEM results waited on through separate s_wait_dscnt / s_wait_kmcnt instructions, identical with and without expansion. define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) #0 { ; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem: ; GFX9-EXPAND: ; %bb.0: ; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 ; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-EXPAND-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-EXPAND-NEXT: s_endpgm ; ; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem: ; GFX9-NOEXPAND: ; %bb.0: ; GFX9-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 ; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NOEXPAND-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NOEXPAND-NEXT: s_endpgm 
; ; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem: ; GFX10-EXPAND: ; %bb.0: ; GFX10-EXPAND-NEXT: s_clause 0x1 ; GFX10-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 ; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-EXPAND-NEXT: s_endpgm ; ; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem: ; GFX10-NOEXPAND: ; %bb.0: ; GFX10-NOEXPAND-NEXT: s_clause 0x1 ; GFX10-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 ; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NOEXPAND-NEXT: s_endpgm ; ; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem: ; GFX11-EXPAND: ; %bb.0: ; GFX11-EXPAND-NEXT: s_clause 0x1 ; GFX11-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 ; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 
v0, s0, v0 ; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-EXPAND-NEXT: s_endpgm ; ; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem: ; GFX11-NOEXPAND: ; %bb.0: ; GFX11-NOEXPAND-NEXT: s_clause 0x1 ; GFX11-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 ; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0 ; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NOEXPAND-NEXT: s_endpgm ; ; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem: ; GFX12-EXPAND: ; %bb.0: ; GFX12-EXPAND-NEXT: s_clause 0x1 ; GFX12-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 ; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 ; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 ; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0 ; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-EXPAND-NEXT: s_endpgm ; ; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem: ; GFX12-NOEXPAND: ; %bb.0: ; GFX12-NOEXPAND-NEXT: s_clause 0x1 ; GFX12-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 ; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 ; 
GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 ; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0 ; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NOEXPAND-NEXT: s_endpgm %lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4 %smem_val = load i32, ptr addrspace(4) %smem_ptr, align 4 %lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 %lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4 %sum1 = add i32 %lds_val1, %lds_val2 %sum2 = add i32 %sum1, %smem_val store i32 %sum2, ptr addrspace(1) %out, align 4 ret void } define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) #0 { ; Test vector memory stores (vmcnt on GFX9, STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+) ; GFX9-EXPAND-LABEL: test_vscnt_global_stores: ; GFX9-EXPAND: ; %bb.0: ; %entry ; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256 ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 3 ; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512 ; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) ; GFX9-EXPAND-NEXT: s_endpgm ; ; GFX9-NOEXPAND-LABEL: test_vscnt_global_stores: ; GFX9-NOEXPAND: ; %bb.0: ; %entry ; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256 ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 3 ; GFX9-NOEXPAND-NEXT: global_store_dword v0, 
v1, s[0:1] offset:512 ; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) ; GFX9-NOEXPAND-NEXT: s_endpgm ; ; GFX10-EXPAND-LABEL: test_vscnt_global_stores: ; GFX10-EXPAND: ; %bb.0: ; %entry ; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, 2 ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v3, 3 ; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256 ; GFX10-EXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512 ; GFX10-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-EXPAND-NEXT: s_endpgm ; ; GFX10-NOEXPAND-LABEL: test_vscnt_global_stores: ; GFX10-NOEXPAND: ; %bb.0: ; %entry ; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, 2 ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v3, 3 ; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256 ; GFX10-NOEXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512 ; GFX10-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NOEXPAND-NEXT: s_endpgm ; ; GFX11-EXPAND-LABEL: test_vscnt_global_stores: ; GFX11-EXPAND: ; %bb.0: ; %entry ; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 ; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-EXPAND-NEXT: s_clause 0x2 ; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 ; GFX11-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 ; 
GFX11-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-EXPAND-NEXT: s_endpgm ; ; GFX11-NOEXPAND-LABEL: test_vscnt_global_stores: ; GFX11-NOEXPAND: ; %bb.0: ; %entry ; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 ; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOEXPAND-NEXT: s_clause 0x2 ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 ; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 ; GFX11-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NOEXPAND-NEXT: s_endpgm ; ; GFX12-EXPAND-LABEL: test_vscnt_global_stores: ; GFX12-EXPAND: ; %bb.0: ; %entry ; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 ; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 ; GFX12-EXPAND-NEXT: s_clause 0x2 ; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 ; GFX12-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 ; GFX12-EXPAND-NEXT: global_wb scope:SCOPE_SYS ; GFX12-EXPAND-NEXT: s_wait_storecnt 0x0 ; GFX12-EXPAND-NEXT: s_endpgm ; ; GFX12-NOEXPAND-LABEL: test_vscnt_global_stores: ; GFX12-NOEXPAND: ; %bb.0: ; %entry ; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 ; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 
; GFX12-NOEXPAND-NEXT: s_clause 0x2 ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 ; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 ; GFX12-NOEXPAND-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NOEXPAND-NEXT: s_wait_storecnt 0x0 ; GFX12-NOEXPAND-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid64 = zext i32 %tid to i64 ; Issue three stores to %buf[tid], %buf[tid + 64] and %buf[tid + 128] %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 store i32 1, ptr addrspace(1) %ptr0, align 4 %offset1 = add i64 %tid64, 64 %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 store i32 2, ptr addrspace(1) %ptr1, align 4 %offset2 = add i64 %tid64, 128 %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 store i32 3, ptr addrspace(1) %ptr2, align 4 ; Memory fence forces a wait for all stores (each check block above has one wait to zero before s_endpgm) fence release ret void } define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) #0 { ; Test export operations (EXP_CNT/expcnt); none of the check blocks below contains an expcnt wait ; GFX9-EXPAND-LABEL: test_expcnt_exports: ; GFX9-EXPAND: ; %bb.0: ; %entry ; GFX9-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX9-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 ; GFX9-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 ; GFX9-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 ; GFX9-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done ; GFX9-EXPAND-NEXT: s_endpgm ; ; GFX9-NOEXPAND-LABEL: test_expcnt_exports: ; GFX9-NOEXPAND: ; %bb.0: ; %entry ; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX9-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 ; GFX9-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 ; GFX9-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 ; GFX9-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done ; GFX9-NOEXPAND-NEXT: s_endpgm ; ; GFX10-EXPAND-LABEL: test_expcnt_exports: ; GFX10-EXPAND: ; %bb.0: ; %entry ; GFX10-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX10-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 ; GFX10-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 ; GFX10-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 ; GFX10-EXPAND-NEXT: exp param0 
v4, v4, v4, v4 done ; GFX10-EXPAND-NEXT: s_endpgm ; ; GFX10-NOEXPAND-LABEL: test_expcnt_exports: ; GFX10-NOEXPAND: ; %bb.0: ; %entry ; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX10-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 ; GFX10-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 ; GFX10-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 ; GFX10-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done ; GFX10-NOEXPAND-NEXT: s_endpgm ; ; GFX11-EXPAND-LABEL: test_expcnt_exports: ; GFX11-EXPAND: ; %bb.0: ; %entry ; GFX11-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX11-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 ; GFX11-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 ; GFX11-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 ; GFX11-EXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done ; GFX11-EXPAND-NEXT: s_endpgm ; ; GFX11-NOEXPAND-LABEL: test_expcnt_exports: ; GFX11-NOEXPAND: ; %bb.0: ; %entry ; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX11-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 ; GFX11-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 ; GFX11-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 ; GFX11-NOEXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done ; GFX11-NOEXPAND-NEXT: s_endpgm ; ; GFX12-EXPAND-LABEL: test_expcnt_exports: ; GFX12-EXPAND: ; %bb.0: ; %entry ; GFX12-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX12-EXPAND-NEXT: export mrt0 v0, v1, v2, v3 ; GFX12-EXPAND-NEXT: export mrt1 v3, v2, v1, v0 ; GFX12-EXPAND-NEXT: export mrt2 v0, v3, v1, v2 ; GFX12-EXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done ; GFX12-EXPAND-NEXT: s_endpgm ; ; GFX12-NOEXPAND-LABEL: test_expcnt_exports: ; GFX12-NOEXPAND: ; %bb.0: ; %entry ; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX12-NOEXPAND-NEXT: export mrt0 v0, v1, v2, v3 ; GFX12-NOEXPAND-NEXT: export mrt1 v3, v2, v1, v0 ; GFX12-NOEXPAND-NEXT: export mrt2 v0, v3, v1, v2 ; GFX12-NOEXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done ; GFX12-NOEXPAND-NEXT: s_endpgm entry: ; Three MRT exports (targets 0-2), each passing the inputs in a different order call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false) 
call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %w, float %z, float %y, float %x, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 2, i32 15, float %x, float %w, float %y, float %z, i1 false, i1 false) ; Final export with done bit; target 32 is printed as param0 on GFX9/GFX10 and as invalid_target_32 on GFX11/GFX12 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 true, i1 false) ret void } declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) attributes #0 = { nounwind ATTRS }