diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/load-constant-i1.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 578 |
1 files changed, 317 insertions, 261 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 83c240c..81e407d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -78,6 +78,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX1250-LABEL: constant_load_i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -161,6 +162,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: constant_load_v2i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -241,6 +243,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: constant_load_v3i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -322,6 +325,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: constant_load_v4i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -403,6 +407,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: constant_load_v8i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -484,6 +489,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v16i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -549,6 +555,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v32i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -617,6 +624,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v64i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -689,6 +697,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX1250-LABEL: constant_zextload_i1_to_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -766,6 +775,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX1250-LABEL: constant_sextload_i1_to_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -840,6 +850,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -917,6 +928,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1002,6 +1014,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1088,6 +1101,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1184,6 +1198,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v3, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1281,6 +1296,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1378,6 +1394,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v4, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1480,6 +1497,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1611,7 +1629,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 7, v0 ; GFX12-NEXT: v_bfe_u32 v2, v0, 6, 1 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v4, s6 ; GFX12-NEXT: v_mov_b32_e32 v6, s7 ; GFX12-NEXT: s_clause 0x1 @@ -1621,6 +1639,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v8, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1767,6 +1786,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1989,6 +2009,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v16, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2222,6 +2243,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u16 s2, s[2:3], 0x0 @@ -2629,6 +2651,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -3100,6 +3123,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -3835,7 +3859,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v10, s53 :: v_dual_mov_b32 v13, s20 ; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v15, s19 ; GFX12-NEXT: v_dual_mov_b32 v14, s52 :: v_dual_mov_b32 v17, s18 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s17 ; GFX12-NEXT: v_dual_mov_b32 v18, s50 :: v_dual_mov_b32 v21, s16 ; GFX12-NEXT: v_dual_mov_b32 v20, s49 :: v_dual_mov_b32 v23, s15 @@ -3870,6 +3894,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -4731,7 +4756,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v10, s41 :: v_dual_mov_b32 v13, s38 ; GFX12-NEXT: v_dual_mov_b32 v12, s39 :: v_dual_mov_b32 v15, s36 ; GFX12-NEXT: v_dual_mov_b32 v14, s37 :: v_dual_mov_b32 v17, s35 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s33 ; GFX12-NEXT: v_dual_mov_b32 v18, s34 :: v_dual_mov_b32 v21, s30 ; GFX12-NEXT: v_dual_mov_b32 v20, s31 :: v_dual_mov_b32 v23, s28 @@ -4766,6 +4791,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -4970,6 +4996,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX1250-LABEL: constant_zextload_i1_to_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -5052,6 +5079,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX1250-LABEL: constant_sextload_i1_to_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5134,6 +5162,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -5216,6 +5245,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5311,6 +5341,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5412,6 +5443,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v4, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5530,6 +5562,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v5, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5661,6 +5694,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v5, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5787,11 +5821,11 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-NEXT: s_and_b32 s2, s2, 1 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX12-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -5799,6 +5833,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5948,6 +5983,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v9, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6135,6 +6171,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6340,7 +6377,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_lshr_b32 s10, s3, 2 ; GFX12-NEXT: s_lshr_b32 s12, s3, 3 ; GFX12-NEXT: s_lshr_b32 s14, s3, 1 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX12-NEXT: v_bfe_i32 v12, v9, 0, 1 @@ -6348,7 +6385,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 @@ -6367,6 +6404,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v16, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6473,85 +6511,84 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10009 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x1000d -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10007 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10003 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10001 -; GFX8-NEXT: s_and_b32 s8, s2, 1 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10002 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10004 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10006 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x1000c +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10009 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x1000d +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10007 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10003 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x10001 +; GFX8-NEXT: s_and_b32 s9, s2, 1 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10002 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10004 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10006 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x1000c ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x1000a -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_bfe_u32 v4, v2, 11, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 15, v2 +; GFX8-NEXT: v_bfe_u32 v12, v2, 5, 1 +; GFX8-NEXT: v_bfe_u32 v6, v2, 14, 1 +; GFX8-NEXT: v_bfe_u32 v0, v2, 8, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NEXT: v_bfe_u32 v2, v4, 11, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 15, v4 -; GFX8-NEXT: v_bfe_u32 v14, v4, 5, 1 -; GFX8-NEXT: v_bfe_u32 v8, v4, 14, 1 -; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 1 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v0, s11 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, v12 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v16i1_to_v16i64: @@ -6640,61 +6677,63 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX12-NEXT: global_load_u16 v0, v3, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v4, 0xffff, v0 -; GFX12-NEXT: v_mov_b32_e32 v11, v1 +; GFX12-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_and_b32 v6, 0xffff, v0 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: v_bfe_u32 v2, v4, 11, 1 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: v_bfe_u32 v4, v6, 11, 1 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 +; GFX12-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-NEXT: v_bfe_u32 v0, v6, 8, 1 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c -; GFX12-NEXT: v_mov_b32_e32 v5, v1 -; GFX12-NEXT: v_bfe_u32 v6, v4, 5, 1 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:80 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX12-NEXT: v_lshrrev_b32_e32 v8, 15, v6 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_mov_b32_e32 v4, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007 +; GFX12-NEXT: global_store_b128 v3, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006 -; GFX12-NEXT: v_mov_b32_e32 v9, v1 -; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10002 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_bfe_u32 v0, v6, 5, 1 +; GFX12-NEXT: v_bfe_u32 v6, v6, 14, 1 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:96 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_mov_b32_e32 v4, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10004 +; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:48 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v2, s3 -; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004 -; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 -; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001 -; GFX12-NEXT: v_lshrrev_b32_e32 v10, 15, v4 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-NEXT: v_mov_b32_e32 v2, v6 -; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10003 -; GFX12-NEXT: s_and_b32 s2, s2, 1 -; GFX12-NEXT: v_bfe_u32 v8, v4, 14, 1 -; GFX12-NEXT: v_bfe_u32 v4, v4, 8, 1 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v4, v0 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:32 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: v_mov_b32_e32 v6, s3 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: v_mov_b32_e32 v2, s5 -; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_mov_b32_e32 v4, s3 +; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX12-NEXT: s_and_b32 s2, s2, 1 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:16 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v4, s3 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v3, v[6:9], s[0:1] offset:112 +; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7062,7 +7101,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7 ; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s4 ; GFX12-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v4, s6 @@ -7100,6 +7139,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v32, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7606,91 +7646,91 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001c ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001b ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001a ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10019 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10017 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10016 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10015 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10012 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000f ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000b ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000a ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10008 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10005 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001 ; GFX12-NEXT: s_and_b32 s2, s2, 1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -7698,6 +7738,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -8495,6 +8536,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -9465,191 +9507,191 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10015 ; GFX12-NEXT: s_lshr_b32 s4, s3, 31 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1001d ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001c ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1001b ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001a ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:480 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10019 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:464 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10017 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10016 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10013 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10012 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10011 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000f ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000d ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000c ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:368 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000b ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000a ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:352 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10009 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10008 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:336 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10007 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10006 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:320 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10005 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10004 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:304 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10003 ; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10002 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:288 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s5 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10001 ; GFX12-NEXT: s_and_b32 s3, s3, 1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s3, s2, 31 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001d ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001c ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001b ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001a ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10019 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10017 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10016 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10015 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10012 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10010 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000f ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000e ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000b ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000a ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10008 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10005 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003 ; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001 ; GFX12-NEXT: s_and_b32 s2, s2, 1 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -9657,175 +9699,188 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10014 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10015 ; GFX1250-NEXT: s_lshr_b32 s4, s3, 31 -; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 +; GFX1250-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, v3 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001e -; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10004 -; GFX1250-NEXT: s_and_b32 s7, s2, 1 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 +; GFX1250-NEXT: s_bfe_u32 s6, s3, 0x10002 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:416 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001d ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001c -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:496 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001b ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001a -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:480 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:480 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10019 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10018 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:464 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:464 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10017 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10016 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:448 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10013 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10012 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:432 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10011 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10010 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:400 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000f ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000e -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:384 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000d ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000c -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:368 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:368 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000b ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000a -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:352 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:352 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10009 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10008 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:336 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:336 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10007 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10006 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:320 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:320 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10005 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10004 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:304 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:304 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10003 -; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10002 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:288 +; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 +; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10003 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s3, 0x10001 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:288 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_lshr_b32 s4, s2, 31 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001e -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272 +; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v4, s5 +; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:272 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001d -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001c -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: s_lshr_b32 s3, s2, 31 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001e +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001d +; GFX1250-NEXT: global_store_b128 v3, v[0:3], s[0:1] offset:256 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001b -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001a -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001c +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:240 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10019 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10018 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001b +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001a +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:224 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10017 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10016 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10019 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10018 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:208 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10014 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10015 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10017 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10016 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:192 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10013 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10012 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160 +; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10014 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10015 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:176 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10011 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10010 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144 +; GFX1250-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v4, s4 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10013 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10012 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:160 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000f -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000e -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10011 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10010 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:144 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000d -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000c -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000f +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000e +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:128 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000b -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000a -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000d +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000c +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:112 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10009 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10008 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000b +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000a +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:96 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10007 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10006 -; GFX1250-NEXT: v_mov_b32_e32 v7, v1 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10009 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10008 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:80 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 -; GFX1250-NEXT: s_mov_b32 s4, s3 -; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10001 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10007 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10006 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:64 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 ; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005 -; GFX1250-NEXT: v_mov_b32_e32 v6, s5 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10004 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:48 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 ; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003 -; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10001 -; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10002 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10002 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:32 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3 -; GFX1250-NEXT: s_and_b64 s[2:3], s[4:5], 1 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v4, s3 +; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:16 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s6 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:256 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v4, s3 +; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] ; GFX1250-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = zext <64 x i1> %load to <64 x i64> @@ -11148,7 +11203,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s44, s10, 4 ; GFX12-NEXT: s_lshr_b32 s30, s10, 2 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 @@ -11163,7 +11218,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v14, s91 :: v_dual_mov_b32 v15, s98 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_dual_mov_b32 v16, s99 :: v_dual_mov_b32 v17, s96 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s97 :: v_dual_mov_b32 v19, s94 @@ -11180,14 +11235,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:96 ; GFX12-NEXT: v_dual_mov_b32 v1, s78 :: v_dual_mov_b32 v2, s79 ; GFX12-NEXT: v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v4, s75 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_mov_b32_e32 v5, s66 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v6, s67 :: v_dual_mov_b32 v7, s62 ; GFX12-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s56 ; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v10, s57 :: v_dual_mov_b32 v11, s52 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: v_dual_mov_b32 v12, s53 :: v_dual_mov_b32 v13, s44 ; GFX12-NEXT: v_dual_mov_b32 v14, s45 :: v_dual_mov_b32 v15, s40 ; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30 @@ -11206,6 +11261,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 |
