aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/load-constant-i1.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i1.ll578
1 files changed, 317 insertions, 261 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 83c240c..81e407d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -78,6 +78,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
;
; GFX1250-LABEL: constant_load_i1:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -161,6 +162,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX1250-LABEL: constant_load_v2i1:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -241,6 +243,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX1250-LABEL: constant_load_v3i1:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -322,6 +325,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX1250-LABEL: constant_load_v4i1:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -403,6 +407,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX1250-LABEL: constant_load_v8i1:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -484,6 +489,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX1250-LABEL: constant_load_v16i1:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -549,6 +555,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX1250-LABEL: constant_load_v32i1:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
@@ -617,6 +624,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX1250-LABEL: constant_load_v64i1:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -689,6 +697,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX1250-LABEL: constant_zextload_i1_to_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -766,6 +775,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX1250-LABEL: constant_sextload_i1_to_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -840,6 +850,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v1i1_to_v1i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -917,6 +928,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v1i1_to_v1i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -1002,6 +1014,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v2i1_to_v2i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -1088,6 +1101,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v2i1_to_v2i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -1184,6 +1198,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v3i1_to_v3i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v3, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -1281,6 +1296,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v3i1_to_v3i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -1378,6 +1394,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v4i1_to_v4i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v4, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -1480,6 +1497,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v4i1_to_v4i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -1611,7 +1629,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 7, v0
; GFX12-NEXT: v_bfe_u32 v2, v0, 6, 1
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v4, s6
; GFX12-NEXT: v_mov_b32_e32 v6, s7
; GFX12-NEXT: s_clause 0x1
@@ -1621,6 +1639,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v8i1_to_v8i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v8, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -1767,6 +1786,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v8i1_to_v8i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -1989,6 +2009,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_zextload_v16i1_to_v16i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v16, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -2222,6 +2243,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_sextload_v16i1_to_v16i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u16 s2, s[2:3], 0x0
@@ -2629,6 +2651,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
@@ -3100,6 +3123,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_sextload_v32i1_to_v32i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
@@ -3835,7 +3859,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v10, s53 :: v_dual_mov_b32 v13, s20
; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v15, s19
; GFX12-NEXT: v_dual_mov_b32 v14, s52 :: v_dual_mov_b32 v17, s18
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s17
; GFX12-NEXT: v_dual_mov_b32 v18, s50 :: v_dual_mov_b32 v21, s16
; GFX12-NEXT: v_dual_mov_b32 v20, s49 :: v_dual_mov_b32 v23, s15
@@ -3870,6 +3894,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
@@ -4731,7 +4756,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v10, s41 :: v_dual_mov_b32 v13, s38
; GFX12-NEXT: v_dual_mov_b32 v12, s39 :: v_dual_mov_b32 v15, s36
; GFX12-NEXT: v_dual_mov_b32 v14, s37 :: v_dual_mov_b32 v17, s35
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s33
; GFX12-NEXT: v_dual_mov_b32 v18, s34 :: v_dual_mov_b32 v21, s30
; GFX12-NEXT: v_dual_mov_b32 v20, s31 :: v_dual_mov_b32 v23, s28
@@ -4766,6 +4791,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_sextload_v64i1_to_v64i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
@@ -4970,6 +4996,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX1250-LABEL: constant_zextload_i1_to_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -5052,6 +5079,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX1250-LABEL: constant_sextload_i1_to_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -5134,6 +5162,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v1i1_to_v1i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
@@ -5216,6 +5245,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v1i1_to_v1i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -5311,6 +5341,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v2i1_to_v2i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -5412,6 +5443,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v2i1_to_v2i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v4, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -5530,6 +5562,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v3i1_to_v3i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v5, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -5661,6 +5694,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v3i1_to_v3i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v5, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -5787,11 +5821,11 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_and_b32 s2, s2, 1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
; GFX12-NEXT: s_and_b32 s2, 0xffff, s2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -5799,6 +5833,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v4i1_to_v4i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -5948,6 +5983,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v4i1_to_v4i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v9, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -6135,6 +6171,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_zextload_v8i1_to_v8i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -6340,7 +6377,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: s_lshr_b32 s10, s3, 2
; GFX12-NEXT: s_lshr_b32 s12, s3, 3
; GFX12-NEXT: s_lshr_b32 s14, s3, 1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
; GFX12-NEXT: v_bfe_i32 v12, v9, 0, 1
@@ -6348,7 +6385,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
@@ -6367,6 +6404,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX1250-LABEL: constant_sextload_v8i1_to_v8i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v16, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -6473,85 +6511,84 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, v3
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v11, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10009
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x1000d
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x10007
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10003
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10001
-; GFX8-NEXT: s_and_b32 s8, s2, 1
-; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10002
-; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10004
-; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10006
-; GFX8-NEXT: s_bfe_u32 s12, s2, 0x1000c
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10009
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x1000d
+; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10007
+; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10003
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x10001
+; GFX8-NEXT: s_and_b32 s9, s2, 1
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10002
+; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10004
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10006
+; GFX8-NEXT: s_bfe_u32 s13, s2, 0x1000c
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x1000a
-; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_bfe_u32 v4, v2, 11, 1
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 15, v2
+; GFX8-NEXT: v_bfe_u32 v12, v2, 5, 1
+; GFX8-NEXT: v_bfe_u32 v6, v2, 14, 1
+; GFX8-NEXT: v_bfe_u32 v0, v2, 8, 1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_mov_b32_e32 v10, s2
; GFX8-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NEXT: v_bfe_u32 v2, v4, 11, 1
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[2:5]
+; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v10, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 15, v4
-; GFX8-NEXT: v_bfe_u32 v14, v4, 5, 1
-; GFX8-NEXT: v_bfe_u32 v8, v4, 14, 1
-; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 1
+; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, s13
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NEXT: v_mov_b32_e32 v4, v12
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NEXT: v_mov_b32_e32 v2, v14
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s9
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s10
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i1_to_v16i64:
@@ -6640,61 +6677,63 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v3, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v4, 0xffff, v0
-; GFX12-NEXT: v_mov_b32_e32 v11, v1
+; GFX12-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_and_b32 v6, 0xffff, v0
+; GFX12-NEXT: v_mov_b32_e32 v9, v3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: v_bfe_u32 v2, v4, 11, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT: v_bfe_u32 v4, v6, 11, 1
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009
+; GFX12-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-NEXT: v_bfe_u32 v0, v6, 8, 1
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c
-; GFX12-NEXT: v_mov_b32_e32 v5, v1
-; GFX12-NEXT: v_bfe_u32 v6, v4, 5, 1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:80
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d
+; GFX12-NEXT: v_lshrrev_b32_e32 v8, 15, v6
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT: v_mov_b32_e32 v4, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007
+; GFX12-NEXT: global_store_b128 v3, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006
-; GFX12-NEXT: v_mov_b32_e32 v9, v1
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10002
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: v_bfe_u32 v0, v6, 5, 1
+; GFX12-NEXT: v_bfe_u32 v6, v6, 14, 1
+; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:96
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: v_mov_b32_e32 v4, s3
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10004
+; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002
+; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:48
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009
-; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001
-; GFX12-NEXT: v_lshrrev_b32_e32 v10, 15, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, v6
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10003
-; GFX12-NEXT: s_and_b32 s2, s2, 1
-; GFX12-NEXT: v_bfe_u32 v8, v4, 14, 1
-; GFX12-NEXT: v_bfe_u32 v4, v4, 8, 1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_mov_b32_e32 v4, v0
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:32
; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: v_mov_b32_e32 v2, s5
-; GFX12-NEXT: s_clause 0x2
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT: v_mov_b32_e32 v4, s3
+; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_mov_b32_e32 v4, s3
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v3, v[6:9], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1]
; GFX12-NEXT: s_endpgm
;
; GFX1250-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -7062,7 +7101,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s4
; GFX12-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v4, s6
@@ -7100,6 +7139,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_sextload_v16i1_to_v16i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v32, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
@@ -7606,91 +7646,91 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001c
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001b
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001a
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10019
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10017
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10016
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10015
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10012
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000f
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000e
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000b
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000a
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10008
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10005
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -7698,6 +7738,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_zextload_v32i1_to_v32i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
@@ -8495,6 +8536,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_sextload_v32i1_to_v32i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
@@ -9465,191 +9507,191 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10015
; GFX12-NEXT: s_lshr_b32 s4, s3, 31
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001e
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1001d
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001c
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1001b
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1001a
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:480
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10019
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:464
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10017
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10016
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10013
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10012
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10011
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000f
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000e
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000d
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000c
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:368
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x1000b
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x1000a
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:352
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10009
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10008
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:336
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10007
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10006
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:320
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10005
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10004
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:304
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10003
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10002
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:288
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x10001
; GFX12-NEXT: s_and_b32 s3, s3, 1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272
; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 31
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001e
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001d
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001c
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1001b
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1001a
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10019
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10017
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10016
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10015
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10012
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000f
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000e
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000b
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000a
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10008
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10007
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10006
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10005
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10002
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -9657,175 +9699,188 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_zextload_v64i1_to_v64i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10014
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10015
; GFX1250-NEXT: s_lshr_b32 s4, s3, 31
-; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, v3
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001e
-; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10004
-; GFX1250-NEXT: s_and_b32 s7, s2, 1
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX1250-NEXT: s_bfe_u32 s6, s3, 0x10002
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:416
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001d
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001c
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:496
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001b
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001a
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:480
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:480
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10019
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:464
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:464
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10017
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10016
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:448
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10013
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10012
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:432
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10011
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10010
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:400
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000f
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000e
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:384
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000d
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000c
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:368
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:368
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000b
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000a
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:352
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:352
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10009
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10008
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:336
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:336
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10007
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10006
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:320
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:320
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10005
; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10004
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:304
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:304
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10003
-; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10002
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:288
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10003
+; GFX1250-NEXT: s_mov_b32 s4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s3, 0x10001
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:288
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_lshr_b32 s4, s2, 31
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001e
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272
+; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v4, s5
+; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:272
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001d
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001c
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: s_lshr_b32 s3, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001e
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001d
+; GFX1250-NEXT: global_store_b128 v3, v[0:3], s[0:1] offset:256
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001b
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001a
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001c
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:240
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10019
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10018
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001b
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001a
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:224
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10017
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10016
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10019
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:208
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10014
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10015
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10017
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10016
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:192
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10013
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10012
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10015
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:176
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10011
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10010
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX1250-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10012
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:160
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000f
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000e
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10011
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10010
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:144
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000d
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000c
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000f
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000e
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:128
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000b
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000a
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000d
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000c
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:112
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10009
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10008
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000b
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000a
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:96
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10007
-; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10006
-; GFX1250-NEXT: v_mov_b32_e32 v7, v1
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10009
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10008
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:80
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
-; GFX1250-NEXT: s_mov_b32 s4, s3
-; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10001
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10007
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10006
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005
-; GFX1250-NEXT: v_mov_b32_e32 v6, s5
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10004
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:48
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
-; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10001
-; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10002
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10002
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:32
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
-; GFX1250-NEXT: s_and_b64 s[2:3], s[4:5], 1
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v4, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:16
; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s6
-; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:256
-; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v4, s3
+; GFX1250-NEXT: global_store_b128 v3, v[2:5], s[0:1]
; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = zext <64 x i1> %load to <64 x i64>
@@ -11148,7 +11203,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s44, s10, 4
; GFX12-NEXT: s_lshr_b32 s30, s10, 2
; GFX12-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000
; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000
; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000
@@ -11163,7 +11218,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000
; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v14, s91 :: v_dual_mov_b32 v15, s98
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v16, s99 :: v_dual_mov_b32 v17, s96
; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v18, s97 :: v_dual_mov_b32 v19, s94
@@ -11180,14 +11235,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:96
; GFX12-NEXT: v_dual_mov_b32 v1, s78 :: v_dual_mov_b32 v2, s79
; GFX12-NEXT: v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v4, s75
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v5, s66
; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v6, s67 :: v_dual_mov_b32 v7, s62
; GFX12-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s56
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v10, s57 :: v_dual_mov_b32 v11, s52
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v12, s53 :: v_dual_mov_b32 v13, s44
; GFX12-NEXT: v_dual_mov_b32 v14, s45 :: v_dual_mov_b32 v15, s40
; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30
@@ -11206,6 +11261,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b64 s[10:11], s[2:3], 0x0