aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/load-global-i16.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i16.ll715
1 files changed, 345 insertions, 370 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d0..38800628 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
@@ -5985,14 +5985,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v3, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -6011,23 +6010,22 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
@@ -6044,11 +6042,10 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
@@ -6362,7 +6359,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
@@ -6372,7 +6368,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16
@@ -6407,32 +6403,31 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -6451,12 +6446,11 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v11, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
@@ -6980,27 +6974,25 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v6, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v5
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v7
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v7
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v7, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v11, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1
@@ -7011,22 +7003,22 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -7070,57 +7062,55 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v7
+; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[7:10]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6
+; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10]
; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v9, v11, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
-; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v6, v14, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10]
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -7135,62 +7125,60 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v3
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v7
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v6, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v3, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v6, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v4, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v11, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v7, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v10, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v12, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v14, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v3, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v8, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v20, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v16i16_to_v16i64:
@@ -8119,19 +8107,17 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v15, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
@@ -8140,11 +8126,10 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
@@ -8153,35 +8138,34 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v7
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v7, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v5
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v5
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v5, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v11
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v11
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v11, 0, 16
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v14, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v20, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v22, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16
@@ -8240,8 +8224,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
@@ -8249,161 +8233,157 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v15
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v15
+; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18]
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[15:18]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v13, v14, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16]
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v11
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v11
+; GCN-HSA-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14]
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2
+; GCN-HSA-NEXT: v_bfe_i32 v11, v9, 0, 16
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v9
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v9
+; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14]
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
-; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v14, v19, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[8:11]
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[12:15]
+; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4
-; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11]
-; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1
+; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
+; GCN-HSA-NEXT: v_bfe_i32 v26, v27, 0, 16
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4
+; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v19, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: v_bfe_i32 v10, v19, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5
+; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 0, 16
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v7
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64:
@@ -8416,115 +8396,110 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[9:12], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v6, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2)
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v15, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v15, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v6, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v16, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v13, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v17
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v17, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v18, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2)
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v4, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v16, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:160
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v14, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v6, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4)
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v11
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v9, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v12
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v19, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v8, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v10
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v10, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8)
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v12
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v8, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v7, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v10, 0, 16
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v10
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v4
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v5, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v17, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v16, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v1, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v4, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v19, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v7, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v3, 0, 16
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 31, v28
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
;