diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/load-local-i16.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 214 |
1 files changed, 105 insertions, 109 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index bd191a3..062a985 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 @@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19 @@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19 ; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20 ; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24 @@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 @@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17 @@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19 @@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; VI-DS128-NEXT: v_mov_b32_e32 v4, v3 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 @@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill -; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 @@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 ; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 +; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 +; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 @@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 @@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 ; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 +; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4360,16 +4348,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 ; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill ; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 |
