; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-NO-DS128 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-NO-DS128 %s ; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG %s ; Testing for ds_read/write_b128 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-DS128 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-DS128 %s define amdgpu_kernel void @local_load_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { ; Round-trips a single i16 through addrspace(3) memory (load from %in, store to %out); every target is expected to use one 16-bit read and one 16-bit write. ; SI-LABEL: local_load_i16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_u16 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b16 v1, v0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_load_i16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_u16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_write_b16 v1, v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_load_i16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_u16 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_write_b16 v1, v0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_load_i16: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 4, @0, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN entry: %ld = load i16, ptr addrspace(3) %in store i16 %ld, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_load_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { ; Round-trips a <2 x i16> vector; the checks expect it to be moved as one 32-bit read and one 32-bit write on every target. ; SI-LABEL: local_load_v2i16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b32 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_load_v2i16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_b32 v0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_write_b32 v1, v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_load_v2i16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_b32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_write_b32 v1, v0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_load_v2i16: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN entry: %ld = load <2 x i16>, ptr addrspace(3) %in store <2 x i16> %ld, ptr addrspace(3) %out ret void } define amdgpu_kernel void 
@local_load_v3i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { ; Round-trips a <3 x i16> vector (6 bytes). GCN targets read 64 bits and write back a dword plus a short at offset 4. On Evergreen the low dword (elements 0 and 1) has to be fetched with a full 32-bit LDS_READ_RET because it is stored back with a 32-bit LDS_WRITE; only element 2 at offset 4 uses the 16-bit read/write pair. ; SI-LABEL: local_load_v3i16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b32 v2, v0 ; SI-NEXT: ds_write_b16 v2, v1 offset:4 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_load_v3i16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_b64 v[0:1], v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_write_b16 v2, v1 offset:4 ; VI-NEXT: ds_write_b32 v2, v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_load_v3i16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_write_b16 v2, v1 offset:4 ; GFX9-NEXT: ds_write_b32 v2, v0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_load_v3i16: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 11, @2, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.Y, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in store <3 x i16> %ld, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_load_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { ; Round-trips a <4 x i16> vector as one 64-bit read/write on GCN and as two dword copies on Evergreen. ; 
SI-LABEL: local_load_v4i16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b64 v2, v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_load_v4i16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_b64 v[0:1], v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_write_b64 v2, v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_load_v4i16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_load_v4i16: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 11, @3, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN entry: %ld = load <4 x i16>, ptr addrspace(3) %in store <4 x i16> %ld, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_load_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { ; Round-trips an <8 x i16> vector (16 bytes). Without ds128 the GCN checks expect a ds_read2_b64/ds_write2_b64 pair; the +enable-ds128 runs expect ds_read_b128/ds_write_b128. Evergreen copies four dwords one at a time. ; SI-LABEL: local_load_v8i16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: 
ds_read2_b64 v[0:3], v0 offset1:1 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_load_v8i16: ; VI-NO-DS128: ; %bb.0: ; %entry ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_load_v8i16: ; GFX9-NO-DS128: ; %bb.0: ; %entry ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_load_v8i16: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 
12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_load_v8i16: ; VI-DS128: ; %bb.0: ; %entry ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_load_v8i16: ; GFX9-DS128: ; %bb.0: ; %entry ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-DS128-NEXT: s_endpgm entry: %ld = load <8 x i16>, ptr addrspace(3) %in store <8 x i16> %ld, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_load_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { ; Round-trips a <16 x i16> vector (32 bytes) as two 16-byte halves on GCN (two ds_read2_b64/ds_write2_b64 pairs, or two b128 accesses under +enable-ds128) and as eight dword copies on Evergreen. ; SI-LABEL: local_load_v16i16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_load_v16i16: ; VI-NO-DS128: ; %bb.0: ; %entry ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_load_v16i16: ; GFX9-NO-DS128: ; %bb.0: ; %entry ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_load_v16i16: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 53, @5, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 
0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_load_v16i16: ; VI-DS128: ; %bb.0: ; %entry ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:16 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_load_v16i16: ; GFX9-DS128: ; %bb.0: ; %entry ; 
GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:16 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] ; GFX9-DS128-NEXT: s_endpgm entry: %ld = load <16 x i16>, ptr addrspace(3) %in store <16 x i16> %ld, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_zextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; Zero-extends a loaded i16 to i32; the checks expect the extension to fold into an unsigned 16-bit read followed by a dword write, with no separate masking instruction. ; SI-LABEL: local_zextload_i16_to_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_u16 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_zextload_i16_to_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_u16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_write_b32 v1, v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_zextload_i16_to_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_u16 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_write_b32 v1, v0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_i16_to_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN %a = load i16, 
ptr addrspace(3) %in %ext = zext i16 %a to i32 store i32 %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; Sign-extends a loaded i16 to i32; GCN folds the extension into ds_read_i16, while Evergreen reads unsigned and sign-extends afterwards with BFE_INT. ; SI-LABEL: local_sextload_i16_to_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_i16 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_sextload_i16_to_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_i16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_write_b32 v1, v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_sextload_i16_to_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_i16 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_write_b32 v1, v0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_i16_to_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 6, @7, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.X, OQAP, ; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = sext i16 %a to i32 store i32 %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; Single-element vector form of the zero-extending load; the expected code is the same unsigned 16-bit read plus dword write as the scalar case. ; SI-LABEL: local_zextload_v1i16_to_v1i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 
m0, -1 ; SI-NEXT: ds_read_u16 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_zextload_v1i16_to_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_u16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_write_b32 v1, v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_zextload_v1i16_to_v1i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_u16 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_write_b32 v1, v0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v1i16_to_v1i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; Single-element vector form of the sign-extending load; the expected code is ds_read_i16 on GCN and an unsigned read plus BFE_INT on Evergreen, followed by a dword write. ; SI-LABEL: local_sextload_v1i16_to_v1i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_i16 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_sextload_v1i16_to_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_i16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: ds_write_b32 v1, v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_sextload_v1i16_to_v1i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_i16 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_write_b32 v1, v0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v1i16_to_v1i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.X, OQAP, ; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; Zero-extends <2 x i16> to <2 x i32>; a single dword read is split with a logical shift right by 16 (high element) and a 0xffff mask (low element), then stored as 64 bits on GCN or two dwords on Evergreen. ; SI-LABEL: local_zextload_v2i16_to_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: ds_write_b64 v2, v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_zextload_v2i16_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_b32 v0, v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: ds_write_b64 v2, v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_zextload_v2i16_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 
; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_b32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v2i16_to_v2i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 10, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.Y, OQAP, ; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN %load = load <2 x i16>, ptr addrspace(3) %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; Sign-extends <2 x i16> to <2 x i32>; on GCN a single dword read is split with v_ashrrev_i32 by 16 (high element) and v_bfe_i32 (low element), while Evergreen sign-extends both halves with BFE_INT. ; SI-LABEL: local_sextload_v2i16_to_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: ds_write_b64 v2, v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_sextload_v2i16_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_b32 v0, v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NEXT: ds_write_b64 v2, v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_sextload_v2i16_to_v2i32: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_b32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v2i16_to_v2i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.Y, OQAP, ; EG-NEXT: LSHR * T0.W, PV.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN %load = load <2 x i16>, ptr addrspace(3) %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) { ; Zero-extends <3 x i16> to <3 x i32>. GCN reads 64 bits and masks/shifts out the three elements; the tonga and gfx900 runs store them with one ds_write_b96, the default-cpu run with a b32 + b64 pair. Evergreen does three unsigned 16-bit reads and three dword writes. ; SI-LABEL: local_local_zextload_v3i16_to_v3i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; SI-NEXT: ds_write_b32 v4, v0 offset:8 ; SI-NEXT: ds_write_b64 v4, v[2:3] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_local_zextload_v3i16_to_v3i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, 
s1 ; VI-NEXT: ds_read_b64 v[0:1], v0 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: ds_write_b96 v3, v[0:2] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_local_zextload_v3i16_to_v3i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_local_zextload_v3i16_to_v3i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 18, @12, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.Z, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.Y, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in %ext = zext <3 x i16> %ld to <3 x i32> store <3 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) { ; Sign-extends <3 x i16> to <3 x i32>. GCN reads 64 bits and extends with v_ashrrev_i32/v_bfe_i32, storing b32 + b64 on the default-cpu run and one ds_write_b96 on tonga and gfx900. Evergreen does three unsigned 16-bit reads, sign-extends each with BFE_INT and writes three dwords. ; SI-LABEL: local_local_sextload_v3i16_to_v3i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; SI-NEXT: ds_write_b32 v4, v0 offset:8 ; SI-NEXT: ds_write_b64 v4, v[2:3] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_local_sextload_v3i16_to_v3i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_b64 v[3:4], v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v1, 16, v3 ; VI-NEXT: v_bfe_i32 v2, v4, 0, 16 ; VI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: ds_write_b96 v3, v[0:2] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_local_sextload_v3i16_to_v3i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_b64 v[3:4], v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v3 ; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 16 ; GFX9-NEXT: v_bfe_i32 v0, v3, 0, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_local_sextload_v3i16_to_v3i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 22, @13, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * 
T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in %ext = sext <3 x i16> %ld to <3 x i32> store <3 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; Zero-extends <4 x i16> to <4 x i32>; GCN reads 64 bits, shifts/masks each dword into two elements and stores the 16-byte result with ds_write2_b64 when ds128 is not enabled or with one ds_write_b128 under +enable-ds128. Evergreen reads two dwords and writes four. ; SI-LABEL: local_local_zextload_v4i16_to_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_local_zextload_v4i16_to_v4i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 ; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_local_zextload_v4i16_to_v4i32: ; 
GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_local_zextload_v4i16_to_v4i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 22, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_local_zextload_v4i16_to_v4i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-DS128-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_local_zextload_v4i16_to_v4i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(3) %out ret void } ; Sign-extension test - loads <4 x i16> from local memory (addrspace(3)) at %in, sign-extends it to <4 x i32> and stores the result to local memory at %out. On the amdgcn targets the checks expect one ds_read_b64, a v_ashrrev_i32/v_bfe_i32 pair per loaded dword, and one ds_write2_b64 (one ds_write_b128 under the +enable-ds128 RUN lines). The assertions are autogenerated - regenerate them with utils/update_llc_test_checks.py instead of editing by hand. define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v4i16_to_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v0 ; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v0, 0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v3, 16, v0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v1 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 ; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v1 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v4i16_to_v4i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 25, @15, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T1.Z, PV.Z, literal.x, ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x, ; EG-NEXT: 
ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v4i16_to_v4i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b64 v[4:5], v0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v5 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; VI-DS128-NEXT: v_bfe_i32 v2, v5, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v4, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b64 v[4:5], v0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v5 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v4, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(3) %out ret void } ; Zero-extension test - loads <8 x i16> from local memory (addrspace(3)) at %in, zero-extends it to <8 x i32> and stores the result to local memory at %out. On the amdgcn targets the checks expect one ds_read2_b64, a v_lshrrev_b32/v_and_b32 pair per loaded dword, and two ds_write2_b64 (one ds_read_b128 and two ds_write_b128 under the +enable-ds128 RUN lines). The assertions are autogenerated - regenerate them with utils/update_llc_test_checks.py instead of editing by hand. define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v8i16_to_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; SI-NEXT: v_mov_b32_e32 v12, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; SI-NEXT: 
v_lshrrev_b32_e32 v11, 16, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, s0 ; VI-NO-DS128-NEXT: ds_write2_b64 v2, v[0:1], v[8:9] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v2, v[4:5], v[6:7] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 
16, v3 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v10, v[0:1], v[8:9] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v10, v[4:5], v[6:7] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v8i16_to_v8i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 46, @16, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: AND_INT T1.W, T0.W, literal.x, ; EG-NEXT: MOV * T2.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 
; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v8i16_to_v8i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; VI-DS128-NEXT: v_mov_b32_e32 v0, s0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; VI-DS128-NEXT: ds_write_b128 v0, v[8:11] offset:16 ; VI-DS128-NEXT: ds_write_b128 v0, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[8:11] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr 
addrspace(3) %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(3) %out ret void } ; Sign-extension test - loads <8 x i16> from local memory (addrspace(3)) at %in, sign-extends it to <8 x i32> and stores the result to local memory at %out. On the amdgcn targets the checks expect one ds_read2_b64, a v_ashrrev_i32/v_bfe_i32 pair per loaded dword, and two ds_write2_b64 (one ds_read_b128 and two ds_write_b128 under the +enable-ds128 RUN lines). The assertions are autogenerated - regenerate them with utils/update_llc_test_checks.py instead of editing by hand. define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v8i16_to_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; SI-NEXT: v_mov_b32_e32 v12, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; SI-NEXT: v_bfe_i32 v4, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v6, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v8, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v3, 0, 16 ; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[8:9], v[10:11] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32: ; GFX9-NO-DS128: ; %bb.0: ; 
GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v8i16_to_v8i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 51, @17, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: LSHR * T1.W, T0.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: LSHR T1.Z, T0.W, literal.x, ; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T2.Z, T0.Y, literal.x, ; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 
16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.Z, T1.Y, literal.x, ; EG-NEXT: BFE_INT T1.W, T2.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, ; EG-NEXT: MOV * T2.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v8i16_to_v8i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v0, s0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; VI-DS128-NEXT: ds_write_b128 v0, v[8:11] offset:16 ; VI-DS128-NEXT: ds_write_b128 v0, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; 
GFX9-DS128-LABEL: local_sextload_v8i16_to_v8i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[8:11] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(3) %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(3) %out ret void } ; Zero-extension test - loads <16 x i16> from local memory (addrspace(3)) at %in, zero-extends it to <16 x i32> and stores the result to local memory at %out. On the amdgcn targets the checks expect two ds_read2_b64 consumed behind staged s_waitcnt lgkmcnt(1) and lgkmcnt(0), a v_lshrrev_b32/v_and_b32 pair per loaded dword, and four ds_write2_b64 (two ds_read_b128 and four ds_write_b128 under the +enable-ds128 RUN lines). The assertions are autogenerated - regenerate them with utils/update_llc_test_checks.py instead of editing by hand. define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v16i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v18, 
0xffff, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: 
s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v16i16_to_v16i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 94, @18, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) 
; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: MOV * T2.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: LSHR T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, ; EG-NEXT: MOV * T3.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T1.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Z, literal.x, ; 
EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v16i16_to_v16i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 ; VI-DS128-NEXT: ds_read_b128 
v[4:7], v4 offset:16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6 ; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48 ; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32 ; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 ; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6 ; 
GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v16i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v3 ; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v5 ; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v14, v2, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v7 ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v6 ; SI-NEXT: v_bfe_i32 v16, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v18, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v0, v7, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v6, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v4, v[14:15], 
v[12:13] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16 ; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 
v[4:7], v4 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v16i16_to_v16i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 95, @19, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV 
T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: MOV * T2.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: LSHR T2.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: LSHR * T3.Z, T2.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T2.W, T2.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T4.Z, T0.Y, literal.x, ; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T3.Z, T0.Z, literal.x, ; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T4.Z, T0.W, literal.x, ; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T3.Z, T1.Y, literal.x, ; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T4.Z, T1.Z, literal.x, ; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; 
EG-NEXT: LSHR T3.Z, T2.Z, literal.x, ; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: BFE_INT T1.W, T2.Y, 0.0, literal.x, ; EG-NEXT: MOV * T2.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: BFE_INT T1.W, T0.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ALU 7, @20, KC0[CB0:0-32], KC1[] ; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v16i16_to_v16i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; 
VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 ; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 ; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 ; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 ; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 ; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 ; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 
v14, v3, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 ; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v32i16_to_v32i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 ; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 ; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 ; SI-NEXT: s_waitcnt lgkmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v2 ; SI-NEXT: s_waitcnt lgkmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_mov_b32_e32 v32, s0 ; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 ; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 ; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 ; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; VI-NO-DS128-NEXT: 
v_and_b32_e32 v10, 0xffff, v2 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7 ; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 ; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 ; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; VI-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; VI-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7 ; VI-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 ; 
VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NO-DS128-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v32i16_to_v32i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 105, @21, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 
60(8.407791e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.W, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Y, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Z, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.W, OQAP, ; EG-NEXT: MOV * T4.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Y, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Z, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.W, OQAP, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV T5.Y, OQAP, ; EG-NEXT: LSHR T5.W, T4.W, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, 
literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: AND_INT T4.W, T4.W, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T5.Y, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T4.W, T5.Y, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T4.Z, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T4.Y, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T4.W, T4.Y, literal.x, ; EG-NEXT: MOV * T5.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T3.W, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T3.W, T3.W, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: LSHR T3.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) ; EG-NEXT: ALU 84, @22, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T4.W, 
T3.W, ; EG-NEXT: LSHR T3.W, T3.Y, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: LSHR T3.W, T2.W, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T2.W, T2.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T1.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 72(1.008935e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 
65535(9.183409e-41), 64(8.968310e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v32i16_to_v32i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v20, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v20 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32 ; VI-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-DS128-NEXT: 
v_lshrrev_b32_e32 v11, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 ; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 ; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 ; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112 ; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64 ; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80 ; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32 ; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48 ; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] ; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; 
GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v20, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v20 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32 ; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 ; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 ; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; GFX9-DS128-NEXT: v_and_b32_e32 
v18, 0xffff, v17 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] ; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v32i16_to_v32i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 ; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 ; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 ; SI-NEXT: s_waitcnt lgkmcnt(3) ; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3 ; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v2 ; SI-NEXT: v_bfe_i32 v16, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v18, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v20, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v22, v2, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(2) ; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v5 ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v4 ; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v7 ; SI-NEXT: v_bfe_i32 v4, v7, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v6 ; SI-NEXT: v_bfe_i32 v6, v6, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_ashrrev_i32_e32 v25, 16, 
v9 ; SI-NEXT: v_bfe_i32 v24, v9, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 ; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v11 ; SI-NEXT: v_bfe_i32 v26, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 ; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v13 ; SI-NEXT: v_bfe_i32 v28, v13, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 ; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 ; SI-NEXT: v_bfe_i32 v30, v15, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 ; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 ; SI-NEXT: v_mov_b32_e32 v32, s0 ; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 ; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 ; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 ; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v3, 
0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1 ; VI-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 ; VI-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3 ; VI-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7 ; VI-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 ; VI-NO-DS128-NEXT: 
ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX9-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v27, 16, v3 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v32i16_to_v32i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 101, @23, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: 
MOV T1.W, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.W, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Y, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Z, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.W, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Y, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Z, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.W, OQAP, ; EG-NEXT: LSHR * T5.W, T4.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T6.W ; EG-NEXT: MOV T5.Y, OQAP, ; EG-NEXT: LSHR T5.Z, T4.W, literal.x, ; EG-NEXT: BFE_INT T5.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T6.Z, T0.Y, literal.x, ; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, ; 
EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.Z, T0.Z, literal.x, ; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T6.Z, T0.W, literal.x, ; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.Z, T1.Y, literal.x, ; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T6.Z, T1.Z, literal.x, ; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.Z, T1.W, literal.x, ; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR * T6.Z, T2.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU 89, @24, KC0[CB0:0-32], KC1[] ; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.Z, T2.Z, literal.x, ; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T6.Z, T2.W, literal.x, ; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; 
EG-NEXT: LSHR T5.Z, T3.Y, literal.x, ; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T6.Z, T3.Z, literal.x, ; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.Z, T3.W, literal.x, ; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T6.Z, T4.Y, literal.x, ; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.Z, T5.Y, literal.x, ; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: BFE_INT T5.W, T4.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: BFE_INT T4.W, T4.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: BFE_INT T4.W, T0.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: BFE_INT T4.W, T0.Z, 0.0, literal.x, ; EG-NEXT: MOV * T5.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, 
literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T4.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T4.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T4.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44) ; EG-NEXT: ALU 16, @25, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, ; 
EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v32i16_to_v32i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v24, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v24 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16 ; VI-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0 ; VI-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4 ; VI-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16 ; VI-DS128-NEXT: 
v_bfe_i32 v27, v20, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6 ; VI-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16 ; VI-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96 ; VI-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112 ; VI-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64 ; VI-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80 ; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 ; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 ; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] ; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v24 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16 ; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23 ; GFX9-DS128-NEXT: 
v_ashrrev_i32_e32 v24, 16, v22 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4 ; GFX9-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6 ; GFX9-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] ; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v64i16_to_v64i32: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s15, 0xe8f000 ; SI-NEXT: s_add_u32 s12, s12, s11 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v24, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v24 offset0:8 offset1:9 ; SI-NEXT: ds_read2_b64 v[4:7], v24 
offset0:10 offset1:11 ; SI-NEXT: ds_read2_b64 v[12:15], v24 offset0:12 offset1:13 ; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:14 offset1:15 ; SI-NEXT: ds_read2_b64 v[20:23], v24 offset1:1 ; SI-NEXT: ds_read2_b64 v[16:19], v24 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[34:37], v24 offset0:4 offset1:5 ; SI-NEXT: ds_read2_b64 v[38:41], v24 offset0:6 offset1:7 ; SI-NEXT: s_waitcnt lgkmcnt(7) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 ; SI-NEXT: s_waitcnt lgkmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v1 ; SI-NEXT: buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_and_b32_e32 v32, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; SI-NEXT: s_waitcnt lgkmcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_and_b32_e32 v42, 0xffff, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt lgkmcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 ; SI-NEXT: v_and_b32_e32 v44, 0xffff, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 ; SI-NEXT: v_and_b32_e32 v46, 0xffff, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; 
SI-NEXT: s_waitcnt lgkmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 ; SI-NEXT: v_and_b32_e32 v48, 0xffff, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 ; SI-NEXT: v_and_b32_e32 v50, 0xffff, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt lgkmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 ; SI-NEXT: v_and_b32_e32 v52, 0xffff, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 ; SI-NEXT: v_and_b32_e32 v54, 0xffff, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v35 ; SI-NEXT: v_and_b32_e32 v56, 0xffff, v35 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 ; SI-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v37 ; SI-NEXT: v_and_b32_e32 v58, 0xffff, v37 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36 ; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v39 ; SI-NEXT: v_and_b32_e32 v60, 0xffff, v39 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 ; SI-NEXT: v_and_b32_e32 v38, 0xffff, v38 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 ; SI-NEXT: v_and_b32_e32 v62, 0xffff, v41 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40 ; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15 ; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13 ; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11 ; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3 ; SI-NEXT: 
ds_write2_b64 v0, v[20:21], v[48:49] offset1:1 ; SI-NEXT: ds_write2_b64 v0, v[10:11], v[46:47] offset0:30 offset1:31 ; SI-NEXT: ds_write2_b64 v0, v[8:9], v[44:45] offset0:28 offset1:29 ; SI-NEXT: ds_write2_b64 v0, v[14:15], v[42:43] offset0:26 offset1:27 ; SI-NEXT: ds_write2_b64 v0, v[12:13], v[6:7] offset0:24 offset1:25 ; SI-NEXT: ds_write2_b64 v0, v[4:5], v[2:3] offset0:22 offset1:23 ; SI-NEXT: ds_write2_b64 v0, v[24:25], v[32:33] offset0:20 offset1:21 ; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: 
v_lshrrev_b32_e32 v9, 16, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17 ; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19 ; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:6 offset1:7 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v22 ; VI-NO-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v22 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NO-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v21 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v30, 16, v24 ; VI-NO-DS128-NEXT: v_and_b32_e32 v29, 0xffff, v24 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v23 ; VI-NO-DS128-NEXT: v_and_b32_e32 v31, 0xffff, v23 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v18 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v17 ; VI-NO-DS128-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v20 ; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:8 offset1:9 ; VI-NO-DS128-NEXT: v_and_b32_e32 v37, 0xffff, v20 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v19 ; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:10 offset1:11 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v22 ; VI-NO-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v22 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v21 ; VI-NO-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v21 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v24 
; VI-NO-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v24 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v23 ; VI-NO-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v23 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v18 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v17 ; VI-NO-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v17 ; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:12 offset1:13 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19 ; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20 ; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 ; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24 ; VI-NO-DS128-NEXT: v_and_b32_e32 v59, 0xffff, v24 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NO-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v62, 16, v17 ; VI-NO-DS128-NEXT: v_and_b32_e32 v61, 0xffff, v17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[0:1], v[19:20] offset0:30 offset1:31 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[16:17], v[61:62] offset0:28 offset1:29 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[23:24], v[59:60] offset0:26 offset1:27 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[21:22], v[57:58] offset0:24 offset1:25 ; 
VI-NO-DS128-NEXT: ds_write2_b64 v18, v[55:56], v[53:54] offset0:22 offset1:23 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[51:52], v[49:50] offset0:20 offset1:21 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[47:48], v[45:46] offset0:18 offset1:19 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[43:44], v[41:42] offset0:16 offset1:17 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[39:40], v[37:38] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[35:36], v[33:34] offset0:12 offset1:13 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[31:32], v[29:30] offset0:10 offset1:11 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[27:28], v[25:26] offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[14:15], v[12:13] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[10:11], v[8:9] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[6:7], v[4:5] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload ; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[2:3], v[0:1] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NO-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1 ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; GFX9-NO-DS128-NEXT: 
buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: s_nop 0 ; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v14 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v17 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v15 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v14 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v18 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v18 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v20 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v23 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v23 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 
v0, s0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v43, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v19 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v44, 0xffff, v19 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v18 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v18 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v49, 16, v21 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v21 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v50, 0xffff, v20 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v53, 16, v23 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v52, 0xffff, v23 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v19 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 
v18, 0xffff, v18 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v21 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v21 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[24:25], v[38:39] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[36:37], v[34:35] offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[32:33], v[30:31] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[28:29], v[26:27] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX9-NO-DS128-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0) ; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v64i16_to_v64i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 116, @26, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV 
T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.W, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Y, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Z, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.W, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Y, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) ; 
EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Z, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.W, OQAP, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, ; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV T5.Y, OQAP, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV T5.Z, OQAP, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV T5.W, OQAP, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, ; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T6.W ; EG-NEXT: MOV T6.Y, OQAP, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, ; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T6.W ; EG-NEXT: MOV T6.Z, OQAP, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T6.W ; EG-NEXT: MOV T6.W, OQAP, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T7.W ; EG-NEXT: MOV T7.Y, OQAP, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, ; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T7.W ; EG-NEXT: MOV T7.Z, OQAP, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, ; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T7.W ; EG-NEXT: MOV T7.W, OQAP, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T8.W ; EG-NEXT: MOV T8.Y, OQAP, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T8.W ; EG-NEXT: MOV T8.Z, OQAP, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, ; 
EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T8.W ; EG-NEXT: MOV T8.W, OQAP, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T9.W ; EG-NEXT: MOV T9.Y, OQAP, ; EG-NEXT: MOV * T9.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T9.W ; EG-NEXT: MOV T9.Z, OQAP, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: ALU 95, @27, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_READ_RET * OQAP, T9.W ; EG-NEXT: MOV T9.W, OQAP, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T10.W ; EG-NEXT: MOV T10.Y, OQAP, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T10.W ; EG-NEXT: MOV T10.Z, OQAP, ; EG-NEXT: LSHR T10.W, T10.Y, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: AND_INT T10.W, T10.Y, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T10.W, T10.Z, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: AND_INT T10.W, T10.Z, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T10.W, T9.W, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: AND_INT T9.W, T9.W, literal.x, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: LSHR T9.W, T9.Z, literal.x, ; EG-NEXT: ADD_INT * T10.W, 
KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: AND_INT T9.W, T9.Z, literal.x, ; EG-NEXT: MOV * T10.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: LSHR T9.W, T9.Y, literal.x, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: AND_INT T9.W, T9.Y, literal.x, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: LSHR T9.W, T8.W, literal.x, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: AND_INT T8.W, T8.W, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T9.W, T8.W, ; EG-NEXT: LSHR T8.W, T8.Z, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) ; EG-NEXT: LDS_WRITE * T9.W, T8.W, ; EG-NEXT: AND_INT T8.W, T8.Z, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T9.W, T8.W, ; EG-NEXT: LSHR T8.W, T8.Y, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) ; EG-NEXT: LDS_WRITE * T9.W, T8.W, ; EG-NEXT: AND_INT T8.W, T8.Y, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T9.W, T8.W, ; EG-NEXT: LSHR T8.W, T7.W, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) ; EG-NEXT: LDS_WRITE * T9.W, T8.W, ; EG-NEXT: AND_INT T7.W, T7.W, literal.x, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43) ; EG-NEXT: LDS_WRITE * T8.W, T7.W, ; EG-NEXT: LSHR T7.W, 
T7.Z, literal.x, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) ; EG-NEXT: LDS_WRITE * T8.W, T7.W, ; EG-NEXT: AND_INT T7.W, T7.Z, literal.x, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) ; EG-NEXT: LDS_WRITE * T8.W, T7.W, ; EG-NEXT: LSHR T7.W, T7.Y, literal.x, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) ; EG-NEXT: LDS_WRITE * T8.W, T7.W, ; EG-NEXT: AND_INT * T7.W, T7.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: ALU 93, @28, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.x, ; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T8.W, T7.W, ; EG-NEXT: LSHR T7.W, T6.W, literal.x, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) ; EG-NEXT: LDS_WRITE * T8.W, T7.W, ; EG-NEXT: AND_INT T6.W, T6.W, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: LSHR T6.W, T6.Z, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: AND_INT T6.W, T6.Z, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: LSHR T6.W, T6.Y, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: AND_INT T6.W, T6.Y, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: LSHR T6.W, T5.W, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: AND_INT T5.W, T5.W, literal.x, ; 
EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.W, T5.Z, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: AND_INT T5.W, T5.Z, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.W, T5.Y, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: AND_INT T5.W, T5.Y, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 152(2.129974e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: LSHR T5.W, T4.W, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: AND_INT T4.W, T4.W, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T4.Z, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 136(1.905766e-43) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T4.Y, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T4.W, T4.Y, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T3.W, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43) ; EG-NEXT: 
LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T3.W, T3.W, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 184(2.578389e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: LSHR T3.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: LSHR T3.W, T3.Y, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43) ; EG-NEXT: ALU 76, @29, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 168(2.354181e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: LSHR T3.W, T2.W, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T2.W, T2.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 216(3.026805e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T1.W, 
literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T1.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 200(2.802597e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 248(3.475220e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 232(3.251012e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 
224(3.138909e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-DS128-NEXT: s_mov_b32 s90, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[8:11], v0 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 ; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 ; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16 ; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23 ; VI-DS128-NEXT: 
v_lshrrev_b32_e32 v2, 16, v22 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23 ; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22 ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 ; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 ; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 ; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 ; VI-DS128-NEXT: v_mov_b32_e32 v31, v15 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v27 ; VI-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v26 ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 ; VI-DS128-NEXT: v_and_b32_e32 v30, 
0xffff, v11 ; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 ; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v56 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v55 ; VI-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v58 ; VI-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v57 ; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v56 ; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v55 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v26 ; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v27 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v26 ; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224 ; VI-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240 ; VI-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192 ; VI-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208 ; VI-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160 ; VI-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176 ; VI-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128 ; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 ; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 ; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 ; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 ; VI-DS128-NEXT: 
buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 ; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] ; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DS128-NEXT: s_mov_b32 s14, -1 ; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: 
s_addc_u32 s13, s13, 0 ; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 ; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16 ; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23 ; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword 
v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 ; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 ; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 ; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v15 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v27 ; GFX9-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 ; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 ; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 ; GFX9-DS128-NEXT: 
v_and_b32_e32 v12, 0xffff, v8 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 ; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v56 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v55 ; GFX9-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v58 ; GFX9-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v57 ; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v56 ; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v55 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v27 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v26 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 
4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) ; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) ; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) ; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] ; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v64i16_to_v64i32: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s15, 0xe8f000 ; SI-NEXT: s_add_u32 s12, s12, s11 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v20, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[4:7], v20 offset0:8 offset1:9 ; SI-NEXT: ds_read2_b64 v[0:3], v20 offset0:10 offset1:11 ; SI-NEXT: ds_read2_b64 v[8:11], v20 offset0:12 offset1:13 
; SI-NEXT: ds_read2_b64 v[12:15], v20 offset0:14 offset1:15 ; SI-NEXT: ds_read2_b64 v[16:19], v20 offset1:1 ; SI-NEXT: ds_read2_b64 v[30:33], v20 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[34:37], v20 offset0:4 offset1:5 ; SI-NEXT: ds_read2_b64 v[38:41], v20 offset0:6 offset1:7 ; SI-NEXT: s_waitcnt lgkmcnt(7) ; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v5 ; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v4 ; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7 ; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v6 ; SI-NEXT: s_waitcnt lgkmcnt(6) ; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v1 ; SI-NEXT: v_bfe_i32 v20, v5, 0, 16 ; SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; SI-NEXT: v_bfe_i32 v22, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v24, v7, 0, 16 ; SI-NEXT: v_bfe_i32 v26, v6, 0, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v3 ; SI-NEXT: v_bfe_i32 v28, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v20, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v6, v3, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v2 ; SI-NEXT: v_bfe_i32 v4, v2, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(5) ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 ; SI-NEXT: v_bfe_i32 v2, v9, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 ; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v11 ; SI-NEXT: v_bfe_i32 v42, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 ; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(4) ; SI-NEXT: v_ashrrev_i32_e32 v45, 16, v13 ; SI-NEXT: v_bfe_i32 v44, v13, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 ; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v47, 16, v15 ; SI-NEXT: v_bfe_i32 v46, v15, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 ; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(3) ; SI-NEXT: v_ashrrev_i32_e32 v49, 16, v17 ; SI-NEXT: v_bfe_i32 v48, v17, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v16 ; SI-NEXT: 
v_bfe_i32 v16, v16, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v51, 16, v19 ; SI-NEXT: v_bfe_i32 v50, v19, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v18 ; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(2) ; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v31 ; SI-NEXT: v_bfe_i32 v52, v31, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v30 ; SI-NEXT: v_bfe_i32 v30, v30, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33 ; SI-NEXT: v_bfe_i32 v54, v33, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v32 ; SI-NEXT: v_bfe_i32 v32, v32, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v35 ; SI-NEXT: v_bfe_i32 v56, v35, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v35, 16, v34 ; SI-NEXT: v_bfe_i32 v34, v34, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v59, 16, v37 ; SI-NEXT: v_bfe_i32 v58, v37, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v37, 16, v36 ; SI-NEXT: v_bfe_i32 v36, v36, 0, 16 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v61, 16, v39 ; SI-NEXT: v_bfe_i32 v60, v39, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v39, 16, v38 ; SI-NEXT: v_bfe_i32 v38, v38, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v63, 16, v41 ; SI-NEXT: v_bfe_i32 v62, v41, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v41, 16, v40 ; SI-NEXT: v_bfe_i32 v40, v40, 0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15 ; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13 ; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11 ; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v0, v[32:33], v[54:55] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v0, v[30:31], v[52:53] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v0, v[18:19], v[50:51] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v0, v[16:17], v[48:49] offset1:1 ; SI-NEXT: ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 offset1:31 ; SI-NEXT: ds_write2_b64 v0, v[12:13], v[44:45] offset0:28 offset1:29 ; SI-NEXT: ds_write2_b64 v0, v[10:11], v[42:43] offset0:26 
offset1:27 ; SI-NEXT: ds_write2_b64 v0, v[8:9], v[2:3] offset0:24 offset1:25 ; SI-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset0:22 offset1:23 ; SI-NEXT: ds_write2_b64 v0, v[20:21], v[28:29] offset0:20 offset1:21 ; SI-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:18 offset1:19 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write2_b64 v0, v[22:23], v[1:2] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 ; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 
v24, v30, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 ; VI-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32 ; VI-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 ; VI-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36 ; VI-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35 ; VI-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29 ; VI-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16 ; VI-NO-DS128-NEXT: 
v_bfe_i32 v10, v14, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23 ; VI-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22 ; VI-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34 ; VI-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33 ; VI-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36 ; VI-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35 ; VI-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29 ; VI-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28 ; VI-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] 
offset0:10 offset1:11 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload ; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NO-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1 ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 ; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: s_nop 0 ; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 ; 
GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; 
GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19 ; GFX9-NO-DS128-NEXT: 
ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX9-NO-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0) ; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v64i16_to_v64i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 116, @30, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 
44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.W, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Y, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Z, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.W, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Y, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Z, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.W, OQAP, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, ; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV T5.Y, OQAP, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, ; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV T5.Z, OQAP, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV T5.W, OQAP, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, ; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T6.W ; EG-NEXT: MOV T6.Y, OQAP, ; EG-NEXT: 
ADD_INT * T6.W, KC0[2].Z, literal.x, ; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T6.W ; EG-NEXT: MOV T6.Z, OQAP, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, ; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T6.W ; EG-NEXT: MOV T6.W, OQAP, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T7.W ; EG-NEXT: MOV T7.Y, OQAP, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, ; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T7.W ; EG-NEXT: MOV T7.Z, OQAP, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, ; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T7.W ; EG-NEXT: MOV T7.W, OQAP, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, ; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T8.W ; EG-NEXT: MOV T8.Y, OQAP, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T8.W ; EG-NEXT: MOV T8.Z, OQAP, ; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, ; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T8.W ; EG-NEXT: MOV T8.W, OQAP, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, ; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T9.W ; EG-NEXT: MOV T9.Y, OQAP, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, ; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T9.W ; EG-NEXT: MOV T9.Z, OQAP, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: ALU 85, @31, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_READ_RET * OQAP, T9.W ; EG-NEXT: MOV T9.W, OQAP, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T10.W ; EG-NEXT: MOV T10.Y, OQAP, ; EG-NEXT: LSHR T10.W, T9.W, literal.x, ; EG-NEXT: ADD_INT * T11.W, 
KC0[2].Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) ; EG-NEXT: LDS_READ_RET * OQAP, T11.W ; EG-NEXT: MOV T10.Z, OQAP, ; EG-NEXT: LSHR * T11.Z, T10.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T10.W, T10.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T0.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T0.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T0.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T1.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T1.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T1.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T2.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: 
LSHR T11.Z, T2.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T2.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T3.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T3.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T3.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T4.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T4.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) ; EG-NEXT: ALU 83, @32, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T4.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T5.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, 
literal.y, ; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T5.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T5.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T6.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T6.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T6.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T7.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T7.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T7.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T8.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, 
T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T8.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T8.W, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T9.Y, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T12.Z, T9.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: LSHR T11.Z, T10.Z, literal.x, ; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43) ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43) ; EG-NEXT: ALU 94, @33, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T11.W, T10.W, ; EG-NEXT: BFE_INT T9.W, T9.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: BFE_INT T9.W, T10.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: BFE_INT T9.W, T0.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, 
literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: BFE_INT T9.W, T0.Z, 0.0, literal.x, ; EG-NEXT: MOV * T10.W, KC0[2].Y, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T10.W, T9.W, ; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T9.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T9.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T9.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) ; EG-NEXT: 
LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T4.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T4.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 144(2.017870e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T5.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T5.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 128(1.793662e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T6.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T6.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 176(2.466285e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T6.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT * T0.W, T7.Y, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU 34, @34, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T7.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 
16(2.242078e-44), 216(3.026805e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T7.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 208(2.914701e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T8.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T8.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T8.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 240(3.363116e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T9.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: BFE_INT T0.W, T10.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 224(3.138909e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-DS128-NEXT: s_mov_b32 s90, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 ; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; VI-DS128-NEXT: 
s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; VI-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 ; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill ; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 ; VI-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill ; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 ; VI-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 ; VI-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 ; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, 
v37 ; VI-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 ; VI-DS128-NEXT: v_mov_b32_e32 v23, v15 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 ; VI-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36 ; VI-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41 ; VI-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56 ; VI-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16 ; VI-DS128-NEXT: 
v_ashrrev_i32_e32 v43, 16, v40 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39 ; VI-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16 ; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224 ; VI-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240 ; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192 ; VI-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208 ; VI-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160 ; VI-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176 ; VI-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128 ; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 ; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 ; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 ; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 ; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) ; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 ; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] ; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 ; GFX9-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DS128-NEXT: s_mov_b32 s14, -1 ; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 ; GFX9-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 ; GFX9-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, 
v26 ; GFX9-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 ; GFX9-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 ; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v15 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16 ; GFX9-DS128-NEXT: 
v_ashrrev_i32_e32 v49, 16, v36 ; GFX9-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41 ; GFX9-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56 ; GFX9-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39 ; GFX9-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) ; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] ; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(3) %out ret void } ; Loads one i16 from local memory (addrspace 3), zero-extends it to i64 and stores the result to local memory. ; In every check block below the upper dword of the stored value is a constant zero. define amdgpu_kernel void @local_zextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_i16_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_u16 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b64 v2, v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_zextload_i16_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_u16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: ds_write_b64 v2, v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_zextload_i16_to_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_u16 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_i16_to_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 8, @35, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: MOV T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = zext i16 %a to i64 store i64 %ext, ptr addrspace(3) %out ret void } ; Loads one i16 from local memory (addrspace 3), sign-extends it to i64 and stores the result to local memory. ; FIXME: Need to optimize this sequence to avoid an extra shift. ; t25: i32,ch = load t12, t10, undef:i32 ; t28: i64 = any_extend t25 ; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16 ; NOTE(review): the SI checks below use ds_read_i16, while the VI and GFX9 checks use ds_read_u16 followed by a separate v_bfe_i32. Presumably that extra sign extension is what this FIXME is about. TODO confirm. define amdgpu_kernel void @local_sextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_i16_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_i16 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: ds_write_b64 v2, v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_sextload_i16_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_u16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: ds_write_b64 v2, v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_sextload_i16_to_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_u16 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_i16_to_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 10, @36, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.X, OQAP, ; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ASHR T1.W, PV.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = sext i16 %a to i64 store i64 %ext, ptr addrspace(3) %out ret void } ; One-element vector form of the zero-extending test. ; Loads <1 x i16> from local memory (addrspace 3), zero-extends it to <1 x i64> and stores the result to local memory. define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v1i16_to_v1i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_u16 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ds_write_b64 v2, v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_zextload_v1i16_to_v1i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_u16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: ds_write_b64 v2, v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_zextload_v1i16_to_v1i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_u16 v0, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v1i16_to_v1i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 8, @37, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.X, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.X, ; EG-NEXT: MOV T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(3) %out ret void } ; One-element vector form of the sign-extending test. ; Loads <1 x i16> from local memory (addrspace 3), sign-extends it to <1 x i64> and stores the result to local memory. define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v1i16_to_v1i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_i16 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: ds_write_b64 v2, v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_sextload_v1i16_to_v1i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: ds_read_u16 v0, v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: ds_write_b64 v2, v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: local_sextload_v1i16_to_v1i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_read_u16 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: 
v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v1i16_to_v1i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 10, @38, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.X, OQAP, ; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ASHR T1.W, PV.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(3) %out ret void } ; Loads <2 x i16> from local memory (addrspace 3), zero-extends both elements to i64 and stores the <2 x i64> result to local memory. ; The checks for the enable-ds128 runs expect a single ds_write_b128 store. The other amdgcn checks expect ds_write2_b64. define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v2i16_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v2i16_to_v2i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, 0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b32 v0, v0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], 
v[0:1] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v2i16_to_v2i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b32 v0, v0 ; GFX9-NO-DS128-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v2i16_to_v2i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 17, @39, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.Y, OQAP, ; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, ; EG-NEXT: MOV * T1.W, literal.y, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v2i16_to_v2i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: v_mov_b32_e32 v1, 0 ; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b32 v0, v0 ; VI-DS128-NEXT: v_mov_b32_e32 
v4, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v2i16_to_v2i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b32 v2, v0 ; GFX9-DS128-NEXT: s_mov_b32 s1, 0xffff ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-DS128-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(3) %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(3) %out ret void } ; Loads <2 x i16> from local memory (addrspace 3), sign-extends both elements to i64 and stores the <2 x i64> result to local memory. ; The checks for the enable-ds128 runs expect a single ds_write_b128 store. The other amdgcn checks expect ds_write2_b64. define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v2i16_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v1, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v2i16_to_v2i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b32 v0, v0 ; VI-NO-DS128-NEXT: 
v_mov_b32_e32 v4, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v2i16_to_v2i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b32 v0, v0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v2i16_to_v2i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 18, @40, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.Y, OQAP, ; EG-NEXT: BFE_INT * T0.W, PV.Y, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ASHR T1.W, PV.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: 
local_sextload_v2i16_to_v2i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b32 v1, v0 ; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v2i16_to_v2i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b32 v1, v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-DS128-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(3) %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(3) %out ret void } ; Zero-extension case for 4 elements. The kernel below loads <4 x i16> from the addrspace(3) pointer %in, zero-extends every element to i64 and stores the <4 x i64> result through the addrspace(3) pointer %out. The GCN runs expect one ds_read_b64 for the load, then the +enable-ds128 runs expect two ds_write_b128 stores while the other GCN runs expect two ds_write2_b64 stores. define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v4i16_to_v4i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: v_mov_b32_e32 v3, 0 ; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v10, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; 
SI-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; SI-NEXT: ds_write2_b64 v10, v[4:5], v[2:3] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v10, v[6:7], v[8:9] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v4i16_to_v4i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, 0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v2 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v2 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v2 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v9, v[7:8], v[5:6] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v4i16_to_v4i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v7, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v5, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-NO-DS128-NEXT: 
v_and_b32_e32 v1, 0xffff, v0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v4i16_to_v4i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 35, @41, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, ; EG-NEXT: MOV * T1.W, literal.y, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v4i16_to_v4i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: v_mov_b32_e32 v1, 0 ; VI-DS128-NEXT: 
v_mov_b32_e32 v3, v1 ; VI-DS128-NEXT: v_mov_b32_e32 v5, v1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b64 v[7:8], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v9, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; VI-DS128-NEXT: v_mov_b32_e32 v7, v1 ; VI-DS128-NEXT: ds_write_b128 v9, v[0:3] offset:16 ; VI-DS128-NEXT: ds_write_b128 v9, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v4i16_to_v4i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DS128-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b64 v[6:7], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX9-DS128-NEXT: v_and_b32_sdwa v6, s2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(3) %out ret void } ; Sign-extension case for 4 elements. The kernel below loads <4 x i16> from the addrspace(3) pointer %in, sign-extends every element to i64 and stores the <4 x i64> result through the addrspace(3) pointer %out. The GCN runs expect one ds_read_b64 for the load, then the +enable-ds128 runs expect two ds_write_b128 stores while the other GCN runs expect two ds_write2_b64 stores. define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v4i16_to_v4i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: 
s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: v_mov_b32_e32 v9, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v5, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v7, v4, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) 
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v4i16_to_v4i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 39, @42, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: BFE_INT * T0.W, T0.Y, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T1.Z, PV.Z, 0.0, literal.x, ; EG-NEXT: ASHR T1.W, PV.W, literal.y, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ASHR T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 20(2.802597e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; 
EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.Z, ; EG-NEXT: ASHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v4i16_to_v4i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-DS128-NEXT: 
v_bfe_i32 v2, v2, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(3) %out ret void } ; Zero-extension case for 8 elements. The kernel below loads <8 x i16> from the addrspace(3) pointer %in, zero-extends every element to i64 and stores the <8 x i64> result through the addrspace(3) pointer %out. The +enable-ds128 runs expect one ds_read_b128 and four ds_write_b128, the other GCN runs expect one ds_read2_b64 and four ds_write2_b64. define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v8i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: v_mov_b32_e32 v7, v5 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v11, v5 ; SI-NEXT: v_mov_b32_e32 v13, v5 ; SI-NEXT: v_mov_b32_e32 v15, v5 ; SI-NEXT: v_mov_b32_e32 v17, v5 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: ds_write2_b64 v0, v[8:9], v[6:7] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v0, v[12:13], v[4:5] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v0, v[10:11], v[16:17] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v0, v[14:15], v[18:19] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: 
v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, 0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v2 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v3 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[9:10], v[2:3] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v3 ; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[8:9] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v3 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[1:2] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, v3 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[0:1] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v12 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v12 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NO-DS128-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v7, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v6, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v12 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[11:12], v[7:8] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[2:3], v[6:7] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v12 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[9:10], v[5:6] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[0:1], v[4:5] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v8i16_to_v8i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 71, @43, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: AND_INT T1.W, T0.W, literal.x, ; EG-NEXT: MOV * T2.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, 
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, ; EG-NEXT: MOV * T1.W, literal.y, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 60(8.407791e-44), 
0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v8i16_to_v8i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v14, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 ; VI-DS128-NEXT: v_mov_b32_e32 v1, 0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; VI-DS128-NEXT: v_mov_b32_e32 v11, v1 ; VI-DS128-NEXT: v_mov_b32_e32 v13, v1 ; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32 ; VI-DS128-NEXT: v_mov_b32_e32 v8, v1 ; VI-DS128-NEXT: v_mov_b32_e32 v10, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 ; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 ; VI-DS128-NEXT: v_mov_b32_e32 v5, v1 ; VI-DS128-NEXT: v_mov_b32_e32 v7, v1 ; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48 ; VI-DS128-NEXT: ds_write_b128 v14, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v11 ; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 ; GFX9-DS128-NEXT: s_mov_b32 s1, 0xffff ; GFX9-DS128-NEXT: v_mov_b32_e32 v14, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; GFX9-DS128-NEXT: v_and_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 ; GFX9-DS128-NEXT: v_and_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:48 ; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v11 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX9-DS128-NEXT: v_and_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v11 ; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v11 ; GFX9-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v14, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(3) %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v8i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; SI-NEXT: v_mov_b32_e32 v16, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v3 ; SI-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v9, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 ; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; SI-NEXT: ds_write2_b64 v16, v[10:11], v[6:7] offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v16, v[8:9], v[4:5] offset0:2 offset1:3 ; SI-NEXT: ds_write2_b64 v16, v[2:3], v[14:15] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v16, v[0:1], v[12:13] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v10, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[14:15], v[8:9] 
offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[12:13], v[4:5] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[12:13], v[4:5] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v8i16_to_v8i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 80, 
@44, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV * T0.W, OQAP, ; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: BFE_INT T1.Z, T0.W, 0.0, literal.x, ; EG-NEXT: ASHR T2.W, T1.W, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: BFE_INT T2.Z, T0.Y, 0.0, literal.x, ; EG-NEXT: ASHR T2.W, T1.Z, literal.y, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: BFE_INT T3.Z, T1.Y, 0.0, literal.x, ; EG-NEXT: ASHR T2.W, T2.Z, literal.y, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: ASHR T2.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 52(7.286752e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: MOV * T2.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ASHR T1.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ASHR T1.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * 
T2.W, T1.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T1.Z, ; EG-NEXT: ASHR T1.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ASHR T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T2.Z, ; EG-NEXT: ASHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T3.Z, ; EG-NEXT: ASHR T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v8i16_to_v8i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v16, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; 
VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v0, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-DS128-NEXT: ds_write_b128 v16, v[0:3] offset:48 ; VI-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:32 ; VI-DS128-NEXT: ds_write_b128 v16, v[8:11] offset:16 ; VI-DS128-NEXT: ds_write_b128 v16, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v8i16_to_v8i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v16, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; 
GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: ds_write_b128 v16, v[0:3] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v16, v[8:11] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v16, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(3) %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v16i16_to_v16i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 ; SI-NEXT: v_mov_b32_e32 v9, 0 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 ; SI-NEXT: v_mov_b32_e32 v11, v9 ; SI-NEXT: v_mov_b32_e32 v13, v9 ; SI-NEXT: v_mov_b32_e32 v15, v9 ; SI-NEXT: v_mov_b32_e32 v17, v9 ; SI-NEXT: v_mov_b32_e32 v20, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 ; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:10 offset1:11 ; SI-NEXT: v_mov_b32_e32 v16, v9 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; SI-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:14 offset1:15 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7 ; SI-NEXT: ds_write2_b64 v20, v[15:16], v[10:11] offset0:6 offset1:7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; SI-NEXT: 
v_and_b32_e32 v10, 0xffff, v6 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v5, v9 ; SI-NEXT: ds_write2_b64 v20, v[4:5], v[8:9] offset0:2 offset1:3 ; SI-NEXT: v_mov_b32_e32 v19, v9 ; SI-NEXT: v_mov_b32_e32 v8, v9 ; SI-NEXT: v_mov_b32_e32 v15, v9 ; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: v_mov_b32_e32 v4, v9 ; SI-NEXT: ds_write2_b64 v20, v[18:19], v[12:13] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:12 offset1:13 ; SI-NEXT: ds_write2_b64 v20, v[10:11], v[1:2] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v20, v[7:8], v[3:4] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, 0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v8 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[9:10] offset0:10 offset1:11 ; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 ; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:12 offset1:13 ; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: 
v_lshrrev_b32_e32 v9, 16, v3 ; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v4 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[7:8] offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[3:4], v[9:10] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v8 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[2:3], v[6:7] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v8 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[1:2], v[5:6] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v8 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 ; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[11:12] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v8 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[11:12], v[9:10] offset0:10 offset1:11 ; 
GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v4 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[7:8] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[9:10] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v8 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[2:3], v[6:7] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[1:2], v[5:6] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[0:1], v[13:14] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v16i16_to_v16i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 100, @45, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * 
OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: MOV * T2.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: LSHR T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, ; EG-NEXT: MOV * T3.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T1.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, 
KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, ; EG-NEXT: MOV * T1.W, literal.y, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: ALU 42, @46, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * 
T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v16i16_to_v16i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: v_mov_b32_e32 v26, 0 ; 
VI-DS128-NEXT: v_mov_b32_e32 v22, v26 ; VI-DS128-NEXT: v_mov_b32_e32 v24, v26 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v5, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v5 ; VI-DS128-NEXT: ds_read_b128 v[13:16], v5 offset:16 ; VI-DS128-NEXT: v_mov_b32_e32 v11, v26 ; VI-DS128-NEXT: v_mov_b32_e32 v19, v26 ; VI-DS128-NEXT: v_mov_b32_e32 v8, v26 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v13 ; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v13 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v14 ; VI-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v14 ; VI-DS128-NEXT: v_mov_b32_e32 v14, s0 ; VI-DS128-NEXT: v_mov_b32_e32 v13, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v16 ; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v16 ; VI-DS128-NEXT: ds_write_b128 v14, v[21:24] offset:64 ; VI-DS128-NEXT: v_mov_b32_e32 v21, v26 ; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32 ; VI-DS128-NEXT: v_mov_b32_e32 v10, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; VI-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; VI-DS128-NEXT: ds_write_b128 v14, v[18:21] offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v16, v26 ; VI-DS128-NEXT: v_mov_b32_e32 v18, v26 ; VI-DS128-NEXT: v_mov_b32_e32 v1, v26 ; VI-DS128-NEXT: v_mov_b32_e32 v3, v26 ; VI-DS128-NEXT: v_mov_b32_e32 v28, v26 ; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 ; VI-DS128-NEXT: v_mov_b32_e32 v5, v26 ; VI-DS128-NEXT: v_mov_b32_e32 v7, v26 ; VI-DS128-NEXT: ds_write_b128 v14, v[15:18] offset:96 ; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48 ; VI-DS128-NEXT: ds_write_b128 v14, 
v[25:28] offset:80 ; VI-DS128-NEXT: ds_write_b128 v14, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v25, 0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v25 ; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v25 ; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v25 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v28, s0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v25 ; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v25 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v6 ; GFX9-DS128-NEXT: ds_write_b128 v28, v[20:23] offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2 ; GFX9-DS128-NEXT: ds_write_b128 v28, v[17:20] offset:96 ; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v1 ; GFX9-DS128-NEXT: ds_write_b128 v28, v[14:17] offset:32 ; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v5 ; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v5 ; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v25 ; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v25 ; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v25 ; GFX9-DS128-NEXT: 
v_mov_b32_e32 v3, v25 ; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v25 ; GFX9-DS128-NEXT: ds_write_b128 v28, v[11:14] offset:16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v9, v25 ; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v25 ; GFX9-DS128-NEXT: ds_write_b128 v28, v[4:7] offset:64 ; GFX9-DS128-NEXT: ds_write_b128 v28, v[0:3] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v28, v[24:27] offset:80 ; GFX9-DS128-NEXT: ds_write_b128 v28, v[8:11] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v16i16_to_v16i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 ; SI-NEXT: v_mov_b32_e32 v18, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_mov_b32_e32 v12, v3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v14, v7 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v8, 16, v5 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v3 ; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 ; SI-NEXT: v_bfe_i32 v12, v1, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v7 ; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v7 ; SI-NEXT: v_bfe_i32 v12, v14, 0, 16 ; SI-NEXT: 
v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 ; SI-NEXT: v_bfe_i32 v7, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v19, 0, 16 ; SI-NEXT: v_bfe_i32 v14, v17, 0, 16 ; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3 ; SI-NEXT: v_bfe_i32 v3, v15, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: ds_write2_b64 v18, v[10:11], v[3:4] offset0:12 offset1:13 ; SI-NEXT: ds_write2_b64 v18, v[7:8], v[16:17] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; VI-NO-DS128-NEXT: 
ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, 
v[14:15], v[4:5] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; 
GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v16i16_to_v16i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 101, @47, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 
16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: MOV * T1.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: BFE_INT T2.W, T1.W, 0.0, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV * T2.Z, OQAP, ; EG-NEXT: BFE_INT T3.Z, T2.Y, 0.0, literal.x, ; EG-NEXT: ASHR T3.W, T2.W, literal.y, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: BFE_INT T4.Z, T0.Y, 0.0, literal.x, ; EG-NEXT: ASHR T3.W, T3.Z, literal.y, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: BFE_INT T5.Z, T0.Z, 0.0, literal.x, ; EG-NEXT: ASHR T3.W, T4.Z, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: BFE_INT T6.Z, T0.W, 0.0, literal.x, ; EG-NEXT: ASHR T3.W, T5.Z, literal.y, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x, ; EG-NEXT: ASHR T3.W, 
T6.Z, literal.y, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x, ; EG-NEXT: ASHR T3.W, T7.Z, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: BFE_INT T9.Z, T2.Z, 0.0, literal.x, ; EG-NEXT: ASHR T3.W, T8.Z, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: ASHR T3.W, T9.Z, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 116(1.625506e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: ASHR T3.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: ASHR T1.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T3.W, T1.W, ; EG-NEXT: MOV * T1.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T1.W, T2.W, ; EG-NEXT: ASHR T1.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ASHR T1.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T3.Z, ; EG-NEXT: ASHR T1.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) ; EG-NEXT: ALU 62, @48, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T2.W, T1.W, 
; EG-NEXT: ASHR T1.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T4.Z, ; EG-NEXT: ASHR T1.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ASHR T1.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T5.Z, ; EG-NEXT: ASHR T1.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: ASHR T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T6.Z, ; EG-NEXT: ASHR T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T7.Z, ; EG-NEXT: ASHR T0.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) ; 
EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T8.Z, ; EG-NEXT: ASHR T0.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T9.Z, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v16i16_to_v16i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[3:6], v0 ; VI-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_mov_b32_e32 v18, v6 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 ; VI-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v15, v10 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 ; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 ; VI-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 
16, v9 ; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 ; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 ; VI-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 ; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 ; VI-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v0 ; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v8 ; GFX9-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 ; 
GFX9-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 ; GFX9-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v10 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 ; GFX9-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v6 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; 
GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 ; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v32i16_to_v32i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[2:5], v0 offset0:2 offset1:3 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: ds_read2_b64 v[6:9], v0 offset1:1 ; SI-NEXT: v_mov_b32_e32 v19, v1 ; SI-NEXT: v_mov_b32_e32 v21, v1 ; SI-NEXT: v_mov_b32_e32 v22, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 ; SI-NEXT: ds_read2_b64 v[10:13], v0 offset0:4 offset1:5 ; SI-NEXT: ds_read2_b64 v[14:17], v0 offset0:6 offset1:7 ; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 ; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:10 offset1:11 ; SI-NEXT: s_waitcnt lgkmcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v9 ; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:6 offset1:7 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 ; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3 ; SI-NEXT: s_waitcnt lgkmcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v17 ; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31 ; SI-NEXT: v_mov_b32_e32 v18, v1 ; SI-NEXT: 
v_lshrrev_b32_e32 v17, 16, v15 ; SI-NEXT: v_mov_b32_e32 v20, v1 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v15 ; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v13 ; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: ds_write2_b64 v22, v[4:5], v[17:18] offset0:12 offset1:13 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v2 ; SI-NEXT: v_mov_b32_e32 v4, v1 ; SI-NEXT: ds_write2_b64 v22, v[17:18], v[3:4] offset0:8 offset1:9 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_mov_b32_e32 v9, v1 ; SI-NEXT: v_mov_b32_e32 v7, v1 ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; SI-NEXT: ds_write2_b64 v22, v[8:9], v[2:3] offset0:4 offset1:5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 ; SI-NEXT: ds_write2_b64 v22, v[6:7], v[4:5] offset1:1 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v14 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v16 ; SI-NEXT: v_mov_b32_e32 v6, v1 ; SI-NEXT: ds_write2_b64 v22, v[5:6], v[0:1] offset0:18 offset1:19 ; SI-NEXT: v_mov_b32_e32 v11, v1 ; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: v_mov_b32_e32 v13, v1 ; SI-NEXT: v_mov_b32_e32 v16, v1 ; SI-NEXT: ds_write2_b64 v22, v[19:20], v[12:13] offset0:28 offset1:29 ; SI-NEXT: ds_write2_b64 v22, v[17:18], v[15:16] offset0:24 offset1:25 ; SI-NEXT: ds_write2_b64 v22, v[10:11], v[2:3] offset0:20 offset1:21 ; SI-NEXT: ds_write2_b64 v22, v[4:5], v[8:9] offset0:16 offset1:17 ; 
SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, 0 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, v5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, v5 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset1:1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 ; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27 ; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19 ; 
VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v13 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v14 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v10 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v10 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v16 ; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v16 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v15 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 ; VI-NO-DS128-NEXT: 
ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, v5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v5 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21 ; GFX9-NO-DS128-NEXT: 
v_and_b32_e32 v0, 0xffff, v7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v17 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v16 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v12 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 
offset1:7 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v32i16_to_v32i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 105, @49, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.W, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T1.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T2.W ; EG-NEXT: MOV T2.W, OQAP, ; 
EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Y, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.Z, OQAP, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T3.W ; EG-NEXT: MOV T3.W, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Y, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.Z, OQAP, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T4.W ; EG-NEXT: MOV T4.W, OQAP, ; EG-NEXT: MOV * T5.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV T5.Y, OQAP, ; EG-NEXT: LSHR T5.W, T4.W, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T6.W, T5.W, ; EG-NEXT: AND_INT T4.W, T4.W, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T5.Y, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T4.W, T5.Y, literal.x, ; EG-NEXT: MOV * T5.W, KC0[2].Y, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T4.Z, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 
48(6.726233e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T4.Y, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T4.W, T4.Y, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: LSHR T4.W, T3.W, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) ; EG-NEXT: LDS_WRITE * T5.W, T4.W, ; EG-NEXT: AND_INT T3.W, T3.W, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: LSHR T3.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) ; EG-NEXT: ALU 93, @50, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: LSHR T3.W, T3.Y, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: LSHR T3.W, T2.W, literal.x, ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) ; EG-NEXT: LDS_WRITE * T4.W, T3.W, ; EG-NEXT: AND_INT T2.W, T2.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT 
T2.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: LSHR T2.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) ; EG-NEXT: LDS_WRITE * T3.W, T2.W, ; EG-NEXT: AND_INT T1.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: LSHR T1.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43) ; EG-NEXT: LDS_WRITE * T2.W, T1.W, ; EG-NEXT: AND_INT T0.W, T0.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 
248(3.475220e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: LSHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, ; EG-NEXT: MOV * T1.W, literal.y, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: ALU 87, @51, KC0[CB0:0-32], KC1[] ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 
76(1.064987e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 156(2.186026e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 140(1.961818e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 188(2.634441e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 172(2.410233e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 220(3.082857e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, 
T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 204(2.858649e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 252(3.531272e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 236(3.307064e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 228(3.194960e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T1.W, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_zextload_v32i16_to_v32i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v1, s1 ; VI-DS128-NEXT: ds_read_b128 v[3:6], v1 ; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16 ; VI-DS128-NEXT: v_mov_b32_e32 v52, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10 ; VI-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9 ; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9 ; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32 ; VI-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7 ; 
VI-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9 ; VI-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30 ; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32 ; VI-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31 ; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v31, 0 ; VI-DS128-NEXT: v_mov_b32_e32 v49, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v51, v31 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29 ; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29 ; VI-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240 ; VI-DS128-NEXT: v_mov_b32_e32 v46, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v48, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v27, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v29, v31 ; VI-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192 ; VI-DS128-NEXT: v_mov_b32_e32 v43, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v45, v31 ; VI-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96 ; VI-DS128-NEXT: v_mov_b32_e32 v24, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v26, v31 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10 ; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10 ; VI-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160 ; VI-DS128-NEXT: v_mov_b32_e32 v40, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v42, v31 ; VI-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v21, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v23, v31 ; VI-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176 ; VI-DS128-NEXT: v_mov_b32_e32 v37, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v39, v31 ; VI-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64 ; VI-DS128-NEXT: v_mov_b32_e32 v18, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v20, v31 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; VI-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8 ; VI-DS128-NEXT: v_mov_b32_e32 v8, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v10, v31 ; VI-DS128-NEXT: 
ds_write_b128 v52, v[36:39] offset:128 ; VI-DS128-NEXT: v_mov_b32_e32 v34, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v36, v31 ; VI-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80 ; VI-DS128-NEXT: v_mov_b32_e32 v15, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v17, v31 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; VI-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208 ; VI-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144 ; VI-DS128-NEXT: v_mov_b32_e32 v5, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v7, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v33, v31 ; VI-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48 ; VI-DS128-NEXT: v_mov_b32_e32 v12, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v14, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v1, v31 ; VI-DS128-NEXT: v_mov_b32_e32 v3, v31 ; VI-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32 ; VI-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224 ; VI-DS128-NEXT: ds_write_b128 v52, v[11:14] ; VI-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v1 ; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v52, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10 ; GFX9-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9 ; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9 ; 
GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32 ; GFX9-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7 ; GFX9-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9 ; GFX9-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30 ; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32 ; GFX9-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31 ; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v31, 0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v49, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v51, v31 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29 ; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240 ; GFX9-DS128-NEXT: v_mov_b32_e32 v46, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v48, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v29, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192 ; GFX9-DS128-NEXT: v_mov_b32_e32 v43, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v45, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v26, v31 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10 ; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160 ; GFX9-DS128-NEXT: v_mov_b32_e32 v40, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v42, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[39:42] 
offset:176 ; GFX9-DS128-NEXT: v_mov_b32_e32 v37, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v39, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64 ; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v31 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; GFX9-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128 ; GFX9-DS128-NEXT: v_mov_b32_e32 v34, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80 ; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v31 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144 ; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v33, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48 ; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v31 ; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v31 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224 ; GFX9-DS128-NEXT: ds_write_b128 v52, v[11:14] ; GFX9-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(3) %out ret void } define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v32i16_to_v32i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: v_mov_b32_e32 v12, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 ; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7 ; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 ; SI-NEXT: s_waitcnt lgkmcnt(3) ; SI-NEXT: v_mov_b32_e32 v18, v7 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7 ; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: v_mov_b32_e32 v7, s0 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v5 ; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11 ; SI-NEXT: s_waitcnt lgkmcnt(4) ; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v3 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v3 ; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v1 ; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3 ; SI-NEXT: s_waitcnt lgkmcnt(5) ; SI-NEXT: v_mov_b32_e32 v1, v11 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v11 ; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v9 ; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v9 ; SI-NEXT: v_bfe_i32 v18, v9, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27 ; SI-NEXT: s_waitcnt lgkmcnt(6) ; SI-NEXT: v_mov_b32_e32 v1, v15 ; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; SI-NEXT: 
v_ashrrev_i32_e32 v15, 16, v15 ; SI-NEXT: v_bfe_i32 v17, v1, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:22 offset1:23 ; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v13 ; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13 ; SI-NEXT: v_bfe_i32 v17, v13, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_bfe_i32 v15, v1, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 ; SI-NEXT: v_bfe_i32 v5, v1, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: ds_write2_b64 v7, v[3:4], v[5:6] offset0:8 offset1:9 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v12 ; SI-NEXT: v_bfe_i32 v1, v12, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v14, 0, 16 ; SI-NEXT: v_bfe_i32 v5, v8, 0, 16 ; SI-NEXT: v_bfe_i32 v8, v10, 0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_bfe_i32 v9, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5 ; SI-NEXT: v_bfe_i32 v11, v6, 0, 16 ; SI-NEXT: v_bfe_i32 v13, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v15, v15, 0, 16 ; SI-NEXT: v_bfe_i32 v16, v14, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1 ; SI-NEXT: v_bfe_i32 v17, v18, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: 
v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; SI-NEXT: ds_write2_b64 v7, v[8:9], v[17:18] offset0:28 offset1:29 ; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:24 offset1:25 ; SI-NEXT: ds_write2_b64 v7, v[3:4], v[13:14] offset0:20 offset1:21 ; SI-NEXT: ds_write2_b64 v7, v[1:2], v[11:12] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v7 offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v7 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v7 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v7 offset1:1 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:28 offset1:29 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; 
VI-NO-DS128-NEXT: ds_write2_b64 v11, v[1:2], v[16:17] offset0:26 offset1:27 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v0 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:24 offset1:25 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(6) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v15 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v15, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v14 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:22 offset1:23 ; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v17, v14, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v12 ; VI-NO-DS128-NEXT: v_bfe_i32 v15, v12, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v17, v16, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[17:18] offset0:16 offset1:17 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v17, v6, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v18, 31, v17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[15:16] offset0:12 offset1:13 ; VI-NO-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[5:6] offset0:10 offset1:11 ; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; VI-NO-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v10 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; VI-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[21:22] offset0:8 offset1:9 ; VI-NO-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v7, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v7, 31, v6 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[1:2] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v8 offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v8 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v8 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:30 offset1:31 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:28 offset1:29 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[16:17] offset0:26 offset1:27 ; GFX9-NO-DS128-NEXT: 
v_lshrrev_b32_e32 v17, 16, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:24 offset1:25 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:22 offset1:23 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:20 offset1:21 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:16 offset1:17 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v10, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: ds_write2_b64 
v15, v[16:17], v[3:4] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[3:4] offset0:12 offset1:13 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v12 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[3:4] offset0:10 offset1:11 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v14 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v11 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v18, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v19, 0, 16 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[20:21] offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[5:6] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v32i16_to_v32i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 107, @52, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T1.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T1.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T1.W, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T2.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T2.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T2.W, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T3.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T3.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 
40(5.605194e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T3.W, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T4.Y, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T4.Z, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T4.W, OQAP, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T5.Y, OQAP, ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T5.Z, OQAP, ; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, ; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_READ_RET * OQAP, T5.W ; EG-NEXT: MOV * T5.W, OQAP, ; EG-NEXT: BFE_INT T0.Z, T5.Z, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T0.W, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T6.Z, T0.Y, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T0.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T6.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T7.Z, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 
31(4.344025e-44) ; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T9.Z, T1.W, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T8.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T10.Z, T2.Y, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T9.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T11.Z, T2.Z, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T10.Z, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT * T12.Z, T2.W, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU 98, @53, KC0[CB0:0-32], KC1[] ; EG-NEXT: ASHR T6.W, T11.Z, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 100(1.401298e-43) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T13.Z, T3.Y, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T12.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T14.Z, T3.Z, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T13.Z, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T15.Z, T3.W, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T14.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00) ; EG-NEXT: 
LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T16.Z, T4.Y, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T15.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T17.Z, T4.Z, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T16.Z, literal.y, BS:VEC_120/SCL_212 ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T18.Z, T4.W, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T17.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: BFE_INT T19.Z, T5.W, 0.0, literal.x, ; EG-NEXT: ASHR T6.W, T18.Z, literal.y, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: ASHR T6.W, T19.Z, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 228(3.194960e-43) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: ASHR T6.W, T5.Y, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: ASHR T6.W, T5.Y, literal.x, ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) ; EG-NEXT: LDS_WRITE * T7.W, T6.W, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ASHR T0.W, T5.Z, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ASHR T0.W, T5.Z, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 
16(2.242078e-44), 8(1.121039e-44) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: MOV * T0.W, KC0[2].Y, ; EG-NEXT: LDS_WRITE * T0.W, T0.Z, ; EG-NEXT: ASHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ASHR T0.W, T0.Y, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T6.Z, ; EG-NEXT: ASHR T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ASHR T0.W, T1.Y, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T7.Z, ; EG-NEXT: ASHR T0.W, T1.Z, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ASHR * T0.W, T1.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU 99, @54, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x, ; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T8.Z, ; EG-NEXT: ASHR T0.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43) ; EG-NEXT: LDS_WRITE * T6.W, T0.W, ; EG-NEXT: ASHR T0.W, T1.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, 
literal.x, ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T9.Z, ; EG-NEXT: ASHR T0.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T2.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T10.Z, ; EG-NEXT: ASHR T0.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T2.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T11.Z, ; EG-NEXT: ASHR T0.W, T2.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 156(2.186026e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T2.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T12.Z, ; EG-NEXT: ASHR T0.W, T3.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 140(1.961818e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T3.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T13.Z, ; EG-NEXT: ASHR T0.W, T3.Z, 
literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 188(2.634441e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T3.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T14.Z, ; EG-NEXT: ASHR T0.W, T3.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 172(2.410233e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T3.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T15.Z, ; EG-NEXT: ASHR T0.W, T4.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 220(3.082857e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T4.Y, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T16.Z, ; EG-NEXT: ASHR T0.W, T4.Z, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 204(2.858649e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR * T0.W, T4.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU 27, @55, KC0[CB0:0-32], KC1[] ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 200(2.802597e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T17.Z, ; EG-NEXT: ASHR T0.W, T4.W, literal.x, ; EG-NEXT: ADD_INT * 
T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 252(3.531272e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T4.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T18.Z, ; EG-NEXT: ASHR T0.W, T5.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 31(4.344025e-44), 236(3.307064e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ASHR T0.W, T5.W, literal.x, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43) ; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00) ; EG-NEXT: LDS_WRITE * T0.W, T19.Z, ; EG-NEXT: RETURN ; ; VI-DS128-LABEL: local_sextload_v32i16_to_v32i64: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:48 ; VI-DS128-NEXT: ds_read_b128 v[9:12], v4 offset:32 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: ds_read_b128 v[17:20], v4 offset:16 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: v_mov_b32_e32 v2, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224 ; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v15, v3, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; 
VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:240 ; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:208 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(5) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11 ; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192 ; VI-DS128-NEXT: v_mov_b32_e32 v13, v12 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 ; VI-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176 ; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) ; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16 ; 
VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; VI-DS128-NEXT: v_mov_b32_e32 v5, v20 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96 ; VI-DS128-NEXT: v_bfe_i32 v9, v5, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v20 ; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v17 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:112 ; VI-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64 ; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v15, v5, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v4, v7 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80 ; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 ; VI-DS128-NEXT: ds_write_b128 v8, 
v[13:16] offset:48 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] ; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v13, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:48 ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:32 ; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v13 ; GFX9-DS128-NEXT: ds_read_b128 v[18:21], v13 offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v6, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:240 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:208 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5) ; GFX9-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 ; GFX9-DS128-NEXT: 
ds_write_b128 v12, v[13:16] offset:192 ; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v20 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v21 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[6:9] ; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(3) %out ret void } ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64: ; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; %load = load <64 x i16>, ptr addrspace(3) %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x 
i64> %ext, ptr addrspace(3) %out
;   ret void
; }

; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
;   %load = load <64 x i16>, ptr addrspace(3) %in
;   %ext = sext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, ptr addrspace(3) %out
;   ret void
; }

; Tests whether ds_read_b128/ds_write_b128 are selected for a 16-byte-aligned
; 128-bit (<8 x i16>) load and store through addrspace(3) pointers.
; With +enable-ds128 (the VI-DS128 and GFX9-DS128 check prefixes) the copy is
; expected to use the b128 forms; the SI, VI-NO-DS128 and GFX9-NO-DS128 checks
; expect ds_read2_b64/ds_write2_b64 instead, and the EG checks expect four
; LDS_READ_RET/LDS_WRITE pairs at byte offsets 0, 4, 8 and 12.
define amdgpu_kernel void @local_v8i16_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
; SI-LABEL: local_v8i16_to_128:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    s_mov_b32 m0, -1
; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; SI-NEXT:    s_endpgm
;
; VI-NO-DS128-LABEL: local_v8i16_to_128:
; VI-NO-DS128:       ; %bb.0:
; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; VI-NO-DS128-NEXT:    s_endpgm
;
; GFX9-NO-DS128-LABEL: local_v8i16_to_128:
; GFX9-NO-DS128:       ; %bb.0:
; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-NO-DS128-NEXT:    s_endpgm
;
; EG-LABEL: local_v8i16_to_128:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 25, @56, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MOV * T0.W, KC0[2].Z,
; EG-NEXT:    LDS_READ_RET * OQAP, T0.W
; EG-NEXT:    MOV T0.X, OQAP,
; EG-NEXT:    MOV * T0.W, KC0[2].Y,
; EG-NEXT:    LDS_WRITE * T0.W, T0.X,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:    LDS_READ_RET * OQAP, T0.W
; EG-NEXT:    MOV T0.X, OQAP,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:    LDS_WRITE * T0.W, T0.X,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:    LDS_READ_RET * OQAP, T0.W
; EG-NEXT:    MOV T0.X, OQAP,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:    LDS_WRITE * T0.W, T0.X,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT:    LDS_READ_RET * OQAP, T0.W
; EG-NEXT:    MOV T0.X, OQAP,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT:    LDS_WRITE * T0.W, T0.X,
; EG-NEXT:    RETURN
;
; VI-DS128-LABEL: local_v8i16_to_128:
; VI-DS128:       ; %bb.0:
; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT:    s_mov_b32 m0, -1
; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
; VI-DS128-NEXT:    ds_read_b128 v[0:3], v0
; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3]
; VI-DS128-NEXT:    s_endpgm
;
; GFX9-DS128-LABEL: local_v8i16_to_128:
; GFX9-DS128:       ; %bb.0:
; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-DS128-NEXT:    s_endpgm
  ; Whole-vector copy; the explicit align 16 on both accesses is the property
  ; under test.
  %vec = load <8 x i16>, ptr addrspace(3) %in, align 16
  store <8 x i16> %vec, ptr addrspace(3) %out, align 16
  ret void
}

attributes #0 = { nounwind }