; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s

; Every function below follows the same shape:
;   1. load an i16, or load an i8 and sext/zext it to i16, from %ptra;
;   2. insert that i16 into one element of the incoming <2 x i16> %vec;
;   3. store the resulting <2 x i16> to %out.
; Functions without a suffix insert into element 0 and are checked against the
; d16 form of the load; functions with the _Hi suffix insert into element 1 and
; are checked against the d16_hi form. In every case the checked load writes v0
; and the same v0 is then written out by a single 32-bit store.
; The P<N> part of each name is the address space of the loaded pointer.

; ----------------------------------------------------------------------------
; addrspace(0): checks expect flat_load_d16* and flat_store_b32.
; ----------------------------------------------------------------------------

define amdgpu_ps void @load_P0_B16_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
; GFX12-LABEL: load_P0_B16_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    flat_load_d16_b16 v0, v[1:2]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[3:4], v0
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(0) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
  store <2 x i16> %res, ptr addrspace(0) %out
  ret void
}

define amdgpu_ps void @load_P0_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
; GFX12-LABEL: load_P0_B16_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[3:4], v0
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(0) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
  store <2 x i16> %res, ptr addrspace(0) %out
  ret void
}

define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
; GFX12-LABEL: sextload_P0_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    flat_load_d16_i8 v0, v[1:2]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[3:4], v0
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(0) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(0) %out
  ret void
}

define amdgpu_ps void @sextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
; GFX12-LABEL: sextload_P0_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    flat_load_d16_hi_i8 v0, v[1:2]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[3:4], v0
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(0) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(0) %out
  ret void
}

define amdgpu_ps void @zextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
; GFX12-LABEL: zextload_P0_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    flat_load_d16_u8 v0, v[1:2]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[3:4], v0
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(0) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(0) %out
  ret void
}

define amdgpu_ps void @zextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
; GFX12-LABEL: zextload_P0_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    flat_load_d16_hi_u8 v0, v[1:2]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b32 v[3:4], v0
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(0) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(0) %out
  ret void
}

; ----------------------------------------------------------------------------
; addrspace(1): checks expect global_load_d16* and global_store_b32.
; ----------------------------------------------------------------------------

define amdgpu_ps void @load_P1_B16_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: load_P1_B16_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_b16 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(1) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_P1_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: load_P1_B16_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(1) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: sextload_P1_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_i8 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(1) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @sextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: sextload_P1_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_hi_i8 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(1) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @zextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: zextload_P1_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_u8 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(1) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @zextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: zextload_P1_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_hi_u8 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(1) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

; ----------------------------------------------------------------------------
; addrspace(3): checks expect ds_load_*_d16* and ds_store_b32. Both pointers
; are checked as single registers (v1, v2) rather than register pairs.
; ----------------------------------------------------------------------------

define amdgpu_ps void @load_P3_B16_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
; GFX12-LABEL: load_P3_B16_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    ds_load_u16_d16 v0, v1
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    ds_store_b32 v2, v0
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(3) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
  store <2 x i16> %res, ptr addrspace(3) %out
  ret void
}

define amdgpu_ps void @load_P3_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
; GFX12-LABEL: load_P3_B16_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    ds_load_u16_d16_hi v0, v1
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    ds_store_b32 v2, v0
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(3) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
  store <2 x i16> %res, ptr addrspace(3) %out
  ret void
}

define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
; GFX12-LABEL: sextload_P3_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    ds_load_i8_d16 v0, v1
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    ds_store_b32 v2, v0
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(3) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(3) %out
  ret void
}

define amdgpu_ps void @sextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
; GFX12-LABEL: sextload_P3_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    ds_load_i8_d16_hi v0, v1
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    ds_store_b32 v2, v0
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(3) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(3) %out
  ret void
}

define amdgpu_ps void @zextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
; GFX12-LABEL: zextload_P3_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    ds_load_u8_d16 v0, v1
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    ds_store_b32 v2, v0
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(3) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(3) %out
  ret void
}

define amdgpu_ps void @zextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
; GFX12-LABEL: zextload_P3_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    ds_load_u8_d16_hi v0, v1
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    ds_store_b32 v2, v0
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(3) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(3) %out
  ret void
}

; ----------------------------------------------------------------------------
; addrspace(4): the load pointer is addrspace(4) but the result is stored
; through an addrspace(1) pointer. Checks expect the same global_load_d16* and
; global_store_b32 instructions as the addrspace(1) group.
; ----------------------------------------------------------------------------

define amdgpu_ps void @load_P4_B16_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: load_P4_B16_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_b16 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(4) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_P4_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: load_P4_B16_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(4) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: sextload_P4_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_i8 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(4) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @sextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: sextload_P4_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_hi_i8 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(4) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @zextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: zextload_P4_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_u8 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(4) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @zextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
; GFX12-LABEL: zextload_P4_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_d16_hi_u8 v0, v[1:2], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(4) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(1) %out
  ret void
}

; ----------------------------------------------------------------------------
; addrspace(5): checks expect scratch_load_d16* and scratch_store_b32. Both
; pointers are checked as single registers (v1, v2) rather than register pairs.
; ----------------------------------------------------------------------------

define amdgpu_ps void @load_P5_B16_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
; GFX12-LABEL: load_P5_B16_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_d16_b16 v0, v1, off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    scratch_store_b32 v2, v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(5) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
  store <2 x i16> %res, ptr addrspace(5) %out
  ret void
}

define amdgpu_ps void @load_P5_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
; GFX12-LABEL: load_P5_B16_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_d16_hi_b16 v0, v1, off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    scratch_store_b32 v2, v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(5) %ptra
  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
  store <2 x i16> %res, ptr addrspace(5) %out
  ret void
}

define amdgpu_ps void @sextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
; GFX12-LABEL: sextload_P5_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_d16_i8 v0, v1, off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    scratch_store_b32 v2, v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(5) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(5) %out
  ret void
}

define amdgpu_ps void @sextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
; GFX12-LABEL: sextload_P5_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_d16_hi_i8 v0, v1, off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    scratch_store_b32 v2, v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(5) %ptra
  %a16 = sext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(5) %out
  ret void
}

define amdgpu_ps void @zextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
; GFX12-LABEL: zextload_P5_i8_D16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_d16_u8 v0, v1, off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    scratch_store_b32 v2, v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(5) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
  store <2 x i16> %res, ptr addrspace(5) %out
  ret void
}

define amdgpu_ps void @zextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
; GFX12-LABEL: zextload_P5_i8_D16_Hi:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    scratch_load_d16_hi_u8 v0, v1, off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    scratch_store_b32 v2, v0, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(5) %ptra
  %a16 = zext i8 %a to i16
  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
  store <2 x i16> %res, ptr addrspace(5) %out
  ret void
}