; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s

; global address space, addrspace(1)

; gfx12, true16 is S16 16-bit load
; gfx12, without true 16 is S32 16-bit any-extending load
define amdgpu_ps void @load_uniform_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i16_gfx12:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    global_load_d16_b16 v2, v2, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_gfx12:
; GFX12-True16:       ; %bb.0:
; GFX12-True16-NEXT:    s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT:    s_wait_kmcnt 0x0
; GFX12-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT:    s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_gfx12:
; GFX12-NoTrue16:       ; %bb.0:
; GFX12-NoTrue16-NEXT:    s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT:    s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(1) %ptra
  store i16 %a, ptr addrspace(1) %out
  ret void
}

; gfx11, and older true16 is S16 16-bit load
; gfx11, and older without true 16 is S32 16-bit any-extending load
; both cases require align 4 and uniform mmo to widen mmo to 32-bit load
define amdgpu_ps void @load_uniform_P1_i16_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
; GFX12-True16:       ; %bb.0:
; GFX12-True16-NEXT:    s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT:    s_wait_kmcnt 0x0
; GFX12-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT:    s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11:
; GFX12-NoTrue16:       ; %bb.0:
; GFX12-NoTrue16-NEXT:    s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT:    s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(1) %ptra, align 4
  store i16 %a, ptr addrspace(1) %out
  ret void
}

; gfx12, S32 8-bit anyextending load, no difference regarding true 16
define amdgpu_ps void @load_uniform_P1_i8_any_extending_load(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i8_any_extending_load:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    global_load_u8 v2, v2, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_i8_any_extending_load:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b8 v[0:1], v2, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(1) %ptra
  store i8 %a, ptr addrspace(1) %out
  ret void
}

; gfx11 and older, S32 8-bit any-extending load, no difference regarding true 16
define amdgpu_ps void @load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b8 v[0:1], v2, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(1) %ptra, align 4
  store i8 %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(1) %ptra
  store i32 %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v2i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT:    s_endpgm
  %a = load <2 x i32>, ptr addrspace(1) %ptra
  store <2 x i32> %a, ptr addrspace(1) %out
  ret void
}

; gfx11, S96 load align 16(default) to load S128
define amdgpu_ps void @load_uniform_P1_v3i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v3i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v3i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT:    s_endpgm
  %a = load <3 x i32>, ptr addrspace(1) %ptra
  store <3 x i32> %a, ptr addrspace(1) %out
  ret void
}

; gfx11, S96 load align 4 to load S64 + load S32
define amdgpu_ps void @load_uniform_P1_v3i32_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v3i32_align4_gfx11:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_load_b32 s6, s[0:1], 0x8
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX11-NEXT:    v_mov_b32_e32 v4, s6
; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v3i32_align4_gfx11:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT:    s_endpgm
  %a = load <3 x i32>, ptr addrspace(1) %ptra, align 4
  store <3 x i32> %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v4i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT:    s_endpgm
  %a = load <4 x i32>, ptr addrspace(1) %ptra
  store <4 x i32> %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v8i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v8i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT:    s_endpgm
  %a = load <8 x i32>, ptr addrspace(1) %ptra
  store <8 x i32> %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P1_v16i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b512 s[0:15], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT:    v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX11-NEXT:    v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX11-NEXT:    v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX11-NEXT:    v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off offset:32
; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off offset:48
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v16i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b512 s[0:15], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT:    v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX12-NEXT:    v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX12-NEXT:    v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX12-NEXT:    v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX12-NEXT:    s_clause 0x3
; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT:    global_store_b128 v[0:1], v[10:13], off offset:32
; GFX12-NEXT:    global_store_b128 v[0:1], v[14:17], off offset:48
; GFX12-NEXT:    s_endpgm
  %a = load <16 x i32>, ptr addrspace(1) %ptra
  store <16 x i32> %a, ptr addrspace(1) %out
  ret void
}

; constant address space, addrspace(4)

define amdgpu_ps void @load_uniform_P4_i16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i16_gfx12:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    global_load_d16_b16 v2, v2, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P4_i16_gfx12:
; GFX12-True16:       ; %bb.0:
; GFX12-True16-NEXT:    s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT:    s_wait_kmcnt 0x0
; GFX12-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT:    s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_gfx12:
; GFX12-NoTrue16:       ; %bb.0:
; GFX12-NoTrue16-NEXT:    s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT:    s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(4) %ptra
  store i16 %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P4_i16_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
; GFX12-True16:       ; %bb.0:
; GFX12-True16-NEXT:    s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT:    s_wait_kmcnt 0x0
; GFX12-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT:    s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11:
; GFX12-NoTrue16:       ; %bb.0:
; GFX12-NoTrue16-NEXT:    s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT:    s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(4) %ptra, align 4
  store i16 %a, ptr addrspace(1) %out
  ret void
}

; gfx12, S32 8-bit anyextending load, no difference regarding true 16
define amdgpu_ps void @load_uniform_P4_i8_any_extending_load(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i8_any_extending_load:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    global_load_u8 v2, v2, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_i8_any_extending_load:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b8 v[0:1], v2, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(4) %ptra
  store i8 %a, ptr addrspace(1) %out
  ret void
}

; gfx11 and older, S32 8-bit any-extending load, no difference regarding true 16
define amdgpu_ps void @load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_u8 s0, s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b8 v[0:1], v2, off
; GFX12-NEXT:    s_endpgm
  %a = load i8, ptr addrspace(4) %ptra, align 4
  store i8 %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(4) %ptra
  store i32 %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v2i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT:    s_endpgm
  %a = load <2 x i32>, ptr addrspace(4) %ptra
  store <2 x i32> %a, ptr addrspace(1) %out
  ret void
}

; gfx11, S96 load align 16(default) to load S128
define amdgpu_ps void @load_uniform_P4_v3i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v3i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v3i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT:    s_endpgm
  %a = load <3 x i32>, ptr addrspace(4) %ptra
  store <3 x i32> %a, ptr addrspace(1) %out
  ret void
}

; gfx11, S96 load align 4 to load S64 + load S32
define amdgpu_ps void @load_uniform_P4_v3i32_align4_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v3i32_align4_gfx11:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_load_b32 s6, s[0:1], 0x8
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
; GFX11-NEXT:    v_mov_b32_e32 v4, s6
; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v3i32_align4_gfx11:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT:    s_endpgm
  %a = load <3 x i32>, ptr addrspace(4) %ptra, align 4
  store <3 x i32> %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v4i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT:    s_endpgm
  %a = load <4 x i32>, ptr addrspace(4) %ptra
  store <4 x i32> %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v8i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v8i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT:    s_endpgm
  %a = load <8 x i32>, ptr addrspace(4) %ptra
  store <8 x i32> %a, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) {
; GFX11-LABEL: load_uniform_P4_v16i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b512 s[0:15], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT:    v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX11-NEXT:    v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX11-NEXT:    v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX11-NEXT:    v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off offset:32
; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off offset:48
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v16i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b512 s[0:15], s[0:1], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT:    v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX12-NEXT:    v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
; GFX12-NEXT:    v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX12-NEXT:    v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX12-NEXT:    s_clause 0x3
; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT:    global_store_b128 v[0:1], v[10:13], off offset:32
; GFX12-NEXT:    global_store_b128 v[0:1], v[14:17], off offset:48
; GFX12-NEXT:    s_endpgm
  %a = load <16 x i32>, ptr addrspace(4) %ptra
  store <16 x i32> %a, ptr addrspace(1) %out
  ret void
}