; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-True16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-NoTrue16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s

; Every function below loads twice through 'inreg' pointers: one plain load
; (sometimes with reduced alignment) and one volatile load, then stores the sum
; through the non-inreg %out pointer. The assertions show these loads selected
; as VMEM loads (buffer_load_* / global_load_*) whose results are moved to SGPRs
; with v_readfirstlane_b32; the only exception in this section is the
; naturally aligned, non-volatile i16 load on gfx1200, which is selected as
; s_load_u16.
;
; The *-True16 prefixes are produced with -mattr=+real-true16 and the
; *-NoTrue16 prefixes with -mattr=-real-true16 (see the RUN lines).

; global address space, addrspace(1)

; gfx12 true16, not natural alignment or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i16_b16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s4, s2
; GFX7-NEXT:    s_mov_b32 s5, s3
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
; GFX7-NEXT:    v_readfirstlane_b32 s1, v3
; GFX7-NEXT:    s_add_i32 s0, s0, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX11-True16:       ; %bb.0:
; GFX11-True16-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT:    s_clause 0x1
; GFX11-True16-NEXT:    global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT:    global_load_d16_b16 v2, v2, s[2:3] glc dlc
; GFX11-True16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-True16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT:    s_add_i32 s0, s0, s1
; GFX11-True16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT:    s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX11-NoTrue16:       ; %bb.0:
; GFX11-NoTrue16-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT:    s_clause 0x1
; GFX11-NoTrue16-NEXT:    global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT:    global_load_u16 v2, v2, s[2:3] glc dlc
; GFX11-NoTrue16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT:    s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT:    s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX12-True16:       ; %bb.0:
; GFX12-True16-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT:    s_clause 0x1
; GFX12-True16-NEXT:    global_load_d16_b16 v3, v2, s[0:1]
; GFX12-True16-NEXT:    global_load_d16_b16 v2, v2, s[2:3] scope:SCOPE_SYS
; GFX12-True16-NEXT:    s_wait_loadcnt 0x0
; GFX12-True16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-True16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX12-True16-NEXT:    s_add_co_i32 s0, s0, s1
; GFX12-True16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT:    s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX12-NoTrue16:       ; %bb.0:
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT:    s_clause 0x1
; GFX12-NoTrue16-NEXT:    global_load_u16 v3, v2, s[0:1]
; GFX12-NoTrue16-NEXT:    global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS
; GFX12-NoTrue16-NEXT:    s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-NoTrue16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX12-NoTrue16-NEXT:    s_add_co_i32 s0, s0, s1
; GFX12-NoTrue16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(1) %ptra, align 1
  %b = load volatile i16, ptr addrspace(1) %ptrb
  %sum = add i16 %a, %b
  store i16 %sum, ptr addrspace(1) %out
  ret void
}

; gfx11 true16, 16-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i16_b16_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
; GFX7-NEXT:    v_readfirstlane_b32 s1, v3
; GFX7-NEXT:    s_add_i32 s0, s0, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX11-True16:       ; %bb.0:
; GFX11-True16-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT:    s_clause 0x1
; GFX11-True16-NEXT:    global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT:    global_load_d16_b16 v2, v2, s[0:1] glc dlc
; GFX11-True16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-True16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT:    s_add_i32 s0, s0, s1
; GFX11-True16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT:    s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX11-NoTrue16:       ; %bb.0:
; GFX11-NoTrue16-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT:    s_clause 0x1
; GFX11-NoTrue16-NEXT:    global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT:    global_load_u16 v2, v2, s[0:1] glc dlc
; GFX11-NoTrue16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT:    s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT:    s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX12-True16:       ; %bb.0:
; GFX12-True16-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT:    s_load_u16 s2, s[0:1], 0x0
; GFX12-True16-NEXT:    global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-True16-NEXT:    s_wait_loadcnt 0x0
; GFX12-True16-NEXT:    v_readfirstlane_b32 s0, v2
; GFX12-True16-NEXT:    s_wait_kmcnt 0x0
; GFX12-True16-NEXT:    s_add_co_i32 s0, s2, s0
; GFX12-True16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT:    s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX12-NoTrue16:       ; %bb.0:
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT:    s_load_u16 s2, s[0:1], 0x0
; GFX12-NoTrue16-NEXT:    global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NoTrue16-NEXT:    s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT:    v_readfirstlane_b32 s0, v2
; GFX12-NoTrue16-NEXT:    s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT:    s_add_co_i32 s0, s2, s0
; GFX12-NoTrue16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(1) %ptra
  %b = load volatile i16, ptr addrspace(1) %ptra, align 4
  %sum = add i16 %a, %b
  store i16 %sum, ptr addrspace(1) %out
  ret void
}

; gfx12 without true16, 16-bit any-extending load, not natural alignment or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
; GFX7-NEXT:    v_readfirstlane_b32 s1, v3
; GFX7-NEXT:    s_add_i32 s0, s0, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX11-True16:       ; %bb.0:
; GFX11-True16-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT:    s_clause 0x1
; GFX11-True16-NEXT:    global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT:    global_load_d16_b16 v2, v2, s[0:1] glc dlc
; GFX11-True16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-True16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT:    s_add_i32 s0, s0, s1
; GFX11-True16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT:    s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX11-NoTrue16:       ; %bb.0:
; GFX11-NoTrue16-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT:    s_clause 0x1
; GFX11-NoTrue16-NEXT:    global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT:    global_load_u16 v2, v2, s[0:1] glc dlc
; GFX11-NoTrue16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT:    s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT:    s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX12-True16:       ; %bb.0:
; GFX12-True16-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT:    s_clause 0x1
; GFX12-True16-NEXT:    global_load_d16_b16 v3, v2, s[0:1]
; GFX12-True16-NEXT:    global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-True16-NEXT:    s_wait_loadcnt 0x0
; GFX12-True16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-True16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX12-True16-NEXT:    s_add_co_i32 s0, s0, s1
; GFX12-True16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT:    s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX12-NoTrue16:       ; %bb.0:
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT:    s_clause 0x1
; GFX12-NoTrue16-NEXT:    global_load_u16 v3, v2, s[0:1]
; GFX12-NoTrue16-NEXT:    global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NoTrue16-NEXT:    s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-NoTrue16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX12-NoTrue16-NEXT:    s_add_co_i32 s0, s0, s1
; GFX12-NoTrue16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(1) %ptra, align 1
  %b = load volatile i16, ptr addrspace(1) %ptra
  %sum = add i16 %a, %b
  store i16 %sum, ptr addrspace(1) %out
  ret void
}

; gfx11 (or older) without true16, s16 any-extending load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
; GFX7-NEXT:    v_readfirstlane_b32 s1, v3
; GFX7-NEXT:    s_add_i32 s0, s0, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX11-True16:       ; %bb.0:
; GFX11-True16-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT:    s_clause 0x1
; GFX11-True16-NEXT:    global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT:    global_load_d16_b16 v2, v2, s[0:1] glc dlc
; GFX11-True16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-True16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT:    s_add_i32 s0, s0, s1
; GFX11-True16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT:    s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX11-NoTrue16:       ; %bb.0:
; GFX11-NoTrue16-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT:    s_clause 0x1
; GFX11-NoTrue16-NEXT:    global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT:    global_load_u16 v2, v2, s[0:1] glc dlc
; GFX11-NoTrue16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT:    s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT:    s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX12-True16:       ; %bb.0:
; GFX12-True16-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT:    s_load_u16 s2, s[0:1], 0x0
; GFX12-True16-NEXT:    global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-True16-NEXT:    s_wait_loadcnt 0x0
; GFX12-True16-NEXT:    v_readfirstlane_b32 s0, v2
; GFX12-True16-NEXT:    s_wait_kmcnt 0x0
; GFX12-True16-NEXT:    s_add_co_i32 s0, s2, s0
; GFX12-True16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT:    v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT:    s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX12-NoTrue16:       ; %bb.0:
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT:    s_load_u16 s2, s[0:1], 0x0
; GFX12-NoTrue16-NEXT:    global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NoTrue16-NEXT:    s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT:    v_readfirstlane_b32 s0, v2
; GFX12-NoTrue16-NEXT:    s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT:    s_add_co_i32 s0, s2, s0
; GFX12-NoTrue16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT:    global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT:    s_endpgm
  %a = load i16, ptr addrspace(1) %ptra
  %b = load volatile i16, ptr addrspace(1) %ptra, align 4
  %sum = add i16 %a, %b
  store i16 %sum, ptr addrspace(1) %out
  ret void
}

; any target, 32-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], 0
; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
; GFX7-NEXT:    v_readfirstlane_b32 s1, v3
; GFX7-NEXT:    s_add_i32 s0, s0, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX11-LABEL: load_uniform_P1_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v3, v2, s[0:1]
; GFX11-NEXT:    global_load_b32 v2, v2, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-NEXT:    s_add_i32 s0, s0, s1
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_load_b32 v3, v2, s[0:1]
; GFX12-NEXT:    global_load_b32 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-NEXT:    v_readfirstlane_b32 s1, v2
; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(1) %ptra, align 2
  %b = load volatile i32, ptr addrspace(1) %ptra
  %sum = add i32 %a, %b
  store i32 %sum, ptr addrspace(1) %out
  ret void
}

; any target, 64-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_readfirstlane_b32 s1, v3
; GFX7-NEXT:    v_readfirstlane_b32 s5, v5
; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
; GFX7-NEXT:    v_readfirstlane_b32 s4, v4
; GFX7-NEXT:    s_add_i32 s1, s1, s5
; GFX7-NEXT:    s_add_i32 s0, s0, s4
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[0:1]
; GFX11-NEXT:    global_load_b64 v[4:5], v4, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
; GFX11-NEXT:    s_add_i32 s1, s1, s3
; GFX11-NEXT:    s_add_i32 s0, s0, s2
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v2i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_e32 v4, 0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_load_b64 v[2:3], v4, s[0:1]
; GFX12-NEXT:    global_load_b64 v[4:5], v4, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
; GFX12-NEXT:    v_readfirstlane_b32 s3, v5
; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
; GFX12-NEXT:    v_readfirstlane_b32 s2, v4
; GFX12-NEXT:    s_add_co_i32 s1, s1, s3
; GFX12-NEXT:    s_add_co_i32 s0, s0, s2
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT:    s_endpgm
  %a = load <2 x i32>, ptr addrspace(1) %ptra, align 2
  %b = load volatile <2 x i32>, ptr addrspace(1) %ptra
  %sum = add <2 x i32> %a, %b
  store <2 x i32> %sum, ptr addrspace(1) %out
  ret void
}

; any target, 96-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v3i32_gfx12:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_dwordx3 v[2:4], off, s[0:3], 0
; GFX7-NEXT:    buffer_load_dwordx3 v[5:7], off, s[0:3], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
; GFX7-NEXT:    v_readfirstlane_b32 s4, v5
; GFX7-NEXT:    v_readfirstlane_b32 s1, v3
; GFX7-NEXT:    v_readfirstlane_b32 s6, v4
; GFX7-NEXT:    v_readfirstlane_b32 s5, v6
; GFX7-NEXT:    v_readfirstlane_b32 s7, v7
; GFX7-NEXT:    s_add_i32 s4, s0, s4
; GFX7-NEXT:    s_add_i32 s5, s1, s5
; GFX7-NEXT:    s_add_i32 s6, s6, s7
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    v_mov_b32_e32 v4, s6
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v3i32_gfx12:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v5, 0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b96 v[2:4], v5, s[0:1]
; GFX11-NEXT:    global_load_b96 v[5:7], v5, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
; GFX11-NEXT:    v_readfirstlane_b32 s5, v7
; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
; GFX11-NEXT:    v_readfirstlane_b32 s4, v6
; GFX11-NEXT:    s_add_i32 s2, s2, s5
; GFX11-NEXT:    s_add_i32 s0, s0, s3
; GFX11-NEXT:    s_add_i32 s1, s1, s4
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s0
; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v3i32_gfx12:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_e32 v5, 0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_load_b96 v[2:4], v5, s[0:1]
; GFX12-NEXT:    global_load_b96 v[5:7], v5, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_readfirstlane_b32 s2, v4
; GFX12-NEXT:    v_readfirstlane_b32 s5, v7
; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
; GFX12-NEXT:    v_readfirstlane_b32 s3, v5
; GFX12-NEXT:    v_readfirstlane_b32 s4, v6
; GFX12-NEXT:    s_add_co_i32 s2, s2, s5
; GFX12-NEXT:    s_add_co_i32 s0, s0, s3
; GFX12-NEXT:    s_add_co_i32 s1, s1, s4
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT:    v_mov_b32_e32 v2, s0
; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT:    s_endpgm
  %a = load <3 x i32>, ptr addrspace(1) %ptra, align 2
  %b = load volatile <3 x i32>, ptr addrspace(1) %ptra
  %sum = add <3 x i32> %a, %b
  store <3 x i32> %sum, ptr addrspace(1) %out
  ret void
}

; any target, 128-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], off, s[0:3], 0
; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], off, s[0:3], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
; GFX7-NEXT:    v_readfirstlane_b32 s4, v6
; GFX7-NEXT:    v_readfirstlane_b32 s1, v3
; GFX7-NEXT:    v_readfirstlane_b32 s6, v4
; GFX7-NEXT:    v_readfirstlane_b32 s7, v5
; GFX7-NEXT:    v_readfirstlane_b32 s5, v7
; GFX7-NEXT:    v_readfirstlane_b32 s8, v8
; GFX7-NEXT:    v_readfirstlane_b32 s9, v9
; GFX7-NEXT:    s_add_i32 s4, s0, s4
; GFX7-NEXT:    s_add_i32 s5, s1, s5
; GFX7-NEXT:    s_add_i32 s6, s6, s8
; GFX7-NEXT:    s_add_i32 s7, s7, s9
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    v_mov_b32_e32 v4, s6
; GFX7-NEXT:    v_mov_b32_e32 v5, s7
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v6, 0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[2:5], v6, s[0:1]
; GFX11-NEXT:    global_load_b128 v[6:9], v6, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
; GFX11-NEXT:    v_readfirstlane_b32 s7, v9
; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
; GFX11-NEXT:    v_readfirstlane_b32 s4, v6
; GFX11-NEXT:    v_readfirstlane_b32 s5, v7
; GFX11-NEXT:    v_readfirstlane_b32 s6, v8
; GFX11-NEXT:    s_add_i32 s3, s3, s7
; GFX11-NEXT:    s_add_i32 s0, s0, s4
; GFX11-NEXT:    s_add_i32 s1, s1, s5
; GFX11-NEXT:    s_add_i32 s2, s2, s6
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v4i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_e32 v6, 0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_load_b128 v[2:5], v6, s[0:1]
; GFX12-NEXT:    global_load_b128 v[6:9], v6, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_readfirstlane_b32 s3, v5
; GFX12-NEXT:    v_readfirstlane_b32 s7, v9
; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
; GFX12-NEXT:    v_readfirstlane_b32 s2, v4
; GFX12-NEXT:    v_readfirstlane_b32 s4, v6
; GFX12-NEXT:    v_readfirstlane_b32 s5, v7
; GFX12-NEXT:    v_readfirstlane_b32 s6, v8
; GFX12-NEXT:    s_add_co_i32 s3, s3, s7
; GFX12-NEXT:    s_add_co_i32 s0, s0, s4
; GFX12-NEXT:    s_add_co_i32 s1, s1, s5
; GFX12-NEXT:    s_add_co_i32 s2, s2, s6
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT:    s_endpgm
  %a = load <4 x i32>, ptr addrspace(1) %ptra, align 2
  %b = load volatile <4 x i32>, ptr addrspace(1) %ptra
  %sum = add <4 x i32> %a, %b
  store <4 x i32> %sum, ptr addrspace(1) %out
  ret void
}

; any target, 256-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v8i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], off, s[0:3], 0
; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GFX7-NEXT:    buffer_load_dwordx4 v[10:13], off, s[0:3], 0 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:16 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    v_readfirstlane_b32 s4, v2
; GFX7-NEXT:    v_readfirstlane_b32 s5, v3
; GFX7-NEXT:    v_readfirstlane_b32 s12, v10
; GFX7-NEXT:    v_readfirstlane_b32 s6, v4
; GFX7-NEXT:    v_readfirstlane_b32 s7, v5
; GFX7-NEXT:    v_readfirstlane_b32 s8, v6
; GFX7-NEXT:    v_readfirstlane_b32 s13, v11
; GFX7-NEXT:    v_readfirstlane_b32 s14, v12
; GFX7-NEXT:    v_readfirstlane_b32 s15, v13
; GFX7-NEXT:    v_readfirstlane_b32 s16, v14
; GFX7-NEXT:    s_add_i32 s4, s4, s12
; GFX7-NEXT:    v_readfirstlane_b32 s9, v7
; GFX7-NEXT:    v_readfirstlane_b32 s10, v8
; GFX7-NEXT:    v_readfirstlane_b32 s11, v9
; GFX7-NEXT:    v_readfirstlane_b32 s17, v15
; GFX7-NEXT:    v_readfirstlane_b32 s18, v16
; GFX7-NEXT:    v_readfirstlane_b32 s19, v17
; GFX7-NEXT:    s_add_i32 s5, s5, s13
; GFX7-NEXT:    s_add_i32 s6, s6, s14
; GFX7-NEXT:    s_add_i32 s7, s7, s15
; GFX7-NEXT:    s_add_i32 s8, s8, s16
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    s_add_i32 s9, s9, s17
; GFX7-NEXT:    s_add_i32 s10, s10, s18
; GFX7-NEXT:    s_add_i32 s11, s11, s19
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    v_mov_b32_e32 v4, s6
; GFX7-NEXT:    v_mov_b32_e32 v5, s7
; GFX7-NEXT:    v_mov_b32_e32 v6, s8
; GFX7-NEXT:    v_mov_b32_e32 v7, s9
; GFX7-NEXT:    v_mov_b32_e32 v8, s10
; GFX7-NEXT:    v_mov_b32_e32 v9, s11
; GFX7-NEXT:    buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v8i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v14, 0
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    global_load_b128 v[2:5], v14, s[0:1]
; GFX11-NEXT:    global_load_b128 v[6:9], v14, s[0:1] offset:16
; GFX11-NEXT:    global_load_b128 v[10:13], v14, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b128 v[14:17], v14, s[0:1] offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
; GFX11-NEXT:    v_readfirstlane_b32 s11, v13
; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
; GFX11-NEXT:    v_readfirstlane_b32 s7, v9
; GFX11-NEXT:    v_readfirstlane_b32 s8, v10
; GFX11-NEXT:    v_readfirstlane_b32 s9, v11
; GFX11-NEXT:    v_readfirstlane_b32 s10, v12
; GFX11-NEXT:    v_readfirstlane_b32 s15, v17
; GFX11-NEXT:    v_readfirstlane_b32 s4, v6
; GFX11-NEXT:    v_readfirstlane_b32 s5, v7
; GFX11-NEXT:    v_readfirstlane_b32 s6, v8
; GFX11-NEXT:    v_readfirstlane_b32 s12, v14
; GFX11-NEXT:    v_readfirstlane_b32 s13, v15
; GFX11-NEXT:    v_readfirstlane_b32 s14, v16
; GFX11-NEXT:    s_add_i32 s3, s3, s11
; GFX11-NEXT:    s_add_i32 s0, s0, s8
; GFX11-NEXT:    s_add_i32 s1, s1, s9
; GFX11-NEXT:    s_add_i32 s2, s2, s10
; GFX11-NEXT:    s_add_i32 s7, s7, s15
; GFX11-NEXT:    s_add_i32 s4, s4, s12
; GFX11-NEXT:    s_add_i32 s5, s5, s13
; GFX11-NEXT:    s_add_i32 s6, s6, s14
; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v8i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_mov_b32_e32 v14, 0
; GFX12-NEXT:    s_clause 0x2
; GFX12-NEXT:    global_load_b128 v[2:5], v14, s[0:1]
; GFX12-NEXT:    global_load_b128 v[6:9], v14, s[0:1] offset:16
; GFX12-NEXT:    global_load_b128 v[10:13], v14, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_load_b128 v[14:17], v14, s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_readfirstlane_b32 s3, v5
; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
; GFX12-NEXT:    v_readfirstlane_b32 s11, v13
; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
; GFX12-NEXT:    v_readfirstlane_b32 s2, v4
; GFX12-NEXT:    v_readfirstlane_b32 s7, v9
; GFX12-NEXT:    v_readfirstlane_b32 s8, v10
; GFX12-NEXT:    v_readfirstlane_b32 s9, v11
; GFX12-NEXT:    v_readfirstlane_b32 s10, v12
; GFX12-NEXT:    v_readfirstlane_b32 s15, v17
; GFX12-NEXT:    v_readfirstlane_b32 s4, v6
; GFX12-NEXT:    v_readfirstlane_b32 s5, v7
; GFX12-NEXT:    v_readfirstlane_b32 s6, v8
; GFX12-NEXT:    v_readfirstlane_b32 s12, v14
; GFX12-NEXT:    v_readfirstlane_b32 s13, v15
; GFX12-NEXT:    v_readfirstlane_b32 s14, v16
; GFX12-NEXT:    s_add_co_i32 s3, s3, s11
; GFX12-NEXT:    s_add_co_i32 s0, s0, s8
; GFX12-NEXT:    s_add_co_i32 s1, s1, s9
; GFX12-NEXT:    s_add_co_i32 s2, s2, s10
; GFX12-NEXT:    s_add_co_i32 s7, s7, s15
; GFX12-NEXT:    s_add_co_i32 s4, s4, s12
; GFX12-NEXT:    s_add_co_i32 s5, s5, s13
; GFX12-NEXT:    s_add_co_i32 s6, s6, s14
; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT:    v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT:    v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT:    s_endpgm
  %a = load <8 x i32>, ptr addrspace(1) %ptra, align 2
  %b = load volatile <8 x i32>, ptr addrspace(1) %ptra
  %sum = add <8 x i32> %a, %b
  store <8 x i32> %sum, ptr addrspace(1) %out
  ret void
}

; any target, 512-bit load, not align 4 or not uniform mmo
define amdgpu_ps void
@load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P1_v16i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_readfirstlane_b32 s5, v3 ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s8, v6 ; GFX7-NEXT: v_readfirstlane_b32 s9, v7 ; GFX7-NEXT: v_readfirstlane_b32 s10, v8 ; GFX7-NEXT: v_readfirstlane_b32 s11, v9 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s12, v10 ; GFX7-NEXT: v_readfirstlane_b32 s13, v11 ; GFX7-NEXT: v_readfirstlane_b32 s14, v12 ; GFX7-NEXT: v_readfirstlane_b32 s15, v13 ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s16, v14 ; GFX7-NEXT: v_readfirstlane_b32 s17, v15 ; GFX7-NEXT: v_readfirstlane_b32 s18, v16 ; GFX7-NEXT: v_readfirstlane_b32 s19, v17 ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_readfirstlane_b32 s20, v2 ; GFX7-NEXT: v_readfirstlane_b32 s21, v3 ; GFX7-NEXT: v_readfirstlane_b32 s22, v4 ; GFX7-NEXT: v_readfirstlane_b32 s23, v5 ; GFX7-NEXT: s_add_i32 s4, s4, s20 ; GFX7-NEXT: v_readfirstlane_b32 s24, v6 ; GFX7-NEXT: v_readfirstlane_b32 s25, v7 ; GFX7-NEXT: v_readfirstlane_b32 s26, v8 ; GFX7-NEXT: v_readfirstlane_b32 s27, 
v9 ; GFX7-NEXT: s_add_i32 s5, s5, s21 ; GFX7-NEXT: v_readfirstlane_b32 s28, v10 ; GFX7-NEXT: v_readfirstlane_b32 s29, v11 ; GFX7-NEXT: v_readfirstlane_b32 s30, v12 ; GFX7-NEXT: v_readfirstlane_b32 s31, v13 ; GFX7-NEXT: s_add_i32 s6, s6, s22 ; GFX7-NEXT: v_readfirstlane_b32 s33, v14 ; GFX7-NEXT: v_readfirstlane_b32 s34, v15 ; GFX7-NEXT: v_readfirstlane_b32 s35, v16 ; GFX7-NEXT: v_readfirstlane_b32 s36, v17 ; GFX7-NEXT: s_add_i32 s7, s7, s23 ; GFX7-NEXT: s_add_i32 s8, s8, s24 ; GFX7-NEXT: s_add_i32 s12, s12, s28 ; GFX7-NEXT: s_add_i32 s16, s16, s33 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s9, s25 ; GFX7-NEXT: s_add_i32 s10, s10, s26 ; GFX7-NEXT: s_add_i32 s11, s11, s27 ; GFX7-NEXT: s_add_i32 s13, s13, s29 ; GFX7-NEXT: s_add_i32 s14, s14, s30 ; GFX7-NEXT: s_add_i32 s15, s15, s31 ; GFX7-NEXT: s_add_i32 s17, s17, s34 ; GFX7-NEXT: s_add_i32 s18, s18, s35 ; GFX7-NEXT: s_add_i32 s19, s19, s36 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: v_mov_b32_e32 v10, s12 ; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v7, s9 ; GFX7-NEXT: v_mov_b32_e32 v8, s10 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-NEXT: v_mov_b32_e32 v11, s13 ; GFX7-NEXT: v_mov_b32_e32 v12, s14 ; GFX7-NEXT: v_mov_b32_e32 v13, s15 ; GFX7-NEXT: v_mov_b32_e32 v15, s17 ; GFX7-NEXT: v_mov_b32_e32 v16, s18 ; GFX7-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_uniform_P1_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v30, 0 ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: global_load_b128 v[2:5], v30, s[0:1] ; GFX11-NEXT: global_load_b128 v[6:9], v30, s[0:1] 
offset:16 ; GFX11-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32 ; GFX11-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48 ; GFX11-NEXT: global_load_b128 v[18:21], v30, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 ; GFX11-NEXT: v_readfirstlane_b32 s19, v21 ; GFX11-NEXT: v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: v_readfirstlane_b32 s16, v18 ; GFX11-NEXT: v_readfirstlane_b32 s17, v19 ; GFX11-NEXT: v_readfirstlane_b32 s18, v20 ; GFX11-NEXT: v_readfirstlane_b32 s23, v25 ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 ; GFX11-NEXT: v_readfirstlane_b32 s11, v13 ; GFX11-NEXT: v_readfirstlane_b32 s20, v22 ; GFX11-NEXT: v_readfirstlane_b32 s21, v23 ; GFX11-NEXT: v_readfirstlane_b32 s22, v24 ; GFX11-NEXT: v_readfirstlane_b32 s27, v29 ; GFX11-NEXT: v_readfirstlane_b32 s8, v10 ; GFX11-NEXT: v_readfirstlane_b32 s9, v11 ; GFX11-NEXT: v_readfirstlane_b32 s10, v12 ; GFX11-NEXT: v_readfirstlane_b32 s15, v17 ; GFX11-NEXT: v_readfirstlane_b32 s24, v26 ; GFX11-NEXT: v_readfirstlane_b32 s25, v27 ; GFX11-NEXT: v_readfirstlane_b32 s26, v28 ; GFX11-NEXT: v_readfirstlane_b32 s31, v33 ; GFX11-NEXT: v_readfirstlane_b32 s12, v14 ; GFX11-NEXT: v_readfirstlane_b32 s13, v15 ; GFX11-NEXT: v_readfirstlane_b32 s14, v16 ; GFX11-NEXT: v_readfirstlane_b32 s28, v30 ; GFX11-NEXT: v_readfirstlane_b32 s29, v31 ; GFX11-NEXT: v_readfirstlane_b32 s30, v32 ; GFX11-NEXT: s_add_i32 s3, s3, s19 ; GFX11-NEXT: s_add_i32 s0, s0, s16 ; GFX11-NEXT: s_add_i32 s1, 
s1, s17 ; GFX11-NEXT: s_add_i32 s2, s2, s18 ; GFX11-NEXT: s_add_i32 s7, s7, s23 ; GFX11-NEXT: s_add_i32 s4, s4, s20 ; GFX11-NEXT: s_add_i32 s5, s5, s21 ; GFX11-NEXT: s_add_i32 s6, s6, s22 ; GFX11-NEXT: s_add_i32 s11, s11, s27 ; GFX11-NEXT: v_mov_b32_e32 v5, s3 ; GFX11-NEXT: s_add_i32 s8, s8, s24 ; GFX11-NEXT: s_add_i32 s9, s9, s25 ; GFX11-NEXT: s_add_i32 s10, s10, s26 ; GFX11-NEXT: s_add_i32 s15, s15, s31 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 ; GFX11-NEXT: s_add_i32 s12, s12, s28 ; GFX11-NEXT: s_add_i32 s13, s13, s29 ; GFX11-NEXT: s_add_i32 s14, s14, s30 ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 ; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 ; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 ; GFX11-NEXT: v_mov_b32_e32 v14, s12 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 ; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_uniform_P1_v16i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v30, 0 ; GFX12-NEXT: s_clause 0x4 ; GFX12-NEXT: global_load_b128 v[2:5], v30, s[0:1] ; GFX12-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16 ; GFX12-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32 ; GFX12-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48 ; GFX12-NEXT: global_load_b128 v[18:21], v30, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
global_load_b128 v[30:33], v30, s[0:1] offset:48 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 ; GFX12-NEXT: v_readfirstlane_b32 s19, v21 ; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: v_readfirstlane_b32 s16, v18 ; GFX12-NEXT: v_readfirstlane_b32 s17, v19 ; GFX12-NEXT: v_readfirstlane_b32 s18, v20 ; GFX12-NEXT: v_readfirstlane_b32 s23, v25 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 ; GFX12-NEXT: v_readfirstlane_b32 s11, v13 ; GFX12-NEXT: v_readfirstlane_b32 s20, v22 ; GFX12-NEXT: v_readfirstlane_b32 s21, v23 ; GFX12-NEXT: v_readfirstlane_b32 s22, v24 ; GFX12-NEXT: v_readfirstlane_b32 s27, v29 ; GFX12-NEXT: v_readfirstlane_b32 s8, v10 ; GFX12-NEXT: v_readfirstlane_b32 s9, v11 ; GFX12-NEXT: v_readfirstlane_b32 s10, v12 ; GFX12-NEXT: v_readfirstlane_b32 s15, v17 ; GFX12-NEXT: v_readfirstlane_b32 s24, v26 ; GFX12-NEXT: v_readfirstlane_b32 s25, v27 ; GFX12-NEXT: v_readfirstlane_b32 s26, v28 ; GFX12-NEXT: v_readfirstlane_b32 s31, v33 ; GFX12-NEXT: v_readfirstlane_b32 s12, v14 ; GFX12-NEXT: v_readfirstlane_b32 s13, v15 ; GFX12-NEXT: v_readfirstlane_b32 s14, v16 ; GFX12-NEXT: v_readfirstlane_b32 s28, v30 ; GFX12-NEXT: v_readfirstlane_b32 s29, v31 ; GFX12-NEXT: v_readfirstlane_b32 s30, v32 ; GFX12-NEXT: s_add_co_i32 s3, s3, s19 ; GFX12-NEXT: s_add_co_i32 s0, s0, s16 ; GFX12-NEXT: s_add_co_i32 s1, s1, s17 ; GFX12-NEXT: s_add_co_i32 s2, s2, s18 ; GFX12-NEXT: s_add_co_i32 s7, s7, s23 ; GFX12-NEXT: s_add_co_i32 s4, s4, s20 ; GFX12-NEXT: s_add_co_i32 s5, s5, s21 ; GFX12-NEXT: s_add_co_i32 s6, s6, s22 ; GFX12-NEXT: s_add_co_i32 s11, s11, s27 ; GFX12-NEXT: v_mov_b32_e32 v5, s3 ; GFX12-NEXT: s_add_co_i32 s8, s8, s24 ; GFX12-NEXT: s_add_co_i32 s9, s9, s25 ; GFX12-NEXT: s_add_co_i32 s10, s10, s26 ; GFX12-NEXT: s_add_co_i32 s15, s15, 
s31 ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 ; GFX12-NEXT: s_add_co_i32 s12, s12, s28 ; GFX12-NEXT: s_add_co_i32 s13, s13, s29 ; GFX12-NEXT: s_add_co_i32 s14, s14, s30 ; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 ; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 ; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 ; GFX12-NEXT: v_mov_b32_e32 v14, s12 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 ; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX12-NEXT: s_endpgm %a = load <16 x i32>, ptr addrspace(1) %ptra, align 2 %b = load volatile <16 x i32>, ptr addrspace(1) %ptra %sum = add <16 x i32> %a, %b store <16 x i32> %sum, ptr addrspace(1) %out ret void } ; local address space, addrspace(3) define amdgpu_ps void @load_divergent_P3_i8_any_extending(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { ; GFX7-LABEL: load_divergent_P3_i8_any_extending: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u8 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_divergent_P3_i8_any_extending: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: ds_load_u8 v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_divergent_P3_i8_any_extending: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: ds_load_u8 v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ds_store_b8 v0, v1 ; GFX12-NEXT: s_endpgm %a = load i8, ptr addrspace(3) %ptra store i8 %a, ptr addrspace(3) %out ret void } ; with 
true16, S16 16-bit load ; without true16, S32 16-bit any-extending load define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { ; GFX7-LABEL: load_divergent_P3_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u16 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b16 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX11-True16-LABEL: load_divergent_P3_i16: ; GFX11-True16: ; %bb.0: ; GFX11-True16-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-True16-NEXT: ds_load_u16_d16 v1, v1 ; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-True16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11-True16-NEXT: ds_store_b16 v0, v1 ; GFX11-True16-NEXT: s_endpgm ; ; GFX11-NoTrue16-LABEL: load_divergent_P3_i16: ; GFX11-NoTrue16: ; %bb.0: ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NoTrue16-NEXT: ds_load_u16 v1, v1 ; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NoTrue16-NEXT: ds_store_b16 v0, v1 ; GFX11-NoTrue16-NEXT: s_endpgm ; ; GFX12-True16-LABEL: load_divergent_P3_i16: ; GFX12-True16: ; %bb.0: ; GFX12-True16-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-True16-NEXT: ds_load_u16_d16 v1, v1 ; GFX12-True16-NEXT: s_wait_dscnt 0x0 ; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12-True16-NEXT: s_wait_alu 0xf1ff ; GFX12-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-True16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX12-True16-NEXT: ds_store_b16 v0, v1 ; GFX12-True16-NEXT: s_endpgm ; ; GFX12-NoTrue16-LABEL: load_divergent_P3_i16: ; GFX12-NoTrue16: ; %bb.0: ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NoTrue16-NEXT: ds_load_u16 v1, v1 ; GFX12-NoTrue16-NEXT: s_wait_dscnt 0x0 ; GFX12-NoTrue16-NEXT: ds_store_b16 v0, v1 ; GFX12-NoTrue16-NEXT: s_endpgm %a = load i16, ptr addrspace(3) %ptra store i16 %a, ptr addrspace(3) %out ret void } define amdgpu_ps void @load_divergent_P3_i32(ptr addrspace(3) inreg %ptra, ptr 
addrspace(3) %out) { ; GFX7-LABEL: load_divergent_P3_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_divergent_P3_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: ds_load_b32 v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_store_b32 v0, v1 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_divergent_P3_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: ds_load_b32 v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ds_store_b32 v0, v1 ; GFX12-NEXT: s_endpgm %a = load i32, ptr addrspace(3) %ptra store i32 %a, ptr addrspace(3) %out ret void } define amdgpu_ps void @load_divergent_P3_v2i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { ; GFX7-LABEL: load_divergent_P3_v2i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b64 v[1:2], v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b64 v0, v[1:2] ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_divergent_P3_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: ds_load_b64 v[1:2], v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_store_b64 v0, v[1:2] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_divergent_P3_v2i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: ds_load_b64 v[1:2], v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ds_store_b64 v0, v[1:2] ; GFX12-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(3) %ptra store <2 x i32> %a, ptr addrspace(3) %out ret void } define amdgpu_ps void @load_divergent_P3_v3i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { ; GFX7-LABEL: load_divergent_P3_v3i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b96 v[1:3], v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b96 v0, 
v[1:3] ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_divergent_P3_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: ds_load_b96 v[1:3], v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_store_b96 v0, v[1:3] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_divergent_P3_v3i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: ds_load_b96 v[1:3], v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ds_store_b96 v0, v[1:3] ; GFX12-NEXT: s_endpgm %a = load <3 x i32>, ptr addrspace(3) %ptra store <3 x i32> %a, ptr addrspace(3) %out ret void } define amdgpu_ps void @load_divergent_P3_v4i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { ; GFX7-LABEL: load_divergent_P3_v4i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b128 v[1:4], v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b128 v0, v[1:4] ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_divergent_P3_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: ds_load_b128 v[1:4], v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_store_b128 v0, v[1:4] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_divergent_P3_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: ds_load_b128 v[1:4], v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: ds_store_b128 v0, v[1:4] ; GFX12-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(3) %ptra store <4 x i32> %a, ptr addrspace(3) %out ret void } ; constant address space, addrspace(4) ; The "not uniform load mmo" check for G_LOAD is for the case where the MMO somehow ends ; up with an address space other than 4. There are no tests for it in LLVM IR. 
; %b in tests will end up as uniform load in sgpr ; gfx12 true 16, not natural alignment define amdgpu_ps void @load_uniform_P4_i16_b16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_i16_b16_gfx12: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, s2 ; GFX7-NEXT: s_mov_b32 s5, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 ; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: s_add_i32 s0, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx12: ; GFX11-True16: ; %bb.0: ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-True16-NEXT: s_clause 0x1 ; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc ; GFX11-True16-NEXT: s_waitcnt vmcnt(1) ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-True16-NEXT: s_endpgm ; ; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12: ; GFX11-NoTrue16: ; %bb.0: ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NoTrue16-NEXT: s_clause 0x1 ; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1) ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 ; GFX11-NoTrue16-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NoTrue16-NEXT: s_endpgm ; ; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx12: ; GFX12-True16: ; %bb.0: ; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] ; GFX12-True16-NEXT: s_load_u16 s0, s[2:3], 0x0 ; GFX12-True16-NEXT: s_wait_loadcnt 0x0 ; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 ; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0 ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-True16-NEXT: s_endpgm ; ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12: ; GFX12-NoTrue16: ; %bb.0: ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[2:3], 0x0 ; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0 ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NoTrue16-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %ptra, align 1 %b = load volatile i16, ptr addrspace(4) %ptrb %sum = add i16 %a, %b store i16 %sum, ptr addrspace(1) %out ret void } ; gfx11 true16, 16-bit load, not align 4 define amdgpu_ps void @load_uniform_P4_i16_b16_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_i16_b16_gfx11: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 ; 
GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s1, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx11: ; GFX11-True16: ; %bb.0: ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] ; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-True16-NEXT: s_add_i32 s0, s1, s0 ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-True16-NEXT: s_endpgm ; ; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11: ; GFX11-NoTrue16: ; %bb.0: ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] ; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0 ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NoTrue16-NEXT: s_endpgm ; ; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx11: ; GFX12-True16: ; %bb.0: ; GFX12-True16-NEXT: s_clause 0x1 ; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 ; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off ; 
GFX12-True16-NEXT: s_endpgm ; ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11: ; GFX12-NoTrue16: ; %bb.0: ; GFX12-NoTrue16-NEXT: s_clause 0x1 ; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NoTrue16-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %ptra %b = load volatile i16, ptr addrspace(4) %ptra, align 4 %sum = add i16 %a, %b store i16 %sum, ptr addrspace(1) %out ret void } ; gfx12 without true16, 16-bit any-extending load, not natural alignment define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx12: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 ; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: s_add_i32 s0, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12: ; GFX11-True16: ; %bb.0: ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-True16-NEXT: s_clause 0x1 ; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc ; GFX11-True16-NEXT: s_waitcnt vmcnt(1) ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-True16-NEXT: s_add_i32 
s0, s0, s1 ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-True16-NEXT: s_endpgm ; ; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12: ; GFX11-NoTrue16: ; %bb.0: ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NoTrue16-NEXT: s_clause 0x1 ; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1) ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NoTrue16-NEXT: s_endpgm ; ; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12: ; GFX12-True16: ; %bb.0: ; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 ; GFX12-True16-NEXT: s_wait_loadcnt 0x0 ; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 ; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0 ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-True16-NEXT: s_endpgm ; ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12: ; GFX12-NoTrue16: ; %bb.0: ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 ; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0 ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 
v2, s0 ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NoTrue16-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %ptra, align 1 %b = load volatile i16, ptr addrspace(4) %ptra %sum = add i16 %a, %b store i16 %sum, ptr addrspace(1) %out ret void } ; gfx11 (or older) without true16, s16 any-extending load, not align 4 define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx11: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s1, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11: ; GFX11-True16: ; %bb.0: ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] ; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-True16-NEXT: s_add_i32 s0, s1, s0 ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-True16-NEXT: s_endpgm ; ; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11: ; GFX11-NoTrue16: ; %bb.0: ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] ; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0 ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NoTrue16-NEXT: s_endpgm ; ; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11: ; GFX12-True16: ; %bb.0: ; GFX12-True16-NEXT: s_clause 0x1 ; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 ; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-True16-NEXT: s_endpgm ; ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11: ; GFX12-NoTrue16: ; %bb.0: ; GFX12-NoTrue16-NEXT: s_clause 0x1 ; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NoTrue16-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %ptra %b = load volatile i16, ptr addrspace(4) %ptra, align 4 %sum = add i16 %a, %b store i16 %sum, ptr addrspace(1) %out ret void } ; any target, 32-bit load, not align 4 define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s1, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 
0 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_uniform_P4_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_b32 v2, v2, s[0:1] ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s1, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_uniform_P4_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_load_b32 v2, v2, s[0:1] ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s1, v2 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s0, s1, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_endpgm %a = load i32, ptr addrspace(4) %ptra, align 2 %b = load volatile i32, ptr addrspace(4) %ptra %sum = add i32 %a, %b store i32 %sum, ptr addrspace(1) %out ret void } ; any target, 64-bit load, not align 4 define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_v2i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s5, v3 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s1, s5, s1 ; GFX7-NEXT: s_add_i32 s0, s4, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 
addr64 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_uniform_P4_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s2, s0 ; GFX11-NEXT: s_add_i32 s1, s3, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_uniform_P4_v2i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1] ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v2 ; GFX12-NEXT: v_readfirstlane_b32 s3, v3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s0, s2, s0 ; GFX12-NEXT: s_add_co_i32 s1, s3, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(4) %ptra, align 2 %b = load volatile <2 x i32>, ptr addrspace(4) %ptra %sum = add <2 x i32> %a, %b store <2 x i32> %sum, ptr addrspace(1) %out ret void } ; any target, 96-bit load, not align 4 (the plain load is align 2, the volatile load reads the same ptra, ptrb is unused) define amdgpu_ps void @load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_v3i32_gfx12: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
v_readfirstlane_b32 s7, v4 ; GFX7-NEXT: s_add_i32 s4, s0, s4 ; GFX7-NEXT: s_add_i32 s5, s1, s5 ; GFX7-NEXT: s_add_i32 s6, s7, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_uniform_P4_v3i32_gfx12: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_b96 v[2:4], v2, s[0:1] ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s5, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s3, v2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v3 ; GFX11-NEXT: s_add_i32 s2, s5, s2 ; GFX11-NEXT: s_add_i32 s0, s3, s0 ; GFX11-NEXT: s_add_i32 s1, s4, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_uniform_P4_v3i32_gfx12: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_load_b96 v[2:4], v2, s[0:1] ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s5, s2 ; GFX12-NEXT: s_add_co_i32 s0, s3, s0 ; GFX12-NEXT: s_add_co_i32 s1, s4, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX12-NEXT: s_endpgm %a = load <3 x i32>, ptr addrspace(4) %ptra, align 2 %b = load volatile <3 x i32>, ptr addrspace(4) %ptra %sum = add <3 x i32> %a, %b store <3 x i32> %sum, ptr addrspace(1) %out ret void } ; any target, 128-bit load, not 
align 4 define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_v4i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: v_readfirstlane_b32 s8, v4 ; GFX7-NEXT: v_readfirstlane_b32 s9, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s4, s0, s4 ; GFX7-NEXT: s_add_i32 s5, s1, s5 ; GFX7-NEXT: s_add_i32 s6, s8, s6 ; GFX7-NEXT: s_add_i32 s7, s9, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_uniform_P4_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_b128 v[2:5], v2, s[0:1] ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s7, v5 ; GFX11-NEXT: v_readfirstlane_b32 s4, v2 ; GFX11-NEXT: v_readfirstlane_b32 s5, v3 ; GFX11-NEXT: v_readfirstlane_b32 s6, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s3, s7, s3 ; GFX11-NEXT: s_add_i32 s0, s4, s0 ; GFX11-NEXT: s_add_i32 s1, s5, s1 ; GFX11-NEXT: s_add_i32 s2, s6, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_uniform_P4_v4i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_load_b128 v[2:5], v2, s[0:1] ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 
0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s7, v5 ; GFX12-NEXT: v_readfirstlane_b32 s4, v2 ; GFX12-NEXT: v_readfirstlane_b32 s5, v3 ; GFX12-NEXT: v_readfirstlane_b32 s6, v4 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s3, s7, s3 ; GFX12-NEXT: s_add_co_i32 s0, s4, s0 ; GFX12-NEXT: s_add_co_i32 s1, s5, s1 ; GFX12-NEXT: s_add_co_i32 s2, s6, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(4) %ptra, align 2 %b = load volatile <4 x i32>, ptr addrspace(4) %ptra %sum = add <4 x i32> %a, %b store <4 x i32> %sum, ptr addrspace(1) %out ret void } ; any target, 256-bit load, not align 4 (the plain load is align 2, the volatile load reads the same ptra, ptrb is unused) define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_v8i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_readfirstlane_b32 s12, v2 ; GFX7-NEXT: v_readfirstlane_b32 s13, v3 ; GFX7-NEXT: v_readfirstlane_b32 s14, v4 ; GFX7-NEXT: v_readfirstlane_b32 s15, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s16, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s4, s12, s4 ; GFX7-NEXT: v_readfirstlane_b32 s17, v7 ; GFX7-NEXT: v_readfirstlane_b32 s18, v8 ; GFX7-NEXT: v_readfirstlane_b32 s19, v9 ; GFX7-NEXT: s_add_i32 s5, s13, s5 ; GFX7-NEXT: s_add_i32 s6, s14, s6 ; GFX7-NEXT: s_add_i32 s7, s15, s7 ; GFX7-NEXT: s_add_i32 s8, s16, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: 
s_add_i32 s9, s17, s9 ; GFX7-NEXT: s_add_i32 s10, s18, s10 ; GFX7-NEXT: s_add_i32 s11, s19, s11 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: v_mov_b32_e32 v7, s9 ; GFX7-NEXT: v_mov_b32_e32 v8, s10 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_uniform_P4_v8i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1] ; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_readfirstlane_b32 s11, v5 ; GFX11-NEXT: v_readfirstlane_b32 s8, v2 ; GFX11-NEXT: v_readfirstlane_b32 s9, v3 ; GFX11-NEXT: v_readfirstlane_b32 s10, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s15, v9 ; GFX11-NEXT: v_readfirstlane_b32 s12, v6 ; GFX11-NEXT: v_readfirstlane_b32 s13, v7 ; GFX11-NEXT: v_readfirstlane_b32 s14, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s3, s11, s3 ; GFX11-NEXT: s_add_i32 s0, s8, s0 ; GFX11-NEXT: s_add_i32 s1, s9, s1 ; GFX11-NEXT: s_add_i32 s2, s10, s2 ; GFX11-NEXT: s_add_i32 s7, s15, s7 ; GFX11-NEXT: s_add_i32 s4, s12, s4 ; GFX11-NEXT: s_add_i32 s5, s13, s5 ; GFX11-NEXT: s_add_i32 s6, s14, s6 ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_uniform_P4_v8i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v6, 0 ; 
GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1] ; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x1 ; GFX12-NEXT: v_readfirstlane_b32 s11, v5 ; GFX12-NEXT: v_readfirstlane_b32 s8, v2 ; GFX12-NEXT: v_readfirstlane_b32 s9, v3 ; GFX12-NEXT: v_readfirstlane_b32 s10, v4 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s15, v9 ; GFX12-NEXT: v_readfirstlane_b32 s12, v6 ; GFX12-NEXT: v_readfirstlane_b32 s13, v7 ; GFX12-NEXT: v_readfirstlane_b32 s14, v8 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s3, s11, s3 ; GFX12-NEXT: s_add_co_i32 s0, s8, s0 ; GFX12-NEXT: s_add_co_i32 s1, s9, s1 ; GFX12-NEXT: s_add_co_i32 s2, s10, s2 ; GFX12-NEXT: s_add_co_i32 s7, s15, s7 ; GFX12-NEXT: s_add_co_i32 s4, s12, s4 ; GFX12-NEXT: s_add_co_i32 s5, s13, s5 ; GFX12-NEXT: s_add_co_i32 s6, s14, s6 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 ; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX12-NEXT: s_endpgm %a = load <8 x i32>, ptr addrspace(4) %ptra, align 2 %b = load volatile <8 x i32>, ptr addrspace(4) %ptra %sum = add <8 x i32> %a, %b store <8 x i32> %sum, ptr addrspace(1) %out ret void } ; any target, 512-bit load, not align 4 (the plain load is align 2, the volatile load reads the same ptra, ptrb is unused) define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { ; GFX7-LABEL: load_uniform_P4_v16i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 
offset:32 ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; GFX7-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_readfirstlane_b32 s20, v2 ; GFX7-NEXT: v_readfirstlane_b32 s21, v3 ; GFX7-NEXT: v_readfirstlane_b32 s22, v4 ; GFX7-NEXT: v_readfirstlane_b32 s23, v5 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_readfirstlane_b32 s24, v6 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_readfirstlane_b32 s28, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s33, v14 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s4, s20, s4 ; GFX7-NEXT: v_readfirstlane_b32 s25, v7 ; GFX7-NEXT: v_readfirstlane_b32 s26, v8 ; GFX7-NEXT: v_readfirstlane_b32 s27, v9 ; GFX7-NEXT: v_readfirstlane_b32 s29, v11 ; GFX7-NEXT: v_readfirstlane_b32 s30, v12 ; GFX7-NEXT: v_readfirstlane_b32 s31, v13 ; GFX7-NEXT: v_readfirstlane_b32 s34, v15 ; GFX7-NEXT: v_readfirstlane_b32 s35, v16 ; GFX7-NEXT: v_readfirstlane_b32 s36, v17 ; GFX7-NEXT: s_add_i32 s5, s21, s5 ; GFX7-NEXT: s_add_i32 s6, s22, s6 ; GFX7-NEXT: s_add_i32 s7, s23, s7 ; GFX7-NEXT: s_add_i32 s8, s24, s8 ; GFX7-NEXT: s_add_i32 s12, s28, s12 ; GFX7-NEXT: s_add_i32 s16, s33, s16 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s25, s9 ; GFX7-NEXT: s_add_i32 s10, s26, s10 ; GFX7-NEXT: s_add_i32 s11, s27, s11 ; GFX7-NEXT: s_add_i32 s13, s29, s13 ; GFX7-NEXT: s_add_i32 s14, s30, s14 ; GFX7-NEXT: s_add_i32 s15, s31, s15 ; GFX7-NEXT: s_add_i32 s17, s34, s17 ; GFX7-NEXT: s_add_i32 s18, s35, s18 ; GFX7-NEXT: s_add_i32 s19, s36, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: v_mov_b32_e32 v10, s12 ; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v7, s9 ; GFX7-NEXT: v_mov_b32_e32 v8, s10 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-NEXT: v_mov_b32_e32 v11, s13 ; GFX7-NEXT: 
v_mov_b32_e32 v12, s14 ; GFX7-NEXT: v_mov_b32_e32 v13, s15 ; GFX7-NEXT: v_mov_b32_e32 v15, s17 ; GFX7-NEXT: v_mov_b32_e32 v16, s18 ; GFX7-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48 ; GFX7-NEXT: s_endpgm ; ; GFX11-LABEL: load_uniform_P4_v16i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v14, 0 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1] ; GFX11-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 ; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32 ; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 ; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_readfirstlane_b32 s19, v5 ; GFX11-NEXT: v_readfirstlane_b32 s16, v2 ; GFX11-NEXT: v_readfirstlane_b32 s17, v3 ; GFX11-NEXT: v_readfirstlane_b32 s18, v4 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_readfirstlane_b32 s23, v9 ; GFX11-NEXT: v_readfirstlane_b32 s20, v6 ; GFX11-NEXT: v_readfirstlane_b32 s21, v7 ; GFX11-NEXT: v_readfirstlane_b32 s22, v8 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_readfirstlane_b32 s27, v13 ; GFX11-NEXT: v_readfirstlane_b32 s24, v10 ; GFX11-NEXT: v_readfirstlane_b32 s25, v11 ; GFX11-NEXT: v_readfirstlane_b32 s26, v12 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s31, v17 ; GFX11-NEXT: v_readfirstlane_b32 s28, v14 ; GFX11-NEXT: v_readfirstlane_b32 s29, v15 ; GFX11-NEXT: v_readfirstlane_b32 s30, v16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s3, s19, s3 ; GFX11-NEXT: s_add_i32 s0, s16, s0 ; GFX11-NEXT: s_add_i32 s1, s17, s1 ; GFX11-NEXT: s_add_i32 s2, s18, s2 ; GFX11-NEXT: s_add_i32 s7, s23, s7 ; GFX11-NEXT: s_add_i32 s4, s20, s4 ; GFX11-NEXT: s_add_i32 s5, s21, s5 ; 
GFX11-NEXT: s_add_i32 s6, s22, s6 ; GFX11-NEXT: s_add_i32 s11, s27, s11 ; GFX11-NEXT: v_mov_b32_e32 v5, s3 ; GFX11-NEXT: s_add_i32 s8, s24, s8 ; GFX11-NEXT: s_add_i32 s9, s25, s9 ; GFX11-NEXT: s_add_i32 s10, s26, s10 ; GFX11-NEXT: s_add_i32 s15, s31, s15 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 ; GFX11-NEXT: s_add_i32 s12, s28, s12 ; GFX11-NEXT: s_add_i32 s13, s29, s13 ; GFX11-NEXT: s_add_i32 s14, s30, s14 ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 ; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 ; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 ; GFX11-NEXT: v_mov_b32_e32 v14, s12 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 ; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: load_uniform_P4_v16i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v14, 0 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1] ; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 ; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32 ; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 ; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x3 ; GFX12-NEXT: v_readfirstlane_b32 s19, v5 ; GFX12-NEXT: v_readfirstlane_b32 s16, v2 ; GFX12-NEXT: v_readfirstlane_b32 s17, v3 ; GFX12-NEXT: v_readfirstlane_b32 s18, v4 ; GFX12-NEXT: s_wait_loadcnt 0x2 ; GFX12-NEXT: v_readfirstlane_b32 s23, v9 ; GFX12-NEXT: v_readfirstlane_b32 s20, v6 ; GFX12-NEXT: v_readfirstlane_b32 s21, v7 ; GFX12-NEXT: v_readfirstlane_b32 s22, v8 ; GFX12-NEXT: s_wait_loadcnt 0x1 ; GFX12-NEXT: 
v_readfirstlane_b32 s27, v13 ; GFX12-NEXT: v_readfirstlane_b32 s24, v10 ; GFX12-NEXT: v_readfirstlane_b32 s25, v11 ; GFX12-NEXT: v_readfirstlane_b32 s26, v12 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s31, v17 ; GFX12-NEXT: v_readfirstlane_b32 s28, v14 ; GFX12-NEXT: v_readfirstlane_b32 s29, v15 ; GFX12-NEXT: v_readfirstlane_b32 s30, v16 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s3, s19, s3 ; GFX12-NEXT: s_add_co_i32 s0, s16, s0 ; GFX12-NEXT: s_add_co_i32 s1, s17, s1 ; GFX12-NEXT: s_add_co_i32 s2, s18, s2 ; GFX12-NEXT: s_add_co_i32 s7, s23, s7 ; GFX12-NEXT: s_add_co_i32 s4, s20, s4 ; GFX12-NEXT: s_add_co_i32 s5, s21, s5 ; GFX12-NEXT: s_add_co_i32 s6, s22, s6 ; GFX12-NEXT: s_add_co_i32 s11, s27, s11 ; GFX12-NEXT: v_mov_b32_e32 v5, s3 ; GFX12-NEXT: s_add_co_i32 s8, s24, s8 ; GFX12-NEXT: s_add_co_i32 s9, s25, s9 ; GFX12-NEXT: s_add_co_i32 s10, s26, s10 ; GFX12-NEXT: s_add_co_i32 s15, s31, s15 ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 ; GFX12-NEXT: s_add_co_i32 s12, s28, s12 ; GFX12-NEXT: s_add_co_i32 s13, s29, s13 ; GFX12-NEXT: s_add_co_i32 s14, s30, s14 ; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 ; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 ; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 ; GFX12-NEXT: v_mov_b32_e32 v14, s12 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 ; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX12-NEXT: s_endpgm %a = load <16 x i32>, ptr addrspace(4) %ptra, align 2 %b = load volatile <16 x i32>, ptr addrspace(4) %ptra %sum = add <16 x i32> %a, %b store <16 x i32> %sum, ptr 
addrspace(1) %out ret void }