; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s

; Instruction-selection tests for the llvm.amdgcn.global.load.tr* and
; llvm.amdgcn.ds.load.tr* intrinsics on gfx1250 with wavefrontsize32 enabled.
;
; The file is compiled twice (see the RUN lines above): once with
; -global-isel=0 and once with -global-isel=1. Output that is identical for
; both runs is checked under the shared GFX1250 prefix; output that differs is
; checked under the GFX1250-SDAG and GFX1250-GISEL prefixes respectively.
;
; Every test has the same shape:
;   * %gep advances %addr by 4 x i64 (32 bytes); the checks expect that
;     constant to appear as the instruction's immediate "offset:32";
;   * the loaded vector is stored to %use, so the checks also cover the wait
;     instruction emitted between the load and the store.

declare <2 x i32>    @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1))
declare <2 x i32>    @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
declare <3 x i32>    @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1))
declare <8 x i16>    @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
declare <8 x half>   @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1))
declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1))

declare <2 x i32>    @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32>    @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3))
declare <3 x i32>    @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3))
declare <8 x i16>    @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3))
declare <8 x half>   @llvm.amdgcn.ds.load.tr16.b128.v8f16.p3(ptr addrspace(3))
declare <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3))

; ---------------------------------------------------------------------------
; Global loads (ptr addrspace(1)).
;
; *_vaddr: %addr is an ordinary argument; the expected load takes the address
;          from a VGPR pair and uses "off" in place of a scalar base.
; *_saddr: %addr is marked inreg; the expected load takes the base from
;          s[0:1] and uses a VGPR that was set to 0 as the other operand.
; ---------------------------------------------------------------------------

define amdgpu_ps void @global_load_tr4_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr4_b64_vaddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    global_load_tr4_b64 v[0:1], v[0:1], off offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1) %gep)
  store <2 x i32> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr4_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr4_b64_saddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
; GFX1250-NEXT:    global_load_tr4_b64 v[2:3], v2, s[0:1] offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1) %gep)
  store <2 x i32> %val, ptr addrspace(1) %use
  ret void
}

; The tr8 tests call llvm.amdgcn.global.load.tr.b64 (the intrinsic name carries
; no element width); the expected instruction is global_load_tr8_b64.

define amdgpu_ps void @global_load_tr8_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr8_b64_vaddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    global_load_tr8_b64 v[0:1], v[0:1], off offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
  store <2 x i32> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr8_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr8_b64_saddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
; GFX1250-NEXT:    global_load_tr8_b64 v[2:3], v2, s[0:1] offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
  store <2 x i32> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr6_b96_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr6_b96_vaddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    global_load_tr6_b96 v[4:6], v[0:1], off offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b96 v[2:3], v[4:6], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
  store <3 x i32> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr6_b96_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr6_b96_saddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
; GFX1250-NEXT:    global_load_tr6_b96 v[2:4], v2, s[0:1] offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
  store <3 x i32> %val, ptr addrspace(1) %use
  ret void
}

; The tr16 tests call llvm.amdgcn.global.load.tr.b128 with three different
; result types (<8 x i16>, <8 x half>, <8 x bfloat>); the expected instruction
; is global_load_tr16_b128 in every case.

define amdgpu_ps void @global_load_tr16_b128_v8i16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8i16_vaddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    global_load_tr16_b128 v[4:7], v[0:1], off offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
  store <8 x i16> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr16_b128_v8i16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8i16_saddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
; GFX1250-NEXT:    global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
  store <8 x i16> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr16_b128_v8f16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8f16_vaddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    global_load_tr16_b128 v[4:7], v[0:1], off offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
  store <8 x half> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr16_b128_v8f16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8f16_saddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
; GFX1250-NEXT:    global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
  store <8 x half> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr16_b128_v8bf16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8bf16_vaddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    global_load_tr16_b128 v[4:7], v[0:1], off offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep)
  store <8 x bfloat> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @global_load_tr16_b128_v8bf16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8bf16_saddr:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
; GFX1250-NEXT:    global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep)
  store <8 x bfloat> %val, ptr addrspace(1) %use
  ret void
}

; ---------------------------------------------------------------------------
; DS loads (ptr addrspace(3)).
;
; %addr arrives in v0 and %use in v1/v2. The load result is expected in a
; register range starting at v0, so the checks first expect %use to be copied
; out of the way. The two runs pick different destination registers for that
; copy, which is why most of these tests have separate SDAG and GISEL checks.
; The wait before the store is s_wait_dscnt rather than s_wait_loadcnt.
; ---------------------------------------------------------------------------

define amdgpu_ps void @ds_load_tr4_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr4_b64:
; GFX1250-SDAG:       ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT:    ds_load_tr4_b64 v[0:1], v0 offset:32
; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
; GFX1250-SDAG-NEXT:    s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr4_b64:
; GFX1250-GISEL:       ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT:    ds_load_tr4_b64 v[0:1], v0 offset:32
; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT:    global_store_b64 v[4:5], v[0:1], off
; GFX1250-GISEL-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
  %val = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
  store <2 x i32> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr8_b64:
; GFX1250-SDAG:       ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT:    ds_load_tr8_b64 v[0:1], v0 offset:32
; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
; GFX1250-SDAG-NEXT:    s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr8_b64:
; GFX1250-GISEL:       ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT:    ds_load_tr8_b64 v[0:1], v0 offset:32
; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT:    global_store_b64 v[4:5], v[0:1], off
; GFX1250-GISEL-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
  %val = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
  store <2 x i32> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr6_b96:
; GFX1250-SDAG:       ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX1250-SDAG-NEXT:    ds_load_tr6_b96 v[0:2], v0 offset:32
; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT:    global_store_b96 v[4:5], v[0:2], off
; GFX1250-SDAG-NEXT:    s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr6_b96:
; GFX1250-GISEL:       ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT:    ds_load_tr6_b96 v[0:2], v0 offset:32
; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT:    global_store_b96 v[4:5], v[0:2], off
; GFX1250-GISEL-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
  %val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
  store <3 x i32> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8i16:
; GFX1250-SDAG:       ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX1250-SDAG-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT:    global_store_b128 v[4:5], v[0:3], off
; GFX1250-SDAG-NEXT:    s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8i16:
; GFX1250-GISEL:       ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT:    global_store_b128 v[4:5], v[0:3], off
; GFX1250-GISEL-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
  %val = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3) %gep)
  store <8 x i16> %val, ptr addrspace(1) %use
  ret void
}

define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8f16:
; GFX1250-SDAG:       ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX1250-SDAG-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT:    global_store_b128 v[4:5], v[0:3], off
; GFX1250-SDAG-NEXT:    s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8f16:
; GFX1250-GISEL:       ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT:    global_store_b128 v[4:5], v[0:3], off
; GFX1250-GISEL-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
  %val = call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16.p3(ptr addrspace(3) %gep)
  store <8 x half> %val, ptr addrspace(1) %use
  ret void
}

; Unlike the other DS tests, this one is checked under the shared GFX1250
; prefix, i.e. both runs are expected to emit the same code, and that code
; matches the SDAG form of the tests above.
; NOTE(review): presumably the -global-isel=1 run falls back to SelectionDAG
; for bfloat because of -global-isel-abort=2 - confirm before relying on it.

define amdgpu_ps void @ds_load_tr16_b128_v8bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: ds_load_tr16_b128_v8bf16:
; GFX1250:       ; %bb.0: ; %entry
; GFX1250-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX1250-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-NEXT:    s_wait_dscnt 0x0
; GFX1250-NEXT:    global_store_b128 v[4:5], v[0:3], off
; GFX1250-NEXT:    s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
  %val = call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3) %gep)
  store <8 x bfloat> %val, ptr addrspace(1) %use
  ret void
}