; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL

; Each WMMA intrinsic below is exercised with three constant accumulator (C)
; operands:
;   * plain test       - the checks show C encoded as the inline operand 1.0
;   * *_non_splat      - the checks show C materialized in registers with one
;                        element differing from the rest
;   * *_non_inlineable - the checks show C materialized in registers from a
;                        literal (0x40400000 / 0x3fc03fc0)
;
; NOTE(review): the constant C operands in the calls were reconstructed from
; the register values in the CHECK lines (packed bf16 values assume
; little-endian element order within a 32-bit register) - confirm, then
; regenerate the assertions with update_llc_test_checks.py.

; v_wmma_f32_16x16x4_f32

define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x4_f32:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GISEL-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v6, 1.0 :: v_dual_mov_b32 v8, 2.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v6
; GFX1250-NEXT:    v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v6
; GFX1250-NEXT:    v_dual_mov_b32 v12, v6 :: v_dual_mov_b32 v13, v6
; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 1.0
; GISEL-NEXT:    s_mov_b32 s2, 2.0
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GISEL-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX1250-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX1250-NEXT:    v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX1250-NEXT:    v_mov_b32_e32 v13, v6
; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1250-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s2, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GISEL-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; v_wmma_f32_16x16x32_bf16

define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_bf16:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT:    v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT:    v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GISEL-NEXT:    v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GISEL-NEXT:    v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT:    v_mov_b32_e32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GISEL-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GISEL-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GISEL-NEXT:    v_mov_b32_e32 v25, v18
; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; v_wmma_bf16_16x16x32_bf16

define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
  store <8 x bfloat> %res, ptr addrspace(1) %out
  ret void
}

; NOTE(review): element 2 of C is inferred as 0.0 from the 0x3f803f80 / 1.0
; (0x3f800000) register pair in the checks below - confirm against the
; intended operand.
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT:    v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18
; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 0.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
  store <8 x bfloat> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x3fc03fc0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT:    v_mov_b32_e32 v21, v18
; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_mov_b32_e32 v18, 0x3fc03fc0
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GISEL-NEXT:    v_mov_b32_e32 v21, v18
; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5>, i1 false, i1 false)
  store <8 x bfloat> %res, ptr addrspace(1) %out
  ret void
}

; v_wmma_bf16f32_16x16x32_bf16
; The %C argument of these three functions is not referenced by their bodies;
; the accumulator passed to the intrinsic is a constant.

define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x bfloat> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT:    v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT:    v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    global_store_b128 v[16:17], v[26:29], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GISEL-NEXT:    v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GISEL-NEXT:    v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    global_store_b128 v[16:17], v[26:29], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x bfloat> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_inlinable(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT:    v_mov_b32_e32 v25, v18
; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    global_store_b128 v[16:17], v[26:29], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GISEL-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GISEL-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GISEL-NEXT:    v_mov_b32_e32 v25, v18
; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    global_store_b128 v[16:17], v[26:29], off
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
  store <8 x bfloat> %res, ptr addrspace(1) %out
  ret void
}

; v_wmma_f32_16x16x64_fp8_fp8

define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT:    v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT:    v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 1.0
; GISEL-NEXT:    s_mov_b32 s2, 2.0
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT:    v_mov_b32_e32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s2, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; v_wmma_f32_16x16x64_fp8_bf8

define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT:    v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT:    v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 1.0
; GISEL-NEXT:    s_mov_b32 s2, 2.0
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT:    v_mov_b32_e32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s2, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; v_wmma_f32_16x16x64_bf8_fp8

define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT:    v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT:    v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 1.0
; GISEL-NEXT:    s_mov_b32 s2, 2.0
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
; GFX1250-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
; GFX1250-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
; GFX1250-NEXT:    v_mov_b32_e32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s2, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; v_wmma_f32_16x16x64_bf8_bf8

define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT:    v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
; GFX1250-NEXT:    v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 1.0
; GISEL-NEXT:    s_mov_b32 s2, 2.0
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable:
GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 ; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 ; GFX1250-NEXT: v_mov_b32_e32 v25, v18 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0 ; 
GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) 
%out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 
false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] 
neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: 
v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; 
GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 
s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, 
ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> , i1 false, i1 false) store <8 x i32> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_dual_mov_b32 v18, 1 :: v_dual_mov_b32 v20, 2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 ; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 ; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 ; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 1 ; GISEL-NEXT: s_mov_b32 s2, 2 ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: 
v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> , i1 false, i1 false) store <8 x i32> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x80 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 ; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 ; GFX1250-NEXT: v_mov_b32_e32 v25, v18 ; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_movk_i32 s0, 0x80 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], 
s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> , i1 false, i1 false) store <8 x i32> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 ; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 ; 
GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 ; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 ; GFX1250-NEXT: v_mov_b32_e32 v25, v18 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] 
; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr 
addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 ; GFX1250-NEXT: v_mov_b32_e32 v21, v18 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: 
s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: 
v_dual_mov_b32 v35, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 ; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 ; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: 
v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 ; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 ; GFX1250-NEXT: v_mov_b32_e32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: 
v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_mov_b32_e32 v37, v34 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 ; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_mov_b32_e32 v37, v34 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void 
@test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_mov_b32_e32 v37, v34 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_mov_b32_e32 v37, v34 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[32:33], 
v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: 
v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_mov_b32_e32 v37, v34 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_mov_b32_e32 v37, v34 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: 
v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_mov_b32_e32 v37, v34 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; 
GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_mov_b32_e32 v37, v34 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) store <8 x half> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void 
@test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 ; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 ; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; 
GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 ; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 ; GFX1250-NEXT: v_mov_b32_e32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; 
GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 ; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 ; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 ; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 ; 
GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 ; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 ; GFX1250-NEXT: v_mov_b32_e32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: 
test_wmma_f32_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 ; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 ; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] ; GISEL-NEXT: 
s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 ; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 ; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 ; GFX1250-NEXT: v_mov_b32_e32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 ; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 ; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], 
off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 ; GISEL-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) store <8 x float> %res, ptr addrspace(1) %out ret void } define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 ; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 ; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], 
v[38:41], off offset:16
; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 1.0
; GISEL-NEXT:    s_mov_b32 s2, 2.0
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; The accumulator operand is a splat of 3.0 (0x40400000). The checks below
; expect that value to be copied into eight registers and passed to the WMMA
; as v[34:41] rather than being encoded as an immediate operand.
; NOTE(review): the constant vector in the call was restored from the register
; values shown in the checks; rerun utils/update_llc_test_checks.py to confirm.
define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT:    v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT:    v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT:    v_mov_b32_e32 v41, v34
; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s2, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; The accumulator operand is a splat of 1.0. The checks below expect it to
; appear directly as the immediate operand 1.0 of the WMMA instruction, with no
; register setup beforehand.
; NOTE(review): the constant vector in the call was restored from the operand
; shown in the checks; rerun utils/update_llc_test_checks.py to confirm.
define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_32x16x128_f4:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0
; GFX1250-NEXT:    s_clause 0x3
; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_32x16x128_f4:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0
; GISEL-NEXT:    s_clause 0x3
; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <16 x float> %res, ptr addrspace(1) %out
  ret void
}

; The accumulator operand is not a splat: elements 2 and 10 are 2.0 and every
; other element is 1.0. The checks below expect all sixteen elements to be
; written to registers and passed to the WMMA as v[26:41].
; NOTE(review): the constant vector in the call was restored from the register
; values shown in the checks; rerun utils/update_llc_test_checks.py to confirm.
define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_splat:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v28, 2.0
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v29, v26
; GFX1250-NEXT:    v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
; GFX1250-NEXT:    v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
; GFX1250-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
; GFX1250-NEXT:    v_dual_mov_b32 v36, v28 :: v_dual_mov_b32 v37, v26
; GFX1250-NEXT:    v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
; GFX1250-NEXT:    v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT:    v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
; GFX1250-NEXT:    s_clause 0x3
; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_splat:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 1.0
; GISEL-NEXT:    s_mov_b32 s2, 2.0
; GISEL-NEXT:    s_mov_b32 s14, s0
; GISEL-NEXT:    s_mov_b32 s15, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s8, s0
; GISEL-NEXT:    s_mov_b32 s9, s0
; GISEL-NEXT:    s_mov_b32 s10, s2
; GISEL-NEXT:    s_mov_b32 s11, s0
; GISEL-NEXT:    s_mov_b32 s12, s0
; GISEL-NEXT:    s_mov_b32 s13, s0
; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
; GISEL-NEXT:    s_clause 0x3
; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <16 x float> %res, ptr addrspace(1) %out
  ret void
}

; The accumulator operand is a splat of 3.0 (0x40400000). The checks below
; expect that value to be copied into sixteen registers and passed to the WMMA
; as v[26:41] rather than being encoded as an immediate operand.
; NOTE(review): the constant vector in the call was restored from the register
; values shown in the checks; rerun utils/update_llc_test_checks.py to confirm.
define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT:    v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT:    v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
; GFX1250-NEXT:    v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
; GFX1250-NEXT:    v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
; GFX1250-NEXT:    v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
; GFX1250-NEXT:    v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT:    v_mov_b32_e32 v41, v26
; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT:    v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
; GFX1250-NEXT:    s_clause 0x3
; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
; GFX1250-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT:    s_mov_b32 s14, s0
; GISEL-NEXT:    s_mov_b32 s15, s0
; GISEL-NEXT:    s_mov_b32 s1, s0
; GISEL-NEXT:    s_mov_b32 s2, s0
; GISEL-NEXT:    s_mov_b32 s3, s0
; GISEL-NEXT:    s_mov_b32 s4, s0
; GISEL-NEXT:    s_mov_b32 s5, s0
; GISEL-NEXT:    s_mov_b32 s6, s0
; GISEL-NEXT:    s_mov_b32 s7, s0
; GISEL-NEXT:    s_mov_b32 s8, s0
; GISEL-NEXT:    s_mov_b32 s9, s0
; GISEL-NEXT:    s_mov_b32 s10, s0
; GISEL-NEXT:    s_mov_b32 s11, s0
; GISEL-NEXT:    s_mov_b32 s12, s0
; GISEL-NEXT:    s_mov_b32 s13, s0
; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[6:7]
; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[4:5]
; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[0:1]
; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GISEL-NEXT:    v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
; GISEL-NEXT:    s_clause 0x3
; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
; GISEL-NEXT:    global_store_b128 v[24:25], v[30:33], off offset:16
; GISEL-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
; GISEL-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
; GISEL-NEXT:    s_endpgm
bb:
  %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <16 x float> %res, ptr addrspace(1) %out
  ret void
}

declare <8 x float>
@llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)

; 16x16x32 bf16-input intrinsics.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)

; 16x16x64 fp8/bf8-input intrinsics with <8 x float> results.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)

; 16x16x64 fp8/bf8-input intrinsics with <8 x half> results.
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)

; 16x16x64 iu8 intrinsic; the two sign-selector operands are immargs.
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)

; 16x16x32 f16-input intrinsics.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)

; 16x16x128 f8f6f4 intrinsic.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)

; 16x16x128 fp8/bf8-input intrinsics with <8 x half> results.
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)

; 16x16x128 fp8/bf8-input intrinsics with <8 x float> results.
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)

; 32x16x128 f4 intrinsic.
declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)