; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL

; Checks how the index operand of the gfx1250 SWMMAC intrinsics is selected.
;
; Every test loads a packed index value from %IndexVecPtr, takes its upper half
; (extractelement of lane 1, or "lshr 32" in the i64-index variants) and passes
; that as the index argument of the intrinsic. The expected code keeps the whole
; loaded register (or register pair) as the index operand and expresses the
; selection with "index_key:1" instead of emitting separate extract code.
;
; The 16x16x128 tests additionally store the packed value back through
; %IndexVecOutPtr, so the loaded value has a second use besides the intrinsic.
;
; NOTE(review): for the three bf16 tests the second run line expects the same
; store order as the first one, unlike every other test in this file. This is
; presumably the -global-isel-abort=2 fallback path - confirm before relying
; on it.
;
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.

; --- 16x16x64, bf16 inputs, i16 index packed in an i32 ---

define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b32 v32, v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[34:35], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b32 v32, v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
  %Index = extractelement <2 x i16> %IndexVec, i32 1
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b32 v28, v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1
; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b32 v28, v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1
; GISEL-NEXT:    global_store_b128 v[30:31], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
  %Index = extractelement <2 x i16> %IndexVec, i32 1
  %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false)
  store <8 x bfloat> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b32 v32, v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[34:35], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b32 v32, v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
  %Index = extractelement <2 x i16> %IndexVec, i32 1
  %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; --- 16x16x128, fp8/bf8 inputs, f32 accumulator, i32 or i64 index packed in an i64 ---

define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; --- 16x16x128, fp8/bf8 inputs, f16 accumulator, i32 or i64 index packed in an i64 ---

define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GFX1250-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GISEL-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GFX1250-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GISEL-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GFX1250-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GISEL-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  ; Naturally aligned i64 load, same as the other 16x16x128 tests.
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GFX1250-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GISEL-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  ; Naturally aligned i64 load, same as the other 16x16x128 tests.
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GFX1250-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GISEL-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GFX1250-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GISEL-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GFX1250-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GISEL-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GFX1250-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
; GISEL-NEXT:    global_store_b64 v[30:31], v[28:29], off
; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; --- 16x16x128, iu8 inputs, i32 accumulator, i32 or i64 index packed in an i64 ---

define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
  %Index = extractelement <2 x i32> %IndexVec, i32 1
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i32 %Index, i1 false, i1 false)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GFX1250-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
; GISEL-NEXT:    global_store_b64 v[34:35], v[32:33], off
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
  %Index = lshr i64 %IndexVecPacked, 32
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 false)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

; --- 16x16x64, f16 inputs, i16 index packed in an i32 ---

define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b32 v32, v[32:33], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
; GFX1250-NEXT:    s_clause 0x1
; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
; GFX1250-NEXT:    global_store_b128 v[34:35], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f32_16x16x64_f16:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b32 v32, v[32:33], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
; GISEL-NEXT:    s_clause 0x1
; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
; GISEL-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
  %Index = extractelement <2 x i16> %IndexVec, i32 1
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16:
; GFX1250:       ; %bb.0: ; %bb
; GFX1250-NEXT:    global_load_b32 v28, v[28:29], off
; GFX1250-NEXT:    s_wait_loadcnt 0x0
; GFX1250-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1
; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off
; GFX1250-NEXT:    s_endpgm
;
; GISEL-LABEL: test_swmmac_f16_16x16x64_f16:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    global_load_b32 v28, v[28:29], off
; GISEL-NEXT:    s_wait_loadcnt 0x0
; GISEL-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1
; GISEL-NEXT:    global_store_b128 v[30:31], v[24:27], off
; GISEL-NEXT:    s_endpgm
bb:
  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
  %Index = extractelement <2 x i16> %IndexVec, i32 1
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; Explicit declarations for the i16- and i32-index intrinsic variants. The
; i64-index variants called above have no explicit declaration in this file.
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i32, i1, i1)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1)