; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s

; Both kernels below use llvm.amdgcn.sched.group.barrier to request a specific
; ordering of DS reads (mask 256), SWMMAC ops (mask 8) and DS writes (mask 512);
; the assertions pin the instruction order that llc emits for each request.
;
; The assertion lines are machine-generated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; The declared name must match the name used at the call sites exactly, and the
; last (index) operand is i16, as the ".i16" suffix of the mangled name says.
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)

; Clustered pipeline: the three sched.group.barrier calls at the end of the
; kernel (all with sync id 0) ask for one group of 7 DS reads, then one group
; of 5 SWMMAC ops, then one group of 5 DS writes.
;
; The kernel loads five <8 x half> values and one <16 x half> value from %in,
; feeds each <8 x half> value to SWMMAC as both the first and the third operand
; together with the shared <16 x half> value, and stores the five results to
; %out. Only the first store address depends on the workitem id; the other four
; are fixed offsets from %out.
define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28
; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; GCN-NEXT: ds_load_b128 v[8:11], v0
; GCN-NEXT: ds_load_b128 v[12:15], v0 offset:512
; GCN-NEXT: ds_load_b128 v[16:19], v0 offset:1536
; GCN-NEXT: ds_load_b128 v[20:23], v0 offset:3072
; GCN-NEXT: ds_load_b128 v[24:27], v0 offset:5120
; GCN-NEXT: ds_load_b128 v[4:7], v0 offset:11280
; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x6
; GCN-NEXT: v_mov_b32_e32 v31, v11
; GCN-NEXT: s_wait_dscnt 0x5
; GCN-NEXT: v_mov_b32_e32 v35, v15
; GCN-NEXT: s_wait_dscnt 0x4
; GCN-NEXT: v_mov_b32_e32 v39, v19
; GCN-NEXT: s_wait_dscnt 0x3
; GCN-NEXT: v_mov_b32_e32 v43, v23
; GCN-NEXT: s_wait_dscnt 0x2
; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GCN-NEXT: v_mov_b32_e32 v32, v12
; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; GCN-NEXT: v_mov_b32_e32 v36, v16
; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; GCN-NEXT: v_mov_b32_e32 v40, v20
; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; GCN-NEXT: v_mov_b32_e32 v44, v24
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; GCN-NEXT: ds_store_b128 v49, v[28:31]
; GCN-NEXT: ds_store_b128 v50, v[32:35] offset:512
; GCN-NEXT: ds_store_b128 v50, v[36:39] offset:1024
; GCN-NEXT: ds_store_b128 v50, v[40:43] offset:1536
; GCN-NEXT: ds_store_b128 v50, v[44:47] offset:2048
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; GCN-NEXT: s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0
; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0
; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v0 offset:512
; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v0 offset:1536
; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v0 offset:3072
; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v0 offset:5120
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v0 offset:11280
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; EXACTCUTOFF-NEXT: ds_store_b128 v49, v[28:31]
; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[32:35] offset:512
; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[36:39] offset:1024
; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[40:43] offset:1536
; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[44:47] offset:2048
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 7 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0)
  ; 5 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
  ; 5 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0)
  ret void
}

; Interleaved pipeline: the sched.group.barrier calls at the end of the kernel
; (all with sync id 0) ask for a group of 3 DS reads first, followed by the
; repeating sequence {1 SWMMAC, 1 DS write, 1 DS read}; the final round has no
; trailing DS read, so the request ends with 1 SWMMAC and 1 DS write.
;
; The data flow mirrors the clustered kernel, except that the shared
; <16 x half> value is loaded first and the five <8 x half> values are loaded
; from addresses chained off it.
define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v16, 0x3ff, v0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_lshl_add_u32 v17, v16, 5, s0
; GCN-NEXT: v_lshl_add_u32 v16, v16, 4, s1
; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:1024
; GCN-NEXT: ds_load_b128 v[0:3], v17
; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x2
; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15]
; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:2560
; GCN-NEXT: v_mov_b32_e32 v16, s1
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:512
; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:4608
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1024
; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:7168
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1536
; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:10240
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2048
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x3ff, v0
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0
; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2)
; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v16, 5, s0
; EXACTCUTOFF-NEXT: v_lshl_add_u32 v16, v16, 4, s1
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:1024
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15]
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:2560
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:512
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:4608
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1024
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:7168
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1536
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:10240
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2048
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 3 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ret void
}