; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -debug-only si-insert-waitcnts < %s 2>&1 | FileCheck %s
; REQUIRES: asserts

; Runs llc for gfx1250 with debug output enabled for si-insert-waitcnts only,
; and pipes stderr into FileCheck. The single CHECK line below only requires
; that the debug log contains the block header for the entry block; no
; generated instruction is matched.
; NOTE(review): presumably a "must not crash/assert" regression test for the
; waitcnt insertion pass -- confirm against the originating commit.

; CHECK: Begin Block: bb.0.bb

define amdgpu_kernel void @main(ptr addrspace(3) %arg) {
bb:
  ; Load from the addrspace(3) pointer argument; the tensor load intrinsic is
  ; called right after it, before the loaded value is used.
  %i = load <16 x i8>, ptr addrspace(3) %arg, align 16
  tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)

  ; NOTE(review): both shufflevector instructions below are missing their mask
  ; constant (only the mask type `<64 x i32>` is present). The IR will not
  ; parse until the original `<i32 ..., ...>` mask values are restored; they
  ; are intentionally not guessed here.
  %i1 = shufflevector <16 x i8> %i, <16 x i8> zeroinitializer, <64 x i32>
  %i2 = shufflevector <64 x i8> zeroinitializer, <64 x i8> %i1, <64 x i32>

  fence syncscope("workgroup") release

  ; Reinterpret the shuffled bytes as <16 x i32> and pass them as the second
  ; argument of the scaled WMMA intrinsic.
  %i3 = bitcast <64 x i8> %i2 to <16 x i32>
  %i4 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %i3, i32 0, <16 x i32> zeroinitializer, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)

  ; Store element 0 of the WMMA result through a buffer store so the result
  ; has a use with a side effect.
  %i5 = extractelement <8 x float> %i4, i64 0
  %i6 = insertelement <4 x float> zeroinitializer, float %i5, i64 0
  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %i6, ptr addrspace(8) null, i32 0, i32 0, i32 0)
  ret void
}