; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s

; Each test calls an f8f6f4 WMMA intrinsic with an A and/or B matrix operand
; whose IR vector type is wider than the checks expect for that operand's
; format code. The expected output narrows the operand with a shufflevector
; and calls the intrinsic overload mangled for the narrower type.
;
; Format codes and expected operand widths, as exercised by the tests below:
;   0 (fp8), 1 (bf8) -> <16 x i32>
;   2 (fp6), 3 (bf6) -> <12 x i32>
;   4 (fp4)          -> <8 x i32>

; ------------------------------------------------------------------------------------
; Incorrect signature for format cases (IR vector too large) wmma.f32.16x16x128.f8f6f4
; ------------------------------------------------------------------------------------

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 2, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> [[TMP0]], i32 1, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 3, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 3, <16 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <16 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> [[TMP0]], i32 2, <12 x i32> [[TMP1]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> [[TMP0]], i32 4, <8 x i32> [[TMP1]], i16 0, <8 x float> [[C]])
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; ------------------------------------------------------------------------------------------
; Incorrect signature for format cases (IR vector too large) wmma.scale.f32.16x16x128.f8f6f4
; ------------------------------------------------------------------------------------------

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 2, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> [[TMP0]], i32 1, <16 x i32> [[B]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 3, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 3, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> [[TMP0]], i32 2, <12 x i32> [[TMP1]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> [[TMP0]], i32 4, <8 x i32> [[TMP1]], i16 0, <8 x float> [[C]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; --------------------------------------------------------------------------------------------
; Incorrect signature for format cases (IR vector too large) wmma.scale16.f32.16x16x128.f8f6f4
; --------------------------------------------------------------------------------------------

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp6___v16i32_fp8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 2, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_fp6(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_bf6___v16i32_bf8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> [[TMP0]], i32 1, <16 x i32> [[B]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_bf8___v16i32_bf6(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A]], i32 3, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 3, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp4___v16i32_fp8(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp8___v16i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp8(
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> [[TMP0]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_fp8___v12i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> [[A]], i32 4, <8 x i32> [[TMP0]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v12i32_fp4___v16i32_fp6(
; CHECK-SAME: <12 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> [[TMP0]], i32 2, <12 x i32> [[TMP1]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 4, <12 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4___v16i32_bf6___v12i32_fp4(
; CHECK-SAME: <16 x i32> [[A:%.*]], <12 x i32> [[B:%.*]], <8 x float> [[C:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <12 x i32>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <12 x i32> [[B]], <12 x i32> poison, <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> [[TMP0]], i32 4, <8 x i32> [[TMP1]], i16 0, <8 x float> [[C]], i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
; CHECK-NEXT:    store <8 x float> [[RES]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT:    ret void
;
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 3, <16 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 0, i32 0, i32 0, i64 0, i1 false, i1 false)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}