Diffstat (limited to 'clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl')
-rw-r--r-- | clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 963 |
1 file changed, 963 insertions, 0 deletions
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index a9ea176..4ff0571 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -4,7 +4,23 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +typedef unsigned int uint; +typedef unsigned short int ushort; +typedef unsigned int __attribute__((ext_vector_type(2))) uint2; +typedef unsigned int __attribute__((ext_vector_type(3))) uint3; +typedef unsigned int __attribute__((ext_vector_type(4))) uint4; +typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2; +typedef __bf16 __attribute__((ext_vector_type(8))) bfloat8; +typedef __bf16 __attribute__((ext_vector_type(16))) bfloat16; +typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32; typedef half __attribute__((ext_vector_type(2))) half2; +typedef half __attribute__((ext_vector_type(8))) half8; +typedef half __attribute__((ext_vector_type(16))) half16; +typedef half __attribute__((ext_vector_type(32))) half32; +typedef float __attribute__((ext_vector_type(8))) float8; +typedef float __attribute__((ext_vector_type(16))) float16; +typedef float __attribute__((ext_vector_type(32))) float32; +typedef short __attribute__((ext_vector_type(2))) short2; // CHECK-LABEL: @test_setprio_inc_wg( // CHECK-NEXT: entry: @@ -42,6 +58,24 @@ void test_s_wait_tensorcnt() { __builtin_amdgcn_s_wait_tensorcnt(0); } +// CHECK-LABEL: @test_prng_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: ret void +// +void test_prng_b32(global uint* out, uint a) { + *out = __builtin_amdgcn_prng_b32(a); +} + // CHECK-LABEL: @test_tanh_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -233,6 +267,60 @@ void test_cos_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_cos_bf16(a); } +// CHECK-LABEL: @test_cvt_sr_pk_bf16_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: 
store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float [[TMP0]], float [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x bfloat> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_sr_pk_bf16_f32(global bfloat2* out, float a, float b, uint sr) +{ + *out = __builtin_amdgcn_cvt_sr_pk_bf16_f32(a, b, sr); +} + +// CHECK-LABEL: @test_cvt_sr_pk_f16_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float [[TMP0]], float [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_sr_pk_f16_f32(global half2* out, float a, float b, uint sr) +{ + *out = __builtin_amdgcn_cvt_sr_pk_f16_f32(a, b, sr); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -349,6 +437,881 @@ void test_cvt_pk_f16_bf8(global half2* out, short a) out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); } +// CHECK-LABEL: @test_cvt_pk_bf8_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_cvt_pk_bf8_f16(global short* out, half2 a) +{ + *out = __builtin_amdgcn_cvt_pk_bf8_f16(a); +} + +// CHECK-LABEL: @test_cvt_pk_fp8_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_cvt_pk_fp8_f16(global short* out, half2 a) +{ + *out = __builtin_amdgcn_cvt_pk_fp8_f16(a); +} + +// CHECK-LABEL: @test_cvt_sr_bf8_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i32 1) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call i32 
@llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i32 2) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i32 3) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_sr_bf8_f16(global int* out, half a, uint sr, int old) +{ + *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 0); + *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 1); + *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 2); + *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 3); +} + +// CHECK-LABEL: @test_cvt_sr_fp8_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i16, align 2, addrspace(5) +// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i16 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP0]], i32 [[CONV]], i32 [[TMP2]], i32 0) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP6]] to i32 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP5]], i32 [[CONV1]], i32 [[TMP7]], i32 1) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half 
[[TMP10]], i32 [[CONV2]], i32 [[TMP12]], i32 2) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CONV3:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP15]], i32 [[CONV3]], i32 [[TMP17]], i32 3) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old) +{ + *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 0); + *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 1); + *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 2); + *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3); +} + +// CHECK-LABEL: @test_cvt_scale_pk( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUTH8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTY8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// CHECK-NEXT: [[OUTF32_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTF8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTH16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTY16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTF16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC3_ADDR:%.*]] = alloca <3 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUTH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH8_ADDR]] to ptr +// CHECK-NEXT: [[OUTY8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY8_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: [[OUTF32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF32_ADDR]] to ptr +// CHECK-NEXT: [[OUTF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF8_ADDR]] to ptr +// CHECK-NEXT: [[OUTH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH16_ADDR]] to ptr +// CHECK-NEXT: [[OUTY16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY16_ADDR]] to ptr +// CHECK-NEXT: [[OUTF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF16_ADDR]] to ptr +// CHECK-NEXT: [[SRC3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC3_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUTH8:%.*]], ptr [[OUTH8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTY8:%.*]], ptr [[OUTY8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTF32:%.*]], ptr [[OUTF32_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTF8:%.*]], ptr [[OUTF8_ADDR_ASCAST]], 
align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTH16:%.*]], ptr [[OUTH16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTY16:%.*]], ptr [[OUTY16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTF16:%.*]], ptr [[OUTF16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[SRC3:%.*]], ptr [[SRC3_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> [[TMP0]], i32 [[TMP1]], i32 4) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> [[TMP4]], i32 [[TMP5]], i32 5) +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> [[TMP8]], i32 [[TMP9]], i32 6) +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> [[TMP12]], i32 [[TMP13]], i32 7) +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 [[TMP16]], i32 [[TMP17]], i32 1) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP18]], ptr addrspace(1) [[TMP19]], align 16 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 [[TMP20]], i32 [[TMP21]], i32 2) +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[TMP22]], ptr addrspace(1) [[TMP23]], align 16 +// CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> [[TMP24]], i32 [[TMP25]], i32 5) +// CHECK-NEXT: [[TMP27:%.*]] = load ptr 
addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x float> [[TMP26]], ptr addrspace(1) [[TMP27]], align 32 +// CHECK-NEXT: [[TMP28:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> [[TMP28]], i32 [[TMP29]], i32 6) +// CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x float> [[TMP30]], ptr addrspace(1) [[TMP31]], align 32 +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP34:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 [[TMP32]], i32 [[TMP33]], i32 7) +// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x float> [[TMP34]], ptr addrspace(1) [[TMP35]], align 32 +// CHECK-NEXT: [[TMP36:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> [[TMP36]], i32 [[TMP37]], i32 0) +// CHECK-NEXT: [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUTH16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x half> [[TMP38]], ptr addrspace(1) [[TMP39]], align 32 +// CHECK-NEXT: [[TMP40:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> [[TMP40]], i32 [[TMP41]], i32 1) +// CHECK-NEXT: [[TMP43:%.*]] = load ptr addrspace(1), ptr [[OUTY16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x bfloat> [[TMP42]], ptr addrspace(1) [[TMP43]], align 32 +// CHECK-NEXT: [[TMP44:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP46:%.*]] = call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> [[TMP44]], i32 [[TMP45]], i32 2) +// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr [[OUTH16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 32 +// CHECK-NEXT: [[TMP48:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> [[TMP48]], i32 [[TMP49]], i32 3) +// CHECK-NEXT: [[TMP51:%.*]] = load ptr addrspace(1), ptr [[OUTY16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x bfloat> [[TMP50]], ptr addrspace(1) [[TMP51]], align 32 +// CHECK-NEXT: [[TMP52:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP54:%.*]] = call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.fp6(<3 x i32> [[TMP52]], i32 [[TMP53]], i32 3) +// CHECK-NEXT: [[TMP55:%.*]] = load ptr addrspace(1), ptr [[OUTF16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x float> [[TMP54]], ptr addrspace(1) [[TMP55]], align 64 +// CHECK-NEXT: [[TMP56:%.*]] = load <3 x i32>, ptr [[SRC3_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = call <16 x 
float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> [[TMP56]], i32 [[TMP57]], i32 4) +// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUTF16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x float> [[TMP58]], ptr addrspace(1) [[TMP59]], align 64 +// CHECK-NEXT: ret void +// +void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2, + global float32 *outf32, global float8 *outf8, + global half16 *outh16, global bfloat16 *outy16, + global float16 *outf16, uint3 src3, + uint src1, uint scale) +{ + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 4); + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 5); + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 6); + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 7); + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 1); + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 2); + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 5); + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 6); + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7); + *outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_fp6(src3, scale, 0); + *outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_fp6(src3, scale, 1); + *outh16 = __builtin_amdgcn_cvt_scale_pk16_f16_bf6(src3, scale, 2); + *outy16 = __builtin_amdgcn_cvt_scale_pk16_bf16_bf6(src3, scale, 3); + *outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_fp6(src3, scale, 3); + *outf16 = __builtin_amdgcn_cvt_scale_pk16_f32_bf6(src3, scale, 4); +} + +// CHECK-LABEL: @test_cvt_scalef32_pk( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5) +// CHECK-NEXT: [[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5) +// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5) +// CHECK-NEXT: [[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) +// CHECK-NEXT: [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5) +// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr +// CHECK-NEXT: [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr +// CHECK-NEXT: [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr +// CHECK-NEXT: [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr +// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr +// CHECK-NEXT: [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr +// CHECK-NEXT: [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr +// CHECK-NEXT: [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr +// CHECK-NEXT: [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[SRCBF8:%.*]], ptr 
[[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], float [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> [[TMP4]], float [[TMP5]]) +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> [[TMP8]], float [[TMP9]]) +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> [[TMP12]], float [[TMP13]]) +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> [[TMP16]], float [[TMP17]]) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> [[TMP20]], float [[TMP21]]) +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 +// CHECK-NEXT: [[TMP24:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: 
[[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> [[TMP24]], float [[TMP25]]) +// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> [[TMP28]], float [[TMP29]]) +// CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP30]], ptr addrspace(1) [[TMP31]], align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> [[TMP32]], float [[TMP33]]) +// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP34]], ptr addrspace(1) [[TMP35]], align 4 +// CHECK-NEXT: [[TMP36:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> [[TMP36]], float [[TMP37]]) +// CHECK-NEXT: [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP38]], ptr addrspace(1) [[TMP39]], align 16 +// CHECK-NEXT: [[TMP40:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> [[TMP40]], float [[TMP41]]) +// CHECK-NEXT: [[TMP43:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP42]], ptr addrspace(1) [[TMP43]], align 16 +// CHECK-NEXT: [[TMP44:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP46:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> [[TMP44]], float [[TMP45]]) +// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP46]], ptr addrspace(1) [[TMP47]], align 16 +// CHECK-NEXT: [[TMP48:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> [[TMP48]], float [[TMP49]]) +// CHECK-NEXT: [[TMP51:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP50]], ptr addrspace(1) [[TMP51]], align 16 +// CHECK-NEXT: [[TMP52:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP54:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> [[TMP52]], float [[TMP53]]) +// CHECK-NEXT: [[TMP55:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP54]], ptr addrspace(1) [[TMP55]], align 
16 +// CHECK-NEXT: [[TMP56:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> [[TMP56]], float [[TMP57]]) +// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT: ret void +// +void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8, + global uint3 *out3, bfloat16 srcbf16, half16 srch16, float16 srcf16, + global uint *out1, float scale) +{ + *out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16(srcbf8, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16(srcbf8, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_f16(srch8, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_f16(srch8, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_f32(srcf8, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_f32(srcf8, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32(srcf8, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16(srch8, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_bf16(srcbf16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_f16(srch16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_bf16(srcbf16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_f16(srch16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_f32(srcf16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_f32(srcf16, scale); +} + +// CHECK-LABEL: @test_cvt_scalef32_sr_pk( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5) +// CHECK-NEXT: [[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5) +// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5) +// CHECK-NEXT: [[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) +// CHECK-NEXT: [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5) +// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr +// CHECK-NEXT: [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr +// CHECK-NEXT: [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr +// CHECK-NEXT: [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr +// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr +// CHECK-NEXT: [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr +// CHECK-NEXT: [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr +// CHECK-NEXT: [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr +// CHECK-NEXT: [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr +// 
CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> [[TMP5]], i32 [[TMP6]], float [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> [[TMP10]], i32 [[TMP11]], float [[TMP12]]) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> [[TMP15]], i32 [[TMP16]], float [[TMP17]]) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: 
[[TMP23:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> [[TMP20]], i32 [[TMP21]], float [[TMP22]]) +// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]]) +// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> [[TMP30]], i32 [[TMP31]], float [[TMP32]]) +// CHECK-NEXT: [[TMP34:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP33]], ptr addrspace(1) [[TMP34]], align 4 +// CHECK-NEXT: [[TMP35:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> [[TMP35]], i32 [[TMP36]], float [[TMP37]]) +// CHECK-NEXT: [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP38]], ptr addrspace(1) [[TMP39]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) +// CHECK-NEXT: [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]]) +// CHECK-NEXT: [[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16 +// CHECK-NEXT: [[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]]) +// CHECK-NEXT: [[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP53]], ptr addrspace(1) 
[[TMP54]], align 16 +// CHECK-NEXT: [[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]]) +// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 +// CHECK-NEXT: [[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP61:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP63:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> [[TMP60]], i32 [[TMP61]], float [[TMP62]]) +// CHECK-NEXT: [[TMP64:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP63]], ptr addrspace(1) [[TMP64]], align 16 +// CHECK-NEXT: [[TMP65:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP67:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP68:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> [[TMP65]], i32 [[TMP66]], float [[TMP67]]) +// CHECK-NEXT: [[TMP69:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP68]], ptr addrspace(1) [[TMP69]], align 16 +// CHECK-NEXT: [[TMP70:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP72:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP73:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> [[TMP70]], i32 [[TMP71]], float [[TMP72]]) +// CHECK-NEXT: [[TMP74:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP73]], ptr addrspace(1) [[TMP74]], align 16 +// CHECK-NEXT: ret void +// +void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8, + global uint3 *out3, bfloat16 srcbf16, half16 srch16, float16 srcf16, + global uint *out1, uint sr, float scale) +{ + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16(srcbf8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16(srcbf8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16(srch8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16(srch8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32(srcf8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32(srcf8, sr, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32(srcf8, sr, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16(srch8, sr, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16(srcbf8, sr, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16(srcbf16, sr, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16(srch16, sr, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16(srcbf16, sr, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16(srch16, sr, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32(srcf16, sr, scale); + *out3 = 
__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32(srcf16, sr, scale); +} + +// CHECK-LABEL: @test_sat_pk4_i4_i8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP2]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP5]], align 2 +// CHECK-NEXT: ret void +// +void test_sat_pk4_i4_i8(ushort *out, uint src) +{ + *out = __builtin_amdgcn_sat_pk4_i4_i8(src); + *out = __builtin_amdgcn_sat_pk4_u4_u8(src); +} + +// CHECK-LABEL: @test_permlane16_swap( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 +// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0 +// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 +// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], 
align 8 +// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true) +// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0 +// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1 +// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 +// CHECK-NEXT: ret void +// +void test_permlane16_swap(global uint2* out, uint old, uint src) { + *out = __builtin_amdgcn_permlane16_swap(old, src, false, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, true, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, false, true); +} + +// CHECK-LABEL: @test_permlane_bcast( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_permlane_bcast(global uint* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_permlane_bcast(src0, src1, src2); +} + +// CHECK-LABEL: @test_permlane_down( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr 
+
+// CHECK-LABEL: @test_permlane_down(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
+// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
+// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_permlane_down(global uint* out, uint src0, uint src1, uint src2) {
+  *out = __builtin_amdgcn_permlane_down(src0, src1, src2);
+}
+
+// CHECK-LABEL: @test_permlane_up(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
+// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
+// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_permlane_up(global uint* out, uint src0, uint src1, uint src2) {
+  *out = __builtin_amdgcn_permlane_up(src0, src1, src2);
+}
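+
+// permlane_bcast, permlane_down and permlane_up share one shape: three uint
+// operands in, one uint out, lowered to a single intrinsic call. The test
+// does not define what src1/src2 encode (a lane delta, a row mask, or
+// similar), so the operand names below are hypothetical:
+//
+//   uint shifted = __builtin_amdgcn_permlane_down(value, delta, mask);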
+
+// CHECK-LABEL: @test_permlane_xor(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
+// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
+// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_permlane_xor(global uint* out, uint src0, uint src1, uint src2) {
+  *out = __builtin_amdgcn_permlane_xor(src0, src1, src2);
+}
+
+// CHECK-LABEL: @test_permlane_idx_gen(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
+// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.permlane.idx.gen(i32 [[TMP0]], i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+// CHECK-NEXT: ret void
+//
+void test_permlane_idx_gen(global uint* out, uint src0, uint src1) {
+  *out = __builtin_amdgcn_permlane_idx_gen(src0, src1);
+}
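+
+// permlane_xor keeps the same three-operand shape, while permlane_idx_gen
+// narrows to two operands and, going by its name, presumably generates a
+// lane index from src0/src1 rather than moving data itself. Hypothetical
+// operand names again:
+//
+//   uint idx = __builtin_amdgcn_permlane_idx_gen(value, lane_sel);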
+
+// CHECK-LABEL: @test_perm_pk(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[A64_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[B32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[B64_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[OUT4_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A32_ADDR]] to ptr
+// CHECK-NEXT: [[A64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A64_ADDR]] to ptr
+// CHECK-NEXT: [[B32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B32_ADDR]] to ptr
+// CHECK-NEXT: [[B64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B64_ADDR]] to ptr
+// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
+// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
+// CHECK-NEXT: [[OUT4_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT4_ADDR]] to ptr
+// CHECK-NEXT: store i32 [[A32:%.*]], ptr [[A32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[A64:%.*]], ptr [[A64_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[B32:%.*]], ptr [[B32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[B64:%.*]], ptr [[B64_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x i32> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[OUT4:%.*]], ptr [[OUT4_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 [[TMP0]], i32 [[TMP1]], <2 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 [[TMP5]], i64 [[CONV]], <2 x i32> [[TMP7]])
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP8]], ptr [[TMP9]], align 16
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A64_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV1:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[TMP11]] to i64
+// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 [[CONV1]], i64 [[CONV2]], <2 x i32> [[TMP12]])
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[OUT4_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP14]], align 16
+// CHECK-NEXT: ret void
+//
+void test_perm_pk(uint a32, uint a64, uint b32, uint b64, uint2 c, uint2 *out2, uint3 *out3, uint4 *out4) {
+  *out2 = __builtin_amdgcn_perm_pk16_b4_u4(a32, b32, c);
+  *out3 = __builtin_amdgcn_perm_pk16_b6_u4(a32, b64, c);
+  *out4 = __builtin_amdgcn_perm_pk16_b8_u4(a64, b64, c);
+}
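+
+// The result widths above are consistent with perm_pk16_b{4,6,8}_u4
+// producing sixteen packed 4-, 6- or 8-bit fields (16x4 = 64 bits in a
+// uint2, 16x6 = 96 bits in a uint3, 16x8 = 128 bits in a uint4), selected
+// by the packed 4-bit indices in c. The b6/b8 forms take 64-bit sources,
+// which is why the 32-bit arguments are zero-extended in the IR. Shape
+// sketch only:
+//
+//   uint4 r = __builtin_amdgcn_perm_pk16_b8_u4((ulong)a, (ulong)b, c);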
+
+// CHECK-LABEL: @test_prefetch(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[GPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[FPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR_ADDR]] to ptr
+// CHECK-NEXT: [[GPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GPTR_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[FPTR:%.*]], ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[GPTR:%.*]], ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.flat.prefetch(ptr [[TMP0]], i32 0)
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GPTR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) [[TMP1]], i32 8)
+// CHECK-NEXT: ret void
+//
+void test_prefetch(generic void *fptr, global void *gptr) {
+  __builtin_amdgcn_flat_prefetch(fptr, 0);
+  __builtin_amdgcn_global_prefetch(gptr, 8);
+}
+
+// CHECK-LABEL: @test_cvt_pk_fp8_f32_e5m3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float [[TMP0]], float [[TMP1]], i32 [[TMP2]], i1 true)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_pk_fp8_f32_e5m3(global int* out, int old, float a, float b)
+{
+  *out = __builtin_amdgcn_cvt_pk_fp8_f32_e5m3(a, b, old, true);
+}
+
+// CHECK-LABEL: @test_cvt_sr_fp8_f32_e5m3(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 3)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_sr_fp8_f32_e5m3(global int* out, int old, float a, int b)
+{
+  *out = __builtin_amdgcn_cvt_sr_fp8_f32_e5m3(a, b, old, 3);
+}
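+
+// Two hedged observations on the cases above: the second argument of the
+// prefetch builtins (0 and 8 here) is an immediate forwarded unchanged to
+// the intrinsic, likely a cache-policy hint, though this test leaves its
+// encoding undocumented; and the e5m3 converts appear to mirror the
+// existing fp8 builtins, with the pk form packing two floats into the half
+// of old selected by the bool, and the sr form rounding stochastically
+// with seed b into the byte picked by the final immediate. Sketch:
+//
+//   int packed = __builtin_amdgcn_cvt_pk_fp8_f32_e5m3(a, b, old, true);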
+
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)