// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s

typedef int v8i __attribute__((ext_vector_type(8)));
typedef _Bool v8b __attribute__((ext_vector_type(8)));

// CHECK-LABEL: define dso_local <8 x i32> @test_load(
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[M:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT:    store i8 [[M_COERCE]], ptr [[M]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
// CHECK-NEXT:    [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i1> [[M1]] to i8
// CHECK-NEXT:    store i8 [[TMP0]], ptr [[M_ADDR]], align 1
// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[P_ADDR]], align 8
// CHECK-NEXT:    [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
// CHECK-NEXT:    ret <8 x i32> [[MASKED_LOAD]]
//
v8i test_load(v8b m, int *p) {
  return __builtin_masked_load(m, p);
}

// CHECK-LABEL: define dso_local <8 x i32> @test_load_passthru(
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef [[P:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[M:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT:    [[T_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    store i8 [[M_COERCE]], ptr [[M]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
// CHECK-NEXT:    [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[T:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[M1]] to i8
// CHECK-NEXT:    store i8 [[TMP1]], ptr [[M_ADDR]], align 1
// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
// CHECK-NEXT:    store <8 x i32> [[T]], ptr [[T_ADDR]], align 32
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[P_ADDR]], align 8
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr [[T_ADDR]], align 32
// CHECK-NEXT:    [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
// CHECK-NEXT:    ret <8 x i32> [[MASKED_LOAD]]
//
v8i test_load_passthru(v8b m, int *p, v8i t) {
  return __builtin_masked_load(m, p, t);
}

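// The expand-load and compress-store builtins below are expected to lower to
// llvm.masked.expandload / llvm.masked.compressstore, which only touch
// consecutive memory elements for the active mask lanes.
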
// CHECK-LABEL: define dso_local <8 x i32> @test_load_expand(
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef [[P:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[M:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT:    [[T_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    store i8 [[M_COERCE]], ptr [[M]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
// CHECK-NEXT:    [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[T:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[M1]] to i8
// CHECK-NEXT:    store i8 [[TMP1]], ptr [[M_ADDR]], align 1
// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
// CHECK-NEXT:    store <8 x i32> [[T]], ptr [[T_ADDR]], align 32
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[P_ADDR]], align 8
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr [[T_ADDR]], align 32
// CHECK-NEXT:    [[MASKED_EXPAND_LOAD:%.*]] = call <8 x i32> @llvm.masked.expandload.v8i32(ptr [[TMP3]], <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
// CHECK-NEXT:    ret <8 x i32> [[MASKED_EXPAND_LOAD]]
//
v8i test_load_expand(v8b m, int *p, v8i t) {
  return __builtin_masked_expand_load(m, p, t);
}

// CHECK-LABEL: define dso_local void @test_store(
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef [[P:%.*]]) #[[ATTR3:[0-9]+]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[M:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT:    store i8 [[M_COERCE]], ptr [[M]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
// CHECK-NEXT:    [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[V:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[M1]] to i8
// CHECK-NEXT:    store i8 [[TMP1]], ptr [[M_ADDR]], align 1
// CHECK-NEXT:    store <8 x i32> [[V]], ptr [[V_ADDR]], align 32
// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[P_ADDR]], align 8
// CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP3]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP2]])
// CHECK-NEXT:    ret void
//
void test_store(v8b m, v8i v, int *p) {
  __builtin_masked_store(m, v, p);
}

// CHECK-LABEL: define dso_local void @test_compress_store(
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef [[P:%.*]]) #[[ATTR3]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[M:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT:    store i8 [[M_COERCE]], ptr [[M]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
// CHECK-NEXT:    [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[V:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[M1]] to i8
// CHECK-NEXT:    store i8 [[TMP1]], ptr [[M_ADDR]], align 1
// CHECK-NEXT:    store <8 x i32> [[V]], ptr [[V_ADDR]], align 32
// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[P_ADDR]], align 8
// CHECK-NEXT:    call void @llvm.masked.compressstore.v8i32(<8 x i32> [[TMP3]], ptr [[TMP4]], <8 x i1> [[TMP2]])
// CHECK-NEXT:    ret void
//
void test_compress_store(v8b m, v8i v, int *p) {
  __builtin_masked_compress_store(m, v, p);
}

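// For the gather/scatter builtins, per-lane addresses are formed with a vector
// getelementptr over the index operand, and the accesses lower to
// llvm.masked.gather / llvm.masked.scatter.
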
// CHECK-LABEL: define dso_local <8 x i32> @test_gather(
// CHECK-SAME: i8 noundef [[MASK_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[MASK:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[MASK_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[IDX_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT:    store i8 [[MASK_COERCE]], ptr [[MASK]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[MASK]], align 1
// CHECK-NEXT:    [[MASK1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[IDX:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[MASK1]] to i8
// CHECK-NEXT:    store i8 [[TMP1]], ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    store <8 x i32> [[IDX]], ptr [[IDX_ADDR]], align 32
// CHECK-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[IDX_ADDR]], align 32
// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], <8 x i32> [[TMP3]]
// CHECK-NEXT:    [[MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> [[TMP2]], <8 x i32> poison)
// CHECK-NEXT:    ret <8 x i32> [[MASKED_GATHER]]
//
v8i test_gather(v8b mask, v8i idx, int *ptr) {
  return __builtin_masked_gather(mask, idx, ptr);
}

// CHECK-LABEL: define dso_local void @test_scatter(
// CHECK-SAME: i8 noundef [[MASK_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP1:%.*]], ptr noundef [[PTR:%.*]]) #[[ATTR3]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[MASK:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[MASK_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[IDX_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT:    store i8 [[MASK_COERCE]], ptr [[MASK]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[MASK]], align 1
// CHECK-NEXT:    [[MASK1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[VAL:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[IDX:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[MASK1]] to i8
// CHECK-NEXT:    store i8 [[TMP2]], ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    store <8 x i32> [[VAL]], ptr [[VAL_ADDR]], align 32
// CHECK-NEXT:    store <8 x i32> [[IDX]], ptr [[IDX_ADDR]], align 32
// CHECK-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr [[VAL_ADDR]], align 32
// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr [[IDX_ADDR]], align 32
// CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], <8 x i32> [[TMP4]]
// CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP5]], <8 x ptr> [[TMP7]], i32 4, <8 x i1> [[TMP3]])
// CHECK-NEXT:    ret void
//
void test_scatter(v8b mask, v8i val, v8i idx, int *ptr) {
  __builtin_masked_scatter(mask, val, idx, ptr);
}

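// The remaining tests use a pointer qualified with address_space(42); the
// builtins are expected to select the matching address-space overloads of the
// masked intrinsics (.p42 / .v8p42).
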
// CHECK-LABEL: define dso_local <8 x i32> @test_load_as(
// CHECK-SAME: i8 noundef [[MASK_COERCE:%.*]], ptr addrspace(42) noundef [[PTR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[MASK:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[MASK_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr addrspace(42), align 8
// CHECK-NEXT:    store i8 [[MASK_COERCE]], ptr [[MASK]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[MASK]], align 1
// CHECK-NEXT:    [[MASK1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i1> [[MASK1]] to i8
// CHECK-NEXT:    store i8 [[TMP0]], ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    store ptr addrspace(42) [[PTR]], ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(42), ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p42(ptr addrspace(42) [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
// CHECK-NEXT:    ret <8 x i32> [[MASKED_LOAD]]
//
v8i test_load_as(v8b mask, int __attribute__((address_space(42))) * ptr) {
  return __builtin_masked_load(mask, ptr);
}

// CHECK-LABEL: define dso_local void @test_store_as(
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr addrspace(42) noundef [[P:%.*]]) #[[ATTR3]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[M:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr addrspace(42), align 8
// CHECK-NEXT:    store i8 [[M_COERCE]], ptr [[M]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
// CHECK-NEXT:    [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[V:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[M1]] to i8
// CHECK-NEXT:    store i8 [[TMP1]], ptr [[M_ADDR]], align 1
// CHECK-NEXT:    store <8 x i32> [[V]], ptr [[V_ADDR]], align 32
// CHECK-NEXT:    store ptr addrspace(42) [[P]], ptr [[P_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(42), ptr [[P_ADDR]], align 8
// CHECK-NEXT:    call void @llvm.masked.store.v8i32.p42(<8 x i32> [[TMP3]], ptr addrspace(42) [[TMP4]], i32 4, <8 x i1> [[TMP2]])
// CHECK-NEXT:    ret void
//
void test_store_as(v8b m, v8i v, int __attribute__((address_space(42))) *p) {
  __builtin_masked_store(m, v, p);
}

// CHECK-LABEL: define dso_local <8 x i32> @test_gather_as(
// CHECK-SAME: i8 noundef [[MASK_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr addrspace(42) noundef [[PTR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[MASK:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[MASK_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[IDX_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr addrspace(42), align 8
// CHECK-NEXT:    store i8 [[MASK_COERCE]], ptr [[MASK]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[MASK]], align 1
// CHECK-NEXT:    [[MASK1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[IDX:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[MASK1]] to i8
// CHECK-NEXT:    store i8 [[TMP1]], ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    store <8 x i32> [[IDX]], ptr [[IDX_ADDR]], align 32
// CHECK-NEXT:    store ptr addrspace(42) [[PTR]], ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[IDX_ADDR]], align 32
// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(42), ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr addrspace(42) [[TMP4]], <8 x i32> [[TMP3]]
// CHECK-NEXT:    [[MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p42(<8 x ptr addrspace(42)> [[TMP5]], i32 4, <8 x i1> [[TMP2]], <8 x i32> poison)
// CHECK-NEXT:    ret <8 x i32> [[MASKED_GATHER]]
//
v8i test_gather_as(v8b mask, v8i idx,
                   int __attribute__((address_space(42))) *ptr) {
  return __builtin_masked_gather(mask, idx, ptr);
}

// CHECK-LABEL: define dso_local void @test_scatter_as(
// CHECK-SAME: i8 noundef [[MASK_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP1:%.*]], ptr addrspace(42) noundef [[PTR:%.*]]) #[[ATTR3]] {
// CHECK-NEXT:  [[ENTRY:.*:]]
// CHECK-NEXT:    [[MASK:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[MASK_ADDR:%.*]] = alloca i8, align 1
// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[IDX_ADDR:%.*]] = alloca <8 x i32>, align 32
// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr addrspace(42), align 8
// CHECK-NEXT:    store i8 [[MASK_COERCE]], ptr [[MASK]], align 1
// CHECK-NEXT:    [[LOAD_BITS:%.*]] = load i8, ptr [[MASK]], align 1
// CHECK-NEXT:    [[MASK1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
// CHECK-NEXT:    [[VAL:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
// CHECK-NEXT:    [[IDX:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[MASK1]] to i8
// CHECK-NEXT:    store i8 [[TMP2]], ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    store <8 x i32> [[VAL]], ptr [[VAL_ADDR]], align 32
// CHECK-NEXT:    store <8 x i32> [[IDX]], ptr [[IDX_ADDR]], align 32
// CHECK-NEXT:    store ptr addrspace(42) [[PTR]], ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[LOAD_BITS2:%.*]] = load i8, ptr [[MASK_ADDR]], align 1
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr [[VAL_ADDR]], align 32
// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr [[IDX_ADDR]], align 32
// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(42), ptr [[PTR_ADDR]], align 8
// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr addrspace(42) [[TMP6]], <8 x i32> [[TMP4]]
// CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p42(<8 x i32> [[TMP5]], <8 x ptr addrspace(42)> [[TMP7]], i32 4, <8 x i1> [[TMP3]])
// CHECK-NEXT:    ret void
//
void test_scatter_as(v8b mask, v8i val, v8i idx,
                     int __attribute__((address_space(42))) *ptr) {
  __builtin_masked_scatter(mask, val, idx, ptr);
}