; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s

; Constant-index stores fill a [2 x [3 x i32]] private alloca. The final GEP
; stops at row 1, so the i32 load reads element [1][0]; the checks expect the
; alloca to become a <6 x i32> vector and the load to fold to the constant 3.
define amdgpu_kernel void @i32_2d_load_store(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i32_2d_load_store(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x i32> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i32> [[ALLOCA]], i32 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i32> [[TMP1]], i32 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x i32> [[TMP2]], i32 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x i32> [[TMP3]], i32 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x i32> [[TMP4]], i32 4, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x i32> [[TMP5]], i32 5, i32 5
; CHECK-NEXT:    store i32 3, ptr [[OUT]], align 4
; CHECK-NEXT:    ret void
;
  %alloca = alloca [2 x [3 x i32]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  %gep.02 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
  %gep.10 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  %gep.11 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
  %gep.12 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
  store i32 0, ptr addrspace(5) %gep.00
  store i32 1, ptr addrspace(5) %gep.01
  store i32 2, ptr addrspace(5) %gep.02
  store i32 3, ptr addrspace(5) %gep.10
  store i32 4, ptr addrspace(5) %gep.11
  store i32 5, ptr addrspace(5) %gep.12
  %gep = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1
  %load = load i32, ptr addrspace(5) %gep
  store i32 %load, ptr %out
  ret void
}

; Same shape as @i32_2d_load_store but with i64 elements; the checks expect a
; <6 x i64> vector and the row-1 load folded to the constant 3.
define amdgpu_kernel void @i64_2d_load_store(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x i64> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i64> [[ALLOCA]], i64 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP1]], i64 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x i64> [[TMP2]], i64 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x i64> [[TMP3]], i64 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x i64> [[TMP4]], i64 4, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 5, i32 5
; CHECK-NEXT:    store i64 3, ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  %gep.02 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
  %gep.10 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  %gep.11 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
  %gep.12 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
  store i64 0, ptr addrspace(5) %gep.00
  store i64 1, ptr addrspace(5) %gep.01
  store i64 2, ptr addrspace(5) %gep.02
  store i64 3, ptr addrspace(5) %gep.10
  store i64 4, ptr addrspace(5) %gep.11
  store i64 5, ptr addrspace(5) %gep.12
  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1
  %load = load i64, ptr addrspace(5) %gep
  store i64 %load, ptr %out
  ret void
}

; A single <4 x i32> store writes only the first four of the eight i32 slots of
; a [2 x [4 x i32]] alloca. The load goes through a GEP whose source type is
; <4 x i32> with a dynamic index and reads the element as float; the checks
; expect an extractelement from the <8 x i32> vector followed by a bitcast.
define amdgpu_kernel void @i32_2d_alloca_store_partial(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
; CHECK-LABEL: define amdgpu_kernel void @i32_2d_alloca_store_partial(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[DUMMY_LDS:%.*]]) {
; CHECK-NEXT:  [[BB:.*:]]
; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <8 x i32> poison
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[ALLOCA]], i32 1, i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 2, i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP5]], i32 3, i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 4, i32 3
; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i32> [[TMP3]], i32 [[SEL2]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float
; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[OUT]], align 4
; CHECK-NEXT:    ret void
;
bb:
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  %c1 = icmp uge i32 %x, 3
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
  %alloca = alloca [2 x [4 x i32]], align 4, addrspace(5)
  %gep = getelementptr inbounds <4 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca, align 4
  %load = load float, ptr addrspace(5) %gep, align 4
  store float %load, ptr addrspace(1) %out, align 4
  ret void
}

; The load GEP reinterprets the [2 x [3 x i64]] alloca as a flat [6 x i64] and
; indexes it with a dynamic value; the checks expect a direct extractelement
; from the <6 x i64> vector at that index.
define amdgpu_kernel void @i64_2d_load_store_cast(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_cast(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x i64> poison
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <6 x i64> [[ALLOCA]], i64 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP7]], i64 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x i64> [[TMP2]], i64 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x i64> [[TMP3]], i64 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x i64> [[TMP4]], i64 4, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 5, i32 5
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> [[TMP6]], i32 [[SEL2]]
; CHECK-NEXT:    store i64 [[TMP1]], ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  %c1 = icmp uge i32 %x, 3
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  %gep.02 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
  %gep.10 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  %gep.11 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
  %gep.12 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
  store i64 0, ptr addrspace(5) %gep.00
  store i64 1, ptr addrspace(5) %gep.01
  store i64 2, ptr addrspace(5) %gep.02
  store i64 3, ptr addrspace(5) %gep.10
  store i64 4, ptr addrspace(5) %gep.11
  store i64 5, ptr addrspace(5) %gep.12
  %gep = getelementptr inbounds [6 x i64], ptr addrspace(5) %alloca, i32 0, i32 %sel2
  %load = load i64, ptr addrspace(5) %gep
  store i64 %load, ptr %out
  ret void
}

; Each row of a [2 x [3 x i64]] alloca is written with one <3 x i64> store and
; row 1 is read back with a <3 x i64> load at a constant index; the checks
; expect the loaded subvector to fold to the constant <3, 4, 5>.
define amdgpu_kernel void @i64_2d_load_store_subvec_1(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_1(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x i64> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i64> [[ALLOCA]], i64 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP1]], i64 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x i64> [[TMP2]], i64 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x i64> [[TMP3]], i64 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x i64> [[TMP4]], i64 4, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 5, i32 5
; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> <i64 3, i64 4, i64 5>, i32 2
; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1
  %load = load <3 x i64>, ptr addrspace(5) %gep
  %elem = extractelement <3 x i64> %load, i32 2
  store i64 %elem, ptr %out
  ret void
}

; Like @i64_2d_load_store_subvec_1 but the alloca is an array of <3 x i64>
; vectors. The checks expect an <8 x i64> vector in which the second row starts
; at lane 4 (lanes 3 and 7 are never written).
define amdgpu_kernel void @i64_2d_load_store_subvec_2(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_2(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <8 x i64> poison
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i64> [[ALLOCA]], i64 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i64> [[TMP1]], i64 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i64> [[TMP2]], i64 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i64> [[TMP3]], i64 3, i32 4
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i64> [[TMP4]], i64 4, i32 5
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 5, i32 6
; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> <i64 3, i64 4, i64 5>, i32 2
; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %alloca = alloca [2 x <3 x i64>], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0
  %gep.01 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
  %gep = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1
  %load = load <3 x i64>, ptr addrspace(5) %gep
  %elem = extractelement <3 x i64> %load, i32 2
  store i64 %elem, ptr %out
  ret void
}

; Subvector load from a dynamically selected row of a [2 x [3 x i64]] alloca.
; The checks expect the row index to be scaled by 3 and the <3 x i64> result to
; be assembled from three extractelement/insertelement pairs.
define amdgpu_kernel void @i64_2d_load_store_subvec_3(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_3(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x i64> poison
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <6 x i64> [[ALLOCA]], i64 0, i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <6 x i64> [[TMP10]], i64 1, i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <6 x i64> [[TMP11]], i64 2, i32 2
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 3, i32 3
; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 4, i32 4
; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 5, i32 5
; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[SEL2]], 3
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP1]], 2
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  %c1 = icmp uge i32 %x, 3
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 %sel2
  %load = load <3 x i64>, ptr addrspace(5) %gep
  %elem = extractelement <3 x i64> %load, i32 2
  store i64 %elem, ptr %out
  ret void
}

; Variant of @i64_2d_load_store_subvec_3 whose GEP indices are i64 (the dynamic
; index is zero-extended first); the checks expect the index arithmetic to be
; carried out in i64.
define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
; CHECK-NEXT:    [[SEL3:%.*]] = zext i32 [[SEL2]] to i64
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x i64> poison
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <6 x i64> [[ALLOCA]], i64 0, i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <6 x i64> [[TMP10]], i64 1, i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <6 x i64> [[TMP11]], i64 2, i32 2
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 3, i32 3
; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 4, i32 4
; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 5, i32 5
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], 1
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP1]], 2
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  %c1 = icmp uge i32 %x, 3
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
  %sel3 = zext i32 %sel2 to i64
  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 0, i64 %sel3
  %load = load <3 x i64>, ptr addrspace(5) %gep
  %elem = extractelement <3 x i64> %load, i32 2
  store i64 %elem, ptr %out
  ret void
}

; Variant of @i64_2d_load_store_subvec_3_i64_offset whose load GEP has a
; leading index of 1; the checks expect a constant 6 to be added to the scaled
; dynamic index.
; NOTE(review): the resulting lane index is always >= 6, i.e. past the end of
; the <6 x i64> vector - presumably intentional to cover a non-zero leading
; index; confirm.
define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
; CHECK-NEXT:    [[SEL3:%.*]] = zext i32 [[SEL2]] to i64
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x i64> poison
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <6 x i64> [[ALLOCA]], i64 0, i32 0
; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <6 x i64> [[TMP11]], i64 1, i32 1
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 2, i32 2
; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 3, i32 3
; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
; CHECK-NEXT:    [[TMP2:%.*]] = add i64 6, [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP2]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP2]], 2
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  %c1 = icmp uge i32 %x, 3
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
  %sel3 = zext i32 %sel2 to i64
  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 1, i64 %sel3
  %load = load <3 x i64>, ptr addrspace(5) %gep
  %elem = extractelement <3 x i64> %load, i32 2
  store i64 %elem, ptr %out
  ret void
}

; Dynamic-row subvector load from a [2 x <3 x i64>] alloca; the checks expect
; an <8 x i64> vector and the row index to be scaled by 4 rather than 3.
define amdgpu_kernel void @i64_2d_load_store_subvec_4(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_4(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <8 x i64> poison
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i64> [[ALLOCA]], i64 0, i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i64> [[TMP10]], i64 1, i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <8 x i64> [[TMP11]], i64 2, i32 2
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i64> [[TMP12]], i64 3, i32 4
; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i64> [[TMP13]], i64 4, i32 5
; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i64> [[TMP14]], i64 5, i32 6
; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[SEL2]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> [[TMP15]], i32 [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[TMP15]], i32 [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP1]], 2
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> [[TMP15]], i32 [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  %c1 = icmp uge i32 %x, 3
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
  %alloca = alloca [2 x <3 x i64>], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0
  %gep.01 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
  %gep = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 %sel2
  %load = load <3 x i64>, ptr addrspace(5) %gep
  %elem = extractelement <3 x i64> %load, i32 2
  store i64 %elem, ptr %out
  ret void
}

; Three-dimensional [2 x [2 x [3 x i32]]] alloca filled element by element and
; read through a GEP that views it as a flat [12 x i32] with a dynamic index;
; the checks expect a <12 x i32> vector and a direct extractelement.
define amdgpu_kernel void @i32_3d_load_store(ptr %out) {
; CHECK-LABEL: define amdgpu_kernel void @i32_3d_load_store(
; CHECK-SAME: ptr [[OUT:%.*]]) {
; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <12 x i32> poison
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <12 x i32> [[ALLOCA]], i32 0, i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <12 x i32> [[TMP13]], i32 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <12 x i32> [[TMP2]], i32 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <12 x i32> [[TMP3]], i32 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <12 x i32> [[TMP4]], i32 4, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <12 x i32> [[TMP5]], i32 5, i32 5
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <12 x i32> [[TMP6]], i32 6, i32 6
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <12 x i32> [[TMP7]], i32 7, i32 7
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <12 x i32> [[TMP8]], i32 8, i32 8
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <12 x i32> [[TMP9]], i32 9, i32 9
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <12 x i32> [[TMP10]], i32 10, i32 10
; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <12 x i32> [[TMP11]], i32 11, i32 11
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <12 x i32> [[TMP12]], i32 [[SEL2]]
; CHECK-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
; CHECK-NEXT:    ret void
;
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  %c1 = icmp uge i32 %x, 3
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
  %alloca = alloca [2 x [2 x [3 x i32]]], align 16, addrspace(5)
  %gep.000 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 0
  %gep.001 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 1
  %gep.002 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 2
  %gep.010 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 0
  %gep.011 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 1
  %gep.012 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 2
  %gep.100 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 0
  %gep.101 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 1
  %gep.102 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 2
  %gep.110 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 0
  %gep.111 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 1
  %gep.112 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 2
  store i32 0, ptr addrspace(5) %gep.000
  store i32 1, ptr addrspace(5) %gep.001
  store i32 2, ptr addrspace(5) %gep.002
  store i32 3, ptr addrspace(5) %gep.010
  store i32 4, ptr addrspace(5) %gep.011
  store i32 5, ptr addrspace(5) %gep.012
  store i32 6, ptr addrspace(5) %gep.100
  store i32 7, ptr addrspace(5) %gep.101
  store i32 8, ptr addrspace(5) %gep.102
  store i32 9, ptr addrspace(5) %gep.110
  store i32 10, ptr addrspace(5) %gep.111
  store i32 11, ptr addrspace(5) %gep.112
  %gep = getelementptr inbounds [12 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
  %load = load i32, ptr addrspace(5) %gep
  store i32 %load, ptr %out
  ret void
}

; i16 elements with a dynamic column index inside constant row 1; the checks
; expect the lane index to be computed as 3 + %sel.
define amdgpu_kernel void @i16_2d_load_store(ptr %out, i32 %sel) {
; CHECK-LABEL: define amdgpu_kernel void @i16_2d_load_store(
; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x i16> poison
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <6 x i16> [[ALLOCA]], i16 0, i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <6 x i16> [[TMP7]], i16 1, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x i16> [[TMP8]], i16 2, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x i16> [[TMP3]], i16 3, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x i16> [[TMP4]], i16 4, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x i16> [[TMP5]], i16 5, i32 5
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 3, [[SEL]]
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i16> [[TMP6]], i32 [[TMP1]]
; CHECK-NEXT:    store i16 [[TMP2]], ptr [[OUT]], align 2
; CHECK-NEXT:    ret void
;
  %alloca = alloca [2 x [3 x i16]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  %gep.02 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
  %gep.10 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  %gep.11 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
  %gep.12 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
  store i16 0, ptr addrspace(5) %gep.00
  store i16 1, ptr addrspace(5) %gep.01
  store i16 2, ptr addrspace(5) %gep.02
  store i16 3, ptr addrspace(5) %gep.10
  store i16 4, ptr addrspace(5) %gep.11
  store i16 5, ptr addrspace(5) %gep.12
  %gep = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
  %load = load i16, ptr addrspace(5) %gep
  store i16 %load, ptr %out
  ret void
}

; Same access pattern as @i16_2d_load_store with float elements.
define amdgpu_kernel void @float_2d_load_store(ptr %out, i32 %sel) {
; CHECK-LABEL: define amdgpu_kernel void @float_2d_load_store(
; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x float> poison
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <6 x float> [[ALLOCA]], float 0.000000e+00, i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <6 x float> [[TMP7]], float 1.000000e+00, i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x float> [[TMP8]], float 2.000000e+00, i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x float> [[TMP3]], float 3.000000e+00, i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x float> [[TMP4]], float 4.000000e+00, i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x float> [[TMP5]], float 5.000000e+00, i32 5
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 3, [[SEL]]
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x float> [[TMP6]], i32 [[TMP1]]
; CHECK-NEXT:    store float [[TMP2]], ptr [[OUT]], align 4
; CHECK-NEXT:    ret void
;
  %alloca = alloca [2 x [3 x float]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  %gep.02 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
  %gep.10 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  %gep.11 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
  %gep.12 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
  store float 0.0, ptr addrspace(5) %gep.00
  store float 1.0, ptr addrspace(5) %gep.01
  store float 2.0, ptr addrspace(5) %gep.02
  store float 3.0, ptr addrspace(5) %gep.10
  store float 4.0, ptr addrspace(5) %gep.11
  store float 5.0, ptr addrspace(5) %gep.12
  %gep = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
  %load = load float, ptr addrspace(5) %gep
  store float %load, ptr %out
  ret void
}

; Same access pattern as @i16_2d_load_store with pointer elements; the stored
; values are GEPs off %out, and the checks expect a <6 x ptr> vector.
define amdgpu_kernel void @ptr_2d_load_store(ptr %out, i32 %sel) {
; CHECK-LABEL: define amdgpu_kernel void @ptr_2d_load_store(
; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <6 x ptr> poison
; CHECK-NEXT:    [[PTR_0:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 0
; CHECK-NEXT:    [[PTR_1:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 1
; CHECK-NEXT:    [[PTR_2:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 2
; CHECK-NEXT:    [[PTR_3:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 3
; CHECK-NEXT:    [[PTR_4:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 4
; CHECK-NEXT:    [[PTR_5:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 5
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x ptr> [[ALLOCA]], ptr [[PTR_0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x ptr> [[TMP1]], ptr [[PTR_1]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x ptr> [[TMP2]], ptr [[PTR_2]], i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x ptr> [[TMP3]], ptr [[PTR_3]], i32 3
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x ptr> [[TMP4]], ptr [[PTR_4]], i32 4
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x ptr> [[TMP5]], ptr [[PTR_5]], i32 5
; CHECK-NEXT:    [[TMP7:%.*]] = add i32 3, [[SEL]]
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x ptr> [[TMP6]], i32 [[TMP7]]
; CHECK-NEXT:    store ptr [[TMP8]], ptr [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5)
  %gep.00 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
  %gep.01 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
  %gep.02 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
  %gep.10 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
  %gep.11 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
  %gep.12 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
  %ptr.0 = getelementptr inbounds ptr, ptr %out, i32 0
  %ptr.1 = getelementptr inbounds ptr, ptr %out, i32 1
  %ptr.2 = getelementptr inbounds ptr, ptr %out, i32 2
  %ptr.3 = getelementptr inbounds ptr, ptr %out, i32 3
  %ptr.4 = getelementptr inbounds ptr, ptr %out, i32 4
  %ptr.5 = getelementptr inbounds ptr, ptr %out, i32 5
  store ptr %ptr.0, ptr addrspace(5) %gep.00
  store ptr %ptr.1, ptr addrspace(5) %gep.01
  store ptr %ptr.2, ptr addrspace(5) %gep.02
  store ptr %ptr.3, ptr addrspace(5) %gep.10
  store ptr %ptr.4, ptr addrspace(5) %gep.11
  store ptr %ptr.5, ptr addrspace(5) %gep.12
  %gep = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
  %load = load ptr, ptr addrspace(5) %gep
  store ptr %load, ptr %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()