; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s

;; Both RUN lines invoke the amdgpu-lower-buffer-fat-pointers pass for gfx900,
;; once through its direct flag and once through -passes=, and both feed the
;; default CHECK prefix, so the two invocations must produce identical output.

target triple = "amdgcn--"

;; memcpy
;;
;; Declarations of llvm.memcpy overloads. Operands are
;; (dest, src, length, isvolatile); the name suffix encodes the dest address
;; space, the src address space and the length type, in that order.

declare void @llvm.memcpy.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1)
declare void @llvm.memcpy.p1.p7.i32(ptr addrspace(1), ptr addrspace(7), i32, i1)
declare void @llvm.memcpy.p7.p1.i32(ptr addrspace(7), ptr addrspace(1), i32, i1)
declare void @llvm.memcpy.p7.p7.i64(ptr addrspace(7), ptr addrspace(7), i64, i1)
declare void @llvm.memcpy.p3.p7.i32(ptr addrspace(3), ptr addrspace(7), i32, i1)

;; Constant 8192-byte, non-volatile copy between two ptr addrspace(7) values,
;; with both operands marked `align 16`. The assertions expect each
;; ptr addrspace(7) argument to become a { ptr addrspace(8), i32 } pair and the
;; copy to become a single LOAD_STORE_LOOP that advances by 256 bytes per
;; iteration: sixteen v4i32 raw buffer loads followed by sixteen v4i32 raw
;; buffer stores, repeated until the index reaches 8192.
;; NOTE(review): the shufflevector CHECK lines of this function carry no mask
;; operand after the `<64 x i32>` type -- it looks like the masks were lost from
;; this copy. Regenerate with utils/update_test_checks.py rather than patching
;; the assertions by hand.
define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) {
; CHECK-LABEL: define void @memcpy_known(
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:    [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0
; CHECK-NEXT:    [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT:    [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT:    [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
; CHECK-NEXT:    br label %[[LOAD_STORE_LOOP:.*]]
; CHECK:       [[LOAD_STORE_LOOP]]:
; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]]
; CHECK-NEXT:    [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT:    [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32>
; CHECK-NEXT:    [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32>
; CHECK-NEXT:    [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16
; CHECK-NEXT:    [[DOTOFF_16:%.*]] = call <4 x
i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 ; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 ; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 ; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 ; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> ; 
CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 ; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 ; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 ; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 ; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 ; CHECK-NEXT: 
[[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 ; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 ; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 ; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 ; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_56:%.*]] = 
shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 ; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 ; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 ; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 ; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 ; CHECK-NEXT: 
[[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 ; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 ; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 ; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 ; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 ; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 ; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x 
i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 ; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 ; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 ; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void 
@llvm.memcpy.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false)
  ret void
}

;; Constant 32-byte, non-volatile copy with no alignment attributes on the
;; operands. The assertions expect no loop: two v4i32 raw buffer load/store
;; pairs at byte offsets 0 and 16, each marked `align 1`.
define void @memcpy_known_small(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) {
; CHECK-LABEL: define void @memcpy_known_small(
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0
; CHECK-NEXT:    [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT:    [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT:    [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0)
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0)
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0)
; CHECK-NEXT:    [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0)
; CHECK-NEXT:    ret void
;
  call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 false)
  ret void
}

;; Constant 1-byte, non-volatile copy. The assertions expect exactly one i8 raw
;; buffer load and one i8 raw buffer store at the incoming offsets.
define void @memcpy_known_byte(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) {
; CHECK-LABEL: define void @memcpy_known_byte(
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0
; CHECK-NEXT:    [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT:    [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT:    [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0)
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0)
; CHECK-NEXT:    ret void
;
  call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 1, i1 false)
  ret void
}

;; Constant 15-byte, non-volatile copy. The assertions expect the copy to be
;; split into i64, i32, i16 and i8 load/store pairs at byte offsets 0, 8, 12
;; and 14 respectively, with no loop.
define void @memcpy_known_tail(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) {
; CHECK-LABEL: define void @memcpy_known_tail(
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0
; CHECK-NEXT:    [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT:    [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT:    [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0)
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0)
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 8
; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0)
; CHECK-NEXT:    [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 8
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0)
; CHECK-NEXT:    [[TMP5:%.*]] = add nuw i32 [[SRC_OFF]], 12
; CHECK-NEXT:    [[TMP6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP5]], i32 0, i32 0)
; CHECK-NEXT:    [[TMP7:%.*]] = add nuw i32 [[DST_OFF]], 12
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[TMP6]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP7]], i32 0, i32 0)
; CHECK-NEXT:    [[TMP8:%.*]] = add nuw i32 [[SRC_OFF]], 14
; CHECK-NEXT:    [[TMP9:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP8]], i32 0, i32 0)
; CHECK-NEXT:    [[TMP10:%.*]] = add nuw i32 [[DST_OFF]], 14
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP9]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP10]], i32 0, i32 0)
; CHECK-NEXT:    ret void
;
  call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 15, i1 false)
  ret void
}

;; Constant 8192-byte, non-volatile copy whose length operand is i64 (the .i64
;; overload), with no alignment attributes on the operands. The assertions
;; expect a LOAD_STORE_LOOP with an i64 induction variable that is truncated to
;; i32 before being added to the 32-bit source and destination offsets; the
;; buffer accesses are marked `align 1`.
define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) {
; CHECK-LABEL: define void @memcpy_known_i64(
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0
; CHECK-NEXT:    [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT:    [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT:    [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
; CHECK-NEXT:    br label %[[LOAD_STORE_LOOP:.*]]
; CHECK:       [[LOAD_STORE_LOOP]]:
; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ]
; CHECK-NEXT:    [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]]
; CHECK-NEXT:    [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT:    [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32>
;
CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 ; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 ; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 ; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 ; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> 
@llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 ; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 ; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 ; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 ; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, 
<64 x i32> ; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 ; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 ; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 ; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 ; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 ; 
CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 ; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> ; CHECK-NEXT: [[LOOP_INDEX_C1:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX_C1]] ; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 ; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 ; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 ; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> 
poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 ; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 ; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 ; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 ; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 ; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 ; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 ; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 ; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 ; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 ; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; 
CHECK-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
; CHECK:       [[MEMCPY_SPLIT]]:
; CHECK-NEXT:    ret void
;
  call void @llvm.memcpy.p7.p7.i64(ptr addrspace(7) %dst, ptr addrspace(7) %src, i64 8192, i1 false)
  ret void
}

;; Constant 32-byte copy with isvolatile = true. The assertions expect two
;; v4i32 raw buffer load/store pairs at byte offsets 0 and 16, where the final
;; i32 operand of every buffer load and store is -2147483648 (0x80000000)
;; rather than 0.
;; NOTE(review): that operand value is presumably the volatile bit of the
;; buffer intrinsics' auxiliary operand -- confirm against the AMDGPU buffer
;; intrinsic documentation.
define void @memcpy_known_i32_volatile(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) {
; CHECK-LABEL: define void @memcpy_known_i32_volatile(
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0
; CHECK-NEXT:    [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1
; CHECK-NEXT:    [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0
; CHECK-NEXT:    [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 -2147483648)
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 -2147483648)
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
; CHECK-NEXT:    [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16
; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 -2147483648)
; CHECK-NEXT:    ret void
;
  call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 true)
  ret void
}

define void @memcpy_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst, i32 inreg %length) {
; CHECK-LABEL: define void
@memcpy_unknown( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[LENGTH]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER:.*]] ; CHECK: [[LOOP_MEMCPY_EXPANSION]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0) ; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]] ; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ; CHECK: [[LOOP_MEMCPY_RESIDUAL:.*]]: ; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP13:%.*]], %[[LOOP_MEMCPY_RESIDUAL]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC_OFF]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP10]], i32 0, i32 
0) ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[DST_OFF]], [[TMP9]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP11]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP12]], i32 0, i32 0) ; CHECK-NEXT: [[TMP13]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP14]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] ; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: ; CHECK-NEXT: ret void ; CHECK: [[LOOP_MEMCPY_RESIDUAL_HEADER]]: ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 %length, i1 false) ret void } define void @memcpy_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrspace(7) inreg %dst) { ; CHECK-LABEL: define void @memcpy_known_p1_to_p7( ; CHECK-SAME: ptr addrspace(1) inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] ; CHECK: [[LOAD_STORE_LOOP]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 ; CHECK-NEXT: 
[[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 ; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 ; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 ; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 ; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 ; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 ; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> 
[[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 ; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 ; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 ; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 ; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 ; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 ; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; 
CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 ; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p7.p1.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i32 8192, i1 false) ret void } define void @memcpy_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1) inreg %dst) { ; CHECK-LABEL: define void @memcpy_known_p7_to_p1( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(1) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] ; CHECK: [[LOAD_STORE_LOOP]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 
[[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 ; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 ; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 ; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> 
[[DOTEXT_16]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 ; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 ; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 ; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 ; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 ; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 
[[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 ; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 ; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 ; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 ; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_52:%.*]] = 
shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 ; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 ; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p1.p7.i32(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) ret void } ;; This could be the direct-to-LDS intrinsics in a future patch define void @memcpy_known_p7_to_p3(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { ; CHECK-LABEL: define void @memcpy_known_p7_to_p3( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = 
extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 0 ; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr addrspace(3) [[TMP2]], align 16 ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 16, i1 false) ret void } define void @memcpy_known_p7_to_p3_byte(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { ; CHECK-LABEL: define void @memcpy_known_p7_to_p3_byte( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 0 ; CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(3) [[TMP2]], align 16 ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 4, i1 false) ret void } define void @memcpy_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { ; CHECK-LABEL: define void @memcpy_known_p7_to_p3_long( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] ; CHECK: [[LOAD_STORE_LOOP]]: ; CHECK-NEXT: 
[[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 ; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 ; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 ; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr 
addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 ; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 ; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 ; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 ; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_32:%.*]] 
= shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 ; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 ; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 ; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 ; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 ; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> 
@llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 ; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 ; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) ret void } ;; memcpy.inline declare void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7), ptr addrspace(7), i32, i1) declare void @llvm.memcpy.inline.p1.p7.i32(ptr addrspace(1), ptr addrspace(7), 
i32, i1) declare void @llvm.memcpy.inline.p7.p1.i32(ptr addrspace(7), ptr addrspace(1), i32, i1) declare void @llvm.memcpy.inline.p7.p7.i64(ptr addrspace(7), ptr addrspace(7), i64, i1) declare void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3), ptr addrspace(7), i32, i1) define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] ; CHECK: [[LOAD_STORE_LOOP]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 ; CHECK-NEXT: 
[[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 ; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 ; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 ; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 ; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], 
<4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 ; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 ; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 ; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 ; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 
[[TMP1]], 176 ; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 ; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 ; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 ; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 ; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) ; CHECK-NEXT: 
[[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 ; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 ; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 ; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 ; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 ; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 ; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 ; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 ; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 ; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 ; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 ; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x 
i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 ; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 ; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) ret void } define void @memcpy.inline_known_small(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_small( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: 
[[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0) ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 false) ret void } define void @memcpy.inline_known_byte(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_byte( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP1]], ptr 
addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 1, i1 false) ret void } define void @memcpy.inline_known_tail(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_tail( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 8 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 0) ; CHECK-NEXT: [[TMP5:%.*]] = add nuw i32 [[SRC_OFF]], 12 ; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP5]], i32 0, i32 0) ; CHECK-NEXT: [[TMP7:%.*]] = add nuw i32 [[DST_OFF]], 12 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[TMP6]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP7]], i32 0, i32 0) ; CHECK-NEXT: [[TMP8:%.*]] = add nuw i32 [[SRC_OFF]], 14 ; CHECK-NEXT: [[TMP9:%.*]] = call i8 
@llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP8]], i32 0, i32 0) ; CHECK-NEXT: [[TMP10:%.*]] = add nuw i32 [[DST_OFF]], 14 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP9]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP10]], i32 0, i32 0) ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 15, i1 false) ret void } define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_i64( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] ; CHECK: [[LOAD_STORE_LOOP]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] ; CHECK-NEXT: [[LOOP_INDEX_C:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX_C]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> 
[[DOTOFF_16]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 ; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 ; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 ; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 ; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 ; 
CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 ; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 ; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 ; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 ; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_40:%.*]] = 
shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 ; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 ; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 ; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 ; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> ; CHECK-NEXT: 
[[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 ; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> ; CHECK-NEXT: [[LOOP_INDEX_C1:%.*]] = trunc i64 [[LOOP_INDEX]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX_C1]] ; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 ; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 ; CHECK-NEXT: [[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 ; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 ; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 ; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 ; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 ; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 ; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 ; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 ; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> 
[[DOTSLICE_40]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 ; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 ; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 ; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i64(ptr addrspace(7) %dst, ptr addrspace(7) %src, i64 8192, i1 false) ret void } define void 
@memcpy.inline_known_i32_volatile(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_i32_volatile( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 -2147483648) ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP1]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DST_OFF]], i32 0, i32 -2147483648) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[SRC_OFF]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) ; CHECK-NEXT: [[TMP4:%.*]] = add nuw i32 [[DST_OFF]], 16 ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP3]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP4]], i32 0, i32 -2147483648) ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 32, i1 true) ret void } define void @memcpy.inline_unknown(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %dst, i32 inreg %length) { ; CHECK-LABEL: define void @memcpy.inline_unknown( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: 
[[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15 ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[LENGTH]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER:.*]] ; CHECK: [[LOOP_MEMCPY_EXPANSION]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP4]], i32 0, i32 0) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[TMP5]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP6]], i32 0, i32 0) ; CHECK-NEXT: [[TMP7]] = add i32 [[LOOP_INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], [[TMP2]] ; CHECK-NEXT: br i1 [[TMP8]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ; CHECK: [[LOOP_MEMCPY_RESIDUAL:.*]]: ; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP13:%.*]], %[[LOOP_MEMCPY_RESIDUAL]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP2]], [[RESIDUAL_LOOP_INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC_OFF]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[SRC_RSRC]], i32 [[TMP10]], i32 0, i32 0) ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[DST_OFF]], [[TMP9]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[TMP11]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[TMP12]], i32 0, i32 0) ; CHECK-NEXT: [[TMP13]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], [[TMP1]] ; 
CHECK-NEXT: br i1 [[TMP14]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]] ; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]: ; CHECK-NEXT: ret void ; CHECK: [[LOOP_MEMCPY_RESIDUAL_HEADER]]: ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label %[[LOOP_MEMCPY_RESIDUAL]], label %[[POST_LOOP_MEMCPY_EXPANSION]] ; call void @llvm.memcpy.inline.p7.p7.i32(ptr addrspace(7) %dst, ptr addrspace(7) %src, i32 %length, i1 false) ret void } define void @memcpy.inline_known_p1_to_p7(ptr addrspace(1) inreg %src, ptr addrspace(7) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_p1_to_p7( ; CHECK-SAME: ptr addrspace(1) inreg [[SRC:%.*]], { ptr addrspace(8), i32 } inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DST_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 0 ; CHECK-NEXT: [[DST_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[DST]], 1 ; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] ; CHECK: [[LOAD_STORE_LOOP]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DST_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTSLICE_0:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_0]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP3]], 16 ; CHECK-NEXT: [[DOTSLICE_4:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_4]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP3]], 32 ; CHECK-NEXT: 
[[DOTSLICE_8:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_8]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP3]], 48 ; CHECK-NEXT: [[DOTSLICE_12:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_12]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP3]], 64 ; CHECK-NEXT: [[DOTSLICE_16:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_16]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP3]], 80 ; CHECK-NEXT: [[DOTSLICE_20:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_20]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP3]], 96 ; CHECK-NEXT: [[DOTSLICE_24:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_24]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP3]], 112 ; CHECK-NEXT: [[DOTSLICE_28:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_28]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP3]], 128 ; CHECK-NEXT: [[DOTSLICE_32:%.*]] = shufflevector <64 x i32> 
[[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_32]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP3]], 144 ; CHECK-NEXT: [[DOTSLICE_36:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_36]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP3]], 160 ; CHECK-NEXT: [[DOTSLICE_40:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_40]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP3]], 176 ; CHECK-NEXT: [[DOTSLICE_44:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_44]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP3]], 192 ; CHECK-NEXT: [[DOTSLICE_48:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_48]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP3]], 208 ; CHECK-NEXT: [[DOTSLICE_52:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_52]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP3]], 224 ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; 
CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 ; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p7.p1.i32(ptr addrspace(7) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i32 8192, i1 false) ret void } ;; memcpy.inline of a constant 8192 bytes from a buffer fat pointer (addrspace(7)) into addrspace(1), both operands align 16. Expected lowering: per iteration, sixteen v4i32 raw buffer loads (offsets 0 through 240 in steps of 16) are merged by shufflevector into one <64 x i32> that is stored to the destination, and the index advances by 256 until it reaches 8192. define void @memcpy.inline_known_p7_to_p1(ptr addrspace(7) inreg %src, ptr addrspace(1) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_p7_to_p1( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(1) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] ; CHECK: [[LOAD_STORE_LOOP]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 
16 ; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 ; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 ; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 ; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> [[DOTEXT_16]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 ; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> 
[[DOTOFF_80]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 ; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 ; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 ; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 ; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add 
nuw i32 [[TMP1]], 160 ; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 ; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 ; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 ; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_52:%.*]] = shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 ; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) ; 
CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 ; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p1.p7.i32(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) ret void } ;; This could be the direct-to-LDS intrinsics in a future patch ;; Constant 16-byte memcpy.inline from a buffer fat pointer (addrspace(7)) into addrspace(3), both operands align 16: the checks expect one v4i32 raw buffer load followed by one <4 x i32> store, with no loop. define void @memcpy.inline_known_p7_to_p3(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_p7_to_p3( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 0 ; 
CHECK-NEXT: store <4 x i32> [[TMP1]], ptr addrspace(3) [[TMP2]], align 16 ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 16, i1 false) ret void } ;; Constant 4-byte memcpy.inline from a buffer fat pointer (addrspace(7)) into addrspace(3): the checks expect one i32 raw buffer load followed by one i32 store, with no loop. NOTE(review): the "_byte" suffix suggests a 1-byte copy, but the call passes i32 4 - confirm this length is intended. define void @memcpy.inline_known_p7_to_p3_byte(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_p7_to_p3_byte( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[SRC_OFF]], i32 0, i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 0 ; CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(3) [[TMP2]], align 16 ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 4, i1 false) ret void } ;; Constant 8192-byte memcpy.inline from a buffer fat pointer (addrspace(7)) into addrspace(3), both operands align 16: the checks expect a loop that gathers sixteen v4i32 raw buffer loads into one <64 x i32>, stores it to the destination, and advances the index by 256 until it reaches 8192. define void @memcpy.inline_known_p7_to_p3_long(ptr addrspace(7) inreg %src, ptr addrspace(3) inreg %dst) { ; CHECK-LABEL: define void @memcpy.inline_known_p7_to_p3_long( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[SRC:%.*]], ptr addrspace(3) inreg [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[SRC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 0 ; CHECK-NEXT: [[SRC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[SRC]], 1 ; CHECK-NEXT: br label %[[LOAD_STORE_LOOP:.*]] ; CHECK: [[LOAD_STORE_LOOP]]: ; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOAD_STORE_LOOP]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SRC_OFF]], [[LOOP_INDEX]] ; CHECK-NEXT: [[DOTOFF_0:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 
16 [[SRC_RSRC]], i32 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_0:%.*]] = shufflevector <4 x i32> [[DOTOFF_0]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_0:%.*]] = shufflevector <64 x i32> poison, <64 x i32> [[DOTEXT_0]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_16:%.*]] = add nuw i32 [[TMP1]], 16 ; CHECK-NEXT: [[DOTOFF_16:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_16]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_4:%.*]] = shufflevector <4 x i32> [[DOTOFF_16]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_4:%.*]] = shufflevector <64 x i32> [[DOTPARTS_0]], <64 x i32> [[DOTEXT_4]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_32:%.*]] = add nuw i32 [[TMP1]], 32 ; CHECK-NEXT: [[DOTOFF_32:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_32]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_8:%.*]] = shufflevector <4 x i32> [[DOTOFF_32]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_8:%.*]] = shufflevector <64 x i32> [[DOTPARTS_4]], <64 x i32> [[DOTEXT_8]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_48:%.*]] = add nuw i32 [[TMP1]], 48 ; CHECK-NEXT: [[DOTOFF_48:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_48]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_12:%.*]] = shufflevector <4 x i32> [[DOTOFF_48]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_12:%.*]] = shufflevector <64 x i32> [[DOTPARTS_8]], <64 x i32> [[DOTEXT_12]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_64:%.*]] = add nuw i32 [[TMP1]], 64 ; CHECK-NEXT: [[DOTOFF_64:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_64]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_16:%.*]] = shufflevector <4 x i32> [[DOTOFF_64]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_16:%.*]] = shufflevector <64 x i32> [[DOTPARTS_12]], <64 x i32> 
[[DOTEXT_16]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_80:%.*]] = add nuw i32 [[TMP1]], 80 ; CHECK-NEXT: [[DOTOFF_80:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_80]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_20:%.*]] = shufflevector <4 x i32> [[DOTOFF_80]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_20:%.*]] = shufflevector <64 x i32> [[DOTPARTS_16]], <64 x i32> [[DOTEXT_20]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_96:%.*]] = add nuw i32 [[TMP1]], 96 ; CHECK-NEXT: [[DOTOFF_96:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_96]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_24:%.*]] = shufflevector <4 x i32> [[DOTOFF_96]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_24:%.*]] = shufflevector <64 x i32> [[DOTPARTS_20]], <64 x i32> [[DOTEXT_24]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_112:%.*]] = add nuw i32 [[TMP1]], 112 ; CHECK-NEXT: [[DOTOFF_112:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_112]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_28:%.*]] = shufflevector <4 x i32> [[DOTOFF_112]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_28:%.*]] = shufflevector <64 x i32> [[DOTPARTS_24]], <64 x i32> [[DOTEXT_28]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_128:%.*]] = add nuw i32 [[TMP1]], 128 ; CHECK-NEXT: [[DOTOFF_128:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_128]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_32:%.*]] = shufflevector <4 x i32> [[DOTOFF_128]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_32:%.*]] = shufflevector <64 x i32> [[DOTPARTS_28]], <64 x i32> [[DOTEXT_32]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_144:%.*]] = add nuw i32 [[TMP1]], 144 ; CHECK-NEXT: [[DOTOFF_144:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 
[[SRC_RSRC]], i32 [[DOTOFF_PTR_144]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_36:%.*]] = shufflevector <4 x i32> [[DOTOFF_144]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_36:%.*]] = shufflevector <64 x i32> [[DOTPARTS_32]], <64 x i32> [[DOTEXT_36]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_160:%.*]] = add nuw i32 [[TMP1]], 160 ; CHECK-NEXT: [[DOTOFF_160:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_160]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_40:%.*]] = shufflevector <4 x i32> [[DOTOFF_160]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_40:%.*]] = shufflevector <64 x i32> [[DOTPARTS_36]], <64 x i32> [[DOTEXT_40]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_176:%.*]] = add nuw i32 [[TMP1]], 176 ; CHECK-NEXT: [[DOTOFF_176:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_176]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_44:%.*]] = shufflevector <4 x i32> [[DOTOFF_176]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_44:%.*]] = shufflevector <64 x i32> [[DOTPARTS_40]], <64 x i32> [[DOTEXT_44]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_192:%.*]] = add nuw i32 [[TMP1]], 192 ; CHECK-NEXT: [[DOTOFF_192:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_192]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_48:%.*]] = shufflevector <4 x i32> [[DOTOFF_192]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_48:%.*]] = shufflevector <64 x i32> [[DOTPARTS_44]], <64 x i32> [[DOTEXT_48]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_208:%.*]] = add nuw i32 [[TMP1]], 208 ; CHECK-NEXT: [[DOTOFF_208:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_208]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_52:%.*]] = shufflevector <4 x i32> [[DOTOFF_208]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_52:%.*]] = 
shufflevector <64 x i32> [[DOTPARTS_48]], <64 x i32> [[DOTEXT_52]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_224:%.*]] = add nuw i32 [[TMP1]], 224 ; CHECK-NEXT: [[DOTOFF_224:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_224]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_56:%.*]] = shufflevector <4 x i32> [[DOTOFF_224]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[DOTPARTS_56:%.*]] = shufflevector <64 x i32> [[DOTPARTS_52]], <64 x i32> [[DOTEXT_56]], <64 x i32> ; CHECK-NEXT: [[DOTOFF_PTR_240:%.*]] = add nuw i32 [[TMP1]], 240 ; CHECK-NEXT: [[DOTOFF_240:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[SRC_RSRC]], i32 [[DOTOFF_PTR_240]], i32 0, i32 0) ; CHECK-NEXT: [[DOTEXT_60:%.*]] = shufflevector <4 x i32> [[DOTOFF_240]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[DOTPARTS_56]], <64 x i32> [[DOTEXT_60]], <64 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[LOOP_INDEX]] ; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] ; CHECK: [[MEMCPY_SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memcpy.inline.p3.p7.i32(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(7) noundef nonnull align 16 %src, i32 8192, i1 false) ret void } ;; memset declare void @llvm.memset.p7.i32(ptr addrspace(7), i8, i32, i1) declare void @llvm.memset.p7.i64(ptr addrspace(7), i8, i64, i1) ;; memset (i32 length) of a constant 8192 bytes with value 1 on a pointer marked align 16. The checks expect a loop with one i8 raw buffer store per iteration (still align 1) that repeats while the incremented counter is below 8192, entered through a constant "br i1 false" whose true edge goes to the exit block. define void @memset_known(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_known( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = 
extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i8 1, i32 8192, i1 false) ret void } ;; memset (i32 length) of a constant 32 bytes with value 1 and no alignment attribute on the pointer: the checks expect the one-i8-store-per-iteration loop, bounded by 32. define void @memset_known_small(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_known_small( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 false) ret void } ;; memset (i32 length) of a single byte with value 1: the checks still expect the loop form, with the back-edge compare against 1. define void @memset_known_byte(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_known_byte( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) 
#[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 1, i1 false) ret void } ;; memset (i32 length) of a constant 15 bytes with value 1: the checks expect the one-i8-store-per-iteration loop, bounded by 15. define void @memset_known_tail(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_known_tail( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 15 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 15, i1 false) ret void } ;; memset through the i64-length intrinsic, constant 8192 bytes with value 1: the checks expect an i64 loop counter that is truncated to i32 before being added to the buffer offset, with the increment and the bound compare done in i64. define void @memset_known_i64(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: 
define void @memset_known_i64( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTC]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i64(ptr addrspace(7) %ptr, i8 1, i64 8192, i1 false) ret void } ;; Volatile memset (final intrinsic argument i1 true) of a constant 32 bytes with value 1: the checks expect the byte-store loop with -2147483648 as the last operand of each buffer store, where the non-volatile tests expect 0. define void @memset_known_i32_volatile(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_known_i32_volatile( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret 
void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 true) ret void } ;; memset with a run-time i32 length and value 1: the checks expect an "icmp eq i32 0, length" guard that branches straight to the exit block, then the one-i8-store-per-iteration loop bounded by the length argument. define void @memset_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %length) { ; CHECK-LABEL: define void @memset_unknown( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]] ; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[TMP2]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 %length, i1 false) ret void } ;; memset.inline declare void @llvm.memset.inline.p7.i32(ptr addrspace(7), i8, i32, i1) declare void @llvm.memset.inline.p7.i64(ptr addrspace(7), i8, i64, i1) ;; memset.inline (i32 length) of a constant 8192 bytes with value 1 on a pointer marked align 16. The checks expect a loop with one i8 raw buffer store per iteration (still align 1) that repeats while the incremented counter is below 8192, entered through a constant "br i1 false" whose true edge goes to the exit block. define void @memset.inline_known(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset.inline_known( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 
[[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i8 1, i32 8192, i1 false) ret void } ;; memset.inline (i32 length) of a constant 32 bytes with value 1 and no alignment attribute on the pointer: the checks expect the one-i8-store-per-iteration loop, bounded by 32. define void @memset.inline_known_small(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset.inline_known_small( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 false) ret void } ;; memset.inline (i32 length) of a single byte with value 1: the checks still expect the loop form, with the back-edge compare against 1. define void @memset.inline_known_byte(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset.inline_known_byte( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label 
%[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 1, i1 false) ret void } ;; memset.inline (i32 length) of a constant 15 bytes with value 1: the checks expect the one-i8-store-per-iteration loop, bounded by 15. define void @memset.inline_known_tail(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset.inline_known_tail( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 15 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 15, i1 false) ret void } ;; memset.inline through the i64-length intrinsic, constant 8192 bytes with value 1: the checks expect an i64 loop counter that is truncated to i32 before being added to the buffer offset, with the increment and the bound compare done in i64. define void @memset.inline_known_i64(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset.inline_known_i64( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; 
CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTC]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i64(ptr addrspace(7) %ptr, i8 1, i64 8192, i1 false) ret void } define void @memset.inline_known_i32_volatile(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset.inline_known_i32_volatile( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 true) ret void } define void @memset.inline_unknown(ptr addrspace(7) inreg %ptr, 
i32 inreg %length) { ; CHECK-LABEL: define void @memset.inline_unknown( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]] ; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[TMP2]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 %length, i1 false) ret void } ;; memset.pattern declare void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7), i32, i32, i1) declare void @llvm.experimental.memset.pattern.p7.i32.i64(ptr addrspace(7), i32, i64, i1) define void @memset_pattern_known(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_pattern_known( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] ; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 4 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i32 1, i32 8192, i1 false) ret void } define void @memset_pattern_known_small(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_pattern_known_small( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 32, i1 false) ret void } define void @memset_pattern_known_i64(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_pattern_known_i64( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 
false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[DOTC]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0) ; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.experimental.memset.pattern.p7.i32.i64(ptr addrspace(7) %ptr, i32 1, i64 8192, i1 false) ret void } define void @memset_pattern_known_i32_volatile(ptr addrspace(7) inreg %ptr) { ; CHECK-LABEL: define void @memset_pattern_known_i32_volatile( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32 ; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 32, i1 true) ret void } define 
void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %length) { ; CHECK-LABEL: define void @memset_pattern_unknown( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]] ; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] ; CHECK: [[LOADSTORELOOP]]: ; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] ; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP2]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] ; CHECK: [[SPLIT]]: ; CHECK-NEXT: ret void ; call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false) ret void } ;;; Buffer load to LDS declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg) define void @llvm_amdgcn_load_to_lds(ptr addrspace(7) inreg %p, ptr addrspace(3) inreg %l, i32 %idx) { ; CHECK-LABEL: define void @llvm_amdgcn_load_to_lds( ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[P:%.*]], ptr addrspace(3) inreg [[L:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0 ; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1 ; CHECK-NEXT: [[Q:%.*]] = add i32 [[P_OFF]], [[IDX]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[P_RSRC]], ptr addrspace(3) [[L]], i32 4, i32 
[[Q]], i32 0, i32 16, i32 0) ; CHECK-NEXT: ret void ; %q = getelementptr i8, ptr addrspace(7) %p, i32 %idx call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %q, ptr addrspace(3) %l, i32 4, i32 16, i32 0) ret void }