aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/Transforms')
-rw-r--r--llvm/test/Transforms/InstCombine/add-sitofp.ll7
-rw-r--r--llvm/test/Transforms/InstCombine/binop-itofp.ll22
-rw-r--r--llvm/test/Transforms/InstCombine/ptrtoaddr.ll5
-rw-r--r--llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll103
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll539
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll312
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll9
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll9
-rw-r--r--llvm/test/Transforms/PGOProfile/data-access-profile.ll83
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll16
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll27
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll8
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll9
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll47
-rw-r--r--llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll14
17 files changed, 1083 insertions, 139 deletions
diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll
index fae1365..e1d39fd 100644
--- a/llvm/test/Transforms/InstCombine/add-sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll
@@ -99,12 +99,15 @@ define float @test_3(i32 %a, i32 %b) {
ret float %p
}
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @test_4(
; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 1073741823)
; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], splat (i32 1073741823)
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]]
-; CHECK-NEXT: [[RES:%.*]] = uitofp nneg <4 x i32> [[TMP1]] to <4 x double>
+; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg <4 x i32> [[A_AND]] to <4 x double>
+; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg <4 x i32> [[B_AND]] to <4 x double>
+; CHECK-NEXT: [[RES:%.*]] = fadd <4 x double> [[A_AND_FP]], [[B_AND_FP]]
; CHECK-NEXT: ret <4 x double> [[RES]]
;
; Drop two highest bits to guarantee that %a + %b doesn't overflow
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index 702bbbb..57184ea 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -1063,6 +1063,25 @@ define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345)
ret float %mul3.i.i
}
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
+define <2 x half> @test_ui_ui_i8_mul_vec(<2 x i8> noundef %x_in, <2 x i8> noundef %y_in) {
+; CHECK-LABEL: @test_ui_ui_i8_mul_vec(
+; CHECK-NEXT: [[X:%.*]] = and <2 x i8> [[X_IN:%.*]], splat (i8 15)
+; CHECK-NEXT: [[Y:%.*]] = and <2 x i8> [[Y_IN:%.*]], splat (i8 15)
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg <2 x i8> [[X]] to <2 x half>
+; CHECK-NEXT: [[YF:%.*]] = uitofp nneg <2 x i8> [[Y]] to <2 x half>
+; CHECK-NEXT: [[R:%.*]] = fmul <2 x half> [[XF]], [[YF]]
+; CHECK-NEXT: ret <2 x half> [[R]]
+;
+ %x = and <2 x i8> %x_in, splat (i8 15)
+ %y = and <2 x i8> %y_in, splat (i8 15)
+ %xf = uitofp <2 x i8> %x to <2 x half>
+ %yf = uitofp <2 x i8> %y to <2 x half>
+ %r = fmul <2 x half> %xf, %yf
+ ret <2 x half> %r
+}
+
define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_poison(i1 %c, i1 %.b, ptr %g_2345) {
; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_vec_w_poison(
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
@@ -1091,8 +1110,9 @@ define <2 x float> @nonzero_check_on_constant_for_si_fmul_nz_vec_w_poison(i1 %c,
; CHECK-NEXT: [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
; CHECK-NEXT: [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[MUL3_I_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT: [[MUL3_I_I1:%.*]] = fmul <2 x float> [[MUL3_I_I]], <float poison, float 1.000000e+00>
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
-; CHECK-NEXT: ret <2 x float> [[MUL3_I_I]]
+; CHECK-NEXT: ret <2 x float> [[MUL3_I_I1]]
;
%sel = select i1 %c, i32 65529, i32 53264
%conv.i.s = trunc i32 %sel to i16
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 7b0b152..ffaa8b1 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -23,10 +23,7 @@ define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A]] to i64
-; CHECK-NEXT: [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1)
-; CHECK-NEXT: [[TOADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[TOPTR]] to i32
-; CHECK-NEXT: ret i32 [[TOADDR]]
+; CHECK-NEXT: ret i32 [[A]]
;
%toptr = inttoptr i32 %a to ptr addrspace(1)
%toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 9ed2240..9357adf 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -273,3 +273,106 @@ loop:
exit:
ret void
}
+
+define void @ld_div2_ld_scevunknown_nonuniform(ptr %src.a, ptr noalias %src.b, ptr noalias %dst) {
+; CHECK-LABEL: define void @ld_div2_ld_scevunknown_nonuniform
+; CHECK-SAME: (ptr [[SRC_A:%.*]], ptr noalias [[SRC_B:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> poison, i64 [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 3
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 4
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 5
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 6
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 7
+; CHECK-NEXT: [[TMP32:%.*]] = udiv <8 x i64> [[TMP31]], splat (i64 2)
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i64> [[TMP32]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i64> [[TMP32]], i32 1
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i64> [[TMP32]], i32 2
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i64> [[TMP32]], i32 3
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i64> [[TMP32]], i32 4
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP41]]
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i64> [[TMP32]], i32 5
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP43]]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i64> [[TMP32]], i32 6
+; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i64> [[TMP32]], i32 7
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP34]], align 4
+; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP36]], align 4
+; CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP38]], align 4
+; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP40]], align 4
+; CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP42]], align 4
+; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP44]], align 4
+; CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP46]], align 4
+; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP48]], align 4
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <8 x i32> poison, i32 [[TMP49]], i32 0
+; CHECK-NEXT: [[TMP58:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP50]], i32 1
+; CHECK-NEXT: [[TMP59:%.*]] = insertelement <8 x i32> [[TMP58]], i32 [[TMP51]], i32 2
+; CHECK-NEXT: [[TMP60:%.*]] = insertelement <8 x i32> [[TMP59]], i32 [[TMP52]], i32 3
+; CHECK-NEXT: [[TMP61:%.*]] = insertelement <8 x i32> [[TMP60]], i32 [[TMP53]], i32 4
+; CHECK-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP61]], i32 [[TMP54]], i32 5
+; CHECK-NEXT: [[TMP63:%.*]] = insertelement <8 x i32> [[TMP62]], i32 [[TMP55]], i32 6
+; CHECK-NEXT: [[TMP64:%.*]] = insertelement <8 x i32> [[TMP63]], i32 [[TMP56]], i32 7
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: store <8 x i32> [[TMP64]], ptr [[TMP65]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.a = getelementptr i32, ptr %src.a, i64 %iv
+ %load.a = load i64, ptr %gep.a
+ %d = udiv i64 %load.a, 2
+ %gep.b = getelementptr i32, ptr %src.b, i64 %d
+ %load.b = load i32, ptr %gep.b
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv
+ store i32 %load.b, ptr %gep.dst
+ %iv.next = add i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv, 1000
+ br i1 %exit.cond, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
new file mode 100644
index 0000000..d281905
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:128:128' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR128
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
+
+; REQUIRES: aarch64-registered-target
+
+; See the comment in `data-layout.ll` for an explanation.
+
+target triple = "aarch64-unknown-unknown"
+
+define void @multiply(ptr %A, ptr %B, ptr %C) {
+; PTR128-LABEL: @multiply(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i128
+; PTR128-NEXT: [[STORE_END:%.*]] = add nuw nsw i128 [[STORE_BEGIN]], 128
+; PTR128-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i128
+; PTR128-NEXT: [[TMP0:%.*]] = icmp ugt i128 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR128-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR128: alias_cont:
+; PTR128-NEXT: [[LOAD_END:%.*]] = add nuw nsw i128 [[LOAD_BEGIN]], 128
+; PTR128-NEXT: [[TMP1:%.*]] = icmp ugt i128 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR128-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR128: copy:
+; PTR128-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR128-NEXT: br label [[NO_ALIAS]]
+; PTR128: no_alias:
+; PTR128-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR128-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i128
+; PTR128-NEXT: [[STORE_END5:%.*]] = add nuw nsw i128 [[STORE_BEGIN4]], 128
+; PTR128-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i128
+; PTR128-NEXT: [[TMP4:%.*]] = icmp ugt i128 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR128-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR128: alias_cont1:
+; PTR128-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i128 [[LOAD_BEGIN6]], 128
+; PTR128-NEXT: [[TMP5:%.*]] = icmp ugt i128 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR128-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR128: copy2:
+; PTR128-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR128-NEXT: br label [[NO_ALIAS3]]
+; PTR128: no_alias3:
+; PTR128-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
+; PTR128-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR128-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
+; PTR128-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR128-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR128-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR128-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR128-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
+; PTR128-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR128-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
+; PTR128-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR128-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
+; PTR128-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR128-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
+; PTR128-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR128-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR128-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR128-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR128-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR128-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i128 32
+; PTR128-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR128-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
+; PTR128-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR128-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
+; PTR128-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR128-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR128-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
+; PTR128-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR128-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR128-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR128-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR128-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
+; PTR128-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR128-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
+; PTR128-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR128-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
+; PTR128-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR128-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
+; PTR128-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR128-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR128-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR128-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR128-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i128 16
+; PTR128-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR128-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i128 48
+; PTR128-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR128-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR128-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
+; PTR128-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR128-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
+; PTR128-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR128-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
+; PTR128-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR128-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR128-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR128-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR128-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
+; PTR128-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR128-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
+; PTR128-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR128-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
+; PTR128-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR128-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
+; PTR128-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR128-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR128-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR128-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR128-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i128 64
+; PTR128-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR128-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i128 96
+; PTR128-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR128-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
+; PTR128-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR128-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
+; PTR128-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR128-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
+; PTR128-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR128-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
+; PTR128-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR128-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR128-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR128-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR128-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
+; PTR128-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR128-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
+; PTR128-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR128-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
+; PTR128-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR128-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
+; PTR128-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR128-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR128-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR128-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR128-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i128 80
+; PTR128-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR128-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i128 112
+; PTR128-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR128-NEXT: ret void
+;
+; PTR64-LABEL: @multiply(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
+; PTR64-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 128
+; PTR64-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; PTR64-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR64-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR64: alias_cont:
+; PTR64-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 128
+; PTR64-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR64-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR64: copy:
+; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT: br label [[NO_ALIAS]]
+; PTR64: no_alias:
+; PTR64-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR64-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i64
+; PTR64-NEXT: [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 128
+; PTR64-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i64
+; PTR64-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR64-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR64: alias_cont1:
+; PTR64-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 128
+; PTR64-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR64-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR64: copy2:
+; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT: br label [[NO_ALIAS3]]
+; PTR64: no_alias3:
+; PTR64-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR64-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR64-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR64-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR64-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR64-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR64-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR64-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR64-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR64-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR64-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i64 32
+; PTR64-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR64-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR64-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR64-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR64-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR64-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR64-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR64-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR64-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR64-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR64-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR64-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR64-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR64-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i64 16
+; PTR64-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR64-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i64 48
+; PTR64-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR64-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR64-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR64-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR64-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR64-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR64-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR64-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR64-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR64-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR64-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR64-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR64-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR64-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR64-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i64 64
+; PTR64-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR64-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i64 96
+; PTR64-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR64-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR64-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR64-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR64-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR64-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR64-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR64-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR64-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR64-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR64-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR64-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR64-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR64-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR64-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR64-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i64 80
+; PTR64-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR64-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
+; PTR64-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR64-NEXT: ret void
+;
+; PTR32-LABEL: @multiply(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i32
+; PTR32-NEXT: [[STORE_END:%.*]] = add nuw nsw i32 [[STORE_BEGIN]], 128
+; PTR32-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i32
+; PTR32-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR32-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR32: alias_cont:
+; PTR32-NEXT: [[LOAD_END:%.*]] = add nuw nsw i32 [[LOAD_BEGIN]], 128
+; PTR32-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR32-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR32: copy:
+; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT: br label [[NO_ALIAS]]
+; PTR32: no_alias:
+; PTR32-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR32-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i32
+; PTR32-NEXT: [[STORE_END5:%.*]] = add nuw nsw i32 [[STORE_BEGIN4]], 128
+; PTR32-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i32
+; PTR32-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR32-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR32: alias_cont1:
+; PTR32-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i32 [[LOAD_BEGIN6]], 128
+; PTR32-NEXT: [[TMP5:%.*]] = icmp ugt i32 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR32-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR32: copy2:
+; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT: br label [[NO_ALIAS3]]
+; PTR32: no_alias3:
+; PTR32-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR32-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR32-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR32-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR32-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR32-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR32-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR32-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR32-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR32-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR32-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR32-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR32-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i32 32
+; PTR32-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR32-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR32-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR32-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR32-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR32-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR32-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR32-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR32-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR32-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR32-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR32-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR32-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i32 16
+; PTR32-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR32-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i32 48
+; PTR32-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR32-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR32-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR32-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR32-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR32-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR32-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR32-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR32-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR32-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR32-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR32-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR32-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR32-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i32 64
+; PTR32-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR32-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i32 96
+; PTR32-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR32-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR32-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR32-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR32-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR32-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR32-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR32-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR32-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR32-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR32-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR32-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR32-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR32-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR32-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i32 80
+; PTR32-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR32-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i32 112
+; PTR32-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR32-NEXT: ret void
+;
+entry:
+ %a = load <16 x double>, ptr %A, align 8
+ %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %a, i32 4, i32 4, i32 4)
+ store <16 x double> %c, ptr %C, align 8
+ ret void
+}
+
+declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
new file mode 100644
index 0000000..87def6b
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
@@ -0,0 +1,312 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:128:128' -S < %s | FileCheck %s --check-prefix=PTR128
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32
+
+; To properly support the matrix intrinsics on, e.g., 32-bit platforms (without
+; the need to emit `libc` calls), we perform strided index calculations using
+; the same pointer bit-width as the matrix pointers, as determined by the data
+; layout. To verify this behaviour, this test runs several strided loads and
+; stores through the lowering pass with (32|64|128)-bit pointers, and verifies
+; the generated code extends / truncates strides accordingly. Similarly,
+; `data-layout-multiply-fused.ll` adopts this approach to verify the same
+; behaviour for index calculations emitted while lowering fused matrix
+; multiplies.
+
+define <9 x double> @strided_load_3x3_i128(ptr %in, i128 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i128(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE:%.*]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i128(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i64
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i128(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i32
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i128(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i64(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i64 [[STRIDE:%.*]] to i128
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i64(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i64(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i64 [[STRIDE:%.*]] to i32
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i32(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i128
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i32(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i32(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr, i128, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
index ae7da19..abc4705 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
@@ -62,11 +62,12 @@ declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32,
define <8 x double> @strided_load_4x2_stride_i32(ptr %in, i32 %stride) {
; CHECK-LABEL: @strided_load_4x2_stride_i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, ptr [[VEC_GEP2]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x double> [[TMP0]]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
index 28e9cdb..81b8507 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
@@ -34,11 +34,12 @@ define void @strided_store_3x2_nonconst_i32_stride(<6 x double> %in, i32 %stride
; CHECK-LABEL: @strided_store_3x2_nonconst_i32_stride(
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[VEC_START]]
; CHECK-NEXT: store <3 x double> [[SPLIT]], ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i32 [[VEC_START2]]
+; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i64 [[VEC_START2]]
; CHECK-NEXT: store <3 x double> [[SPLIT1]], ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/PGOProfile/data-access-profile.ll b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
index 29198f34..205184b 100644
--- a/llvm/test/Transforms/PGOProfile/data-access-profile.ll
+++ b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
@@ -3,55 +3,72 @@
; RUN: rm -rf %t && split-file %s %t && cd %t
-;; Read a text profile and merge it into indexed profile.
+;; Read text profiles and merge them into indexed profiles.
; RUN: llvm-profdata merge --memprof-version=4 memprof.yaml -o memprof.profdata
+; RUN: llvm-profdata merge --memprof-version=4 memprof-no-dap.yaml -o memprof-no-dap.profdata
;; Run optimizer pass on an IR module without IR functions, and test that global
;; variables in the module could be annotated (i.e., no early return),
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
;; Run optimizer pass on the IR, and check the section prefix.
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
-;; Run optimizer pass without explicitly setting -memprof-annotate-static-data-prefix.
-;; The output text IR shouldn't have `section_prefix`
+;; Run memprof without providing memprof data. Test that IR has module flag
+;; `EnableDataAccessProf` as 0.
+; RUN: opt -passes='memprof-use<profile-filename=memprof-no-dap.profdata>' -memprof-annotate-static-data-prefix \
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefix=FLAG
+
+;; Run memprof without explicitly setting -memprof-annotate-static-data-prefix.
+;; The output text IR shouldn't have `section_prefix` or EnableDataAccessProf module flag.
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' \
-; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --implicit-check-not="section_prefix"
+; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --check-prefix=FLAGLESS --implicit-check-not="section_prefix"
; LOG: Skip annotating string literal .str
; LOG: Global variable var1 is annotated as hot
; LOG: Global variable var2.llvm.125 is annotated as hot
; LOG: Global variable bar is not annotated
; LOG: Global variable foo is annotated as unlikely
-; LOG: Global variable var3 has explicit section name. Skip annotating.
-; LOG: Global variable var4 has explicit section name. Skip annotating.
+; LOG: Skip annotation for var3 due to explicit section name.
+; LOG: Skip annotation for var4 due to explicit section name.
+; LOG: Skip annotation for llvm.fake_var due to name starts with `llvm.`.
+; LOG: Skip annotation for qux due to linker declaration.
;; String literals are not annotated.
-; PREFIX: @.str = unnamed_addr constant [5 x i8] c"abcde"
-; PREFIX-NOT: section_prefix
-; PREFIX: @var1 = global i32 123, !section_prefix !0
+; IR: @.str = unnamed_addr constant [5 x i8] c"abcde"
+; IR-NOT: section_prefix
+; IR: @var1 = global i32 123, !section_prefix !0
;; @var.llvm.125 will be canonicalized to @var2 for profile look-up.
-; PREFIX-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
+; IR-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
;; @bar is not seen in hot symbol or known symbol set, so it won't get a section
;; prefix. Test this by testing that there is no section_prefix between @bar and
;; @foo.
-; PREFIX-NEXT: @bar = global i16 3
-; PREFIX-NOT: !section_prefix
+; IR-NEXT: @bar = global i16 3
+; IR-NOT: !section_prefix
;; @foo is unlikely.
-; PREFIX-NEXT: @foo = global i8 2, !section_prefix !1
+; IR-NEXT: @foo = global i8 2, !section_prefix !1
+
+; IR-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
+; IR-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+
+; IR: @llvm.fake_var = global i32 123
+; IR-NOT: !section_prefix
+; IR: @qux = external global i64
+; IR-NOT: !section_prefix
-; PREFIX-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
-; PREFIX-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+; IR: attributes #0 = { "rodata-section"="sec2" }
-; PREFIX: attributes #0 = { "rodata-section"="sec2" }
+; IR: !0 = !{!"section_prefix", !"hot"}
+; IR-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; IR-NEXT: !2 = !{i32 2, !"EnableDataAccessProf", i32 1}
-; PREFIX: !0 = !{!"section_prefix", !"hot"}
-; PREFIX-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; FLAG: !{i32 2, !"EnableDataAccessProf", i32 0}
+; FLAGLESS-NOT: EnableDataAccessProf
; STAT: 1 memprof - Number of global vars annotated with 'unlikely' section prefix.
; STAT: 2 memprof - Number of global vars with user-specified section (not annotated).
@@ -72,6 +89,24 @@ DataAccessProfiles:
- foo
KnownColdStrHashes: [ 999, 1001 ]
...
+;--- memprof-no-dap.yaml
+---
+# A memprof file with without data access profiles. The heap records are simplified
+# to pass profile parsing and don't need to match the IR.
+HeapProfileRecords:
+ - GUID: 0xdeadbeef12345678
+ AllocSites:
+ - Callstack:
+ - { Function: 0x1111111111111111, LineOffset: 11, Column: 10, IsInlineFrame: true }
+ MemInfoBlock:
+ AllocCount: 111
+ TotalSize: 222
+ TotalLifetime: 333
+ TotalLifetimeAccessDensity: 444
+ CallSites:
+ - Frames:
+ - { Function: 0x5555555555555555, LineOffset: 55, Column: 50, IsInlineFrame: true }
+...
;--- input.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@@ -84,11 +119,14 @@ target triple = "x86_64-unknown-linux-gnu"
@foo = global i8 2
@var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
@var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
define i32 @func() {
%a = load i32, ptr @var1
%b = load i32, ptr @var2.llvm.125
- %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b)
+ %c = load i32, ptr @llvm.fake_var
+ %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b, i32 %c)
ret i32 %ret
}
@@ -108,5 +146,8 @@ target triple = "x86_64-unknown-linux-gnu"
@foo = global i8 2
@var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
@var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
+
attributes #0 = { "rodata-section"="sec2" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
index c5f72f2..fded7a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
@@ -4,21 +4,9 @@
define i32 @crash_reordering_undefs() {
; CHECK-LABEL: @crash_reordering_undefs(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef
-; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
-; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
-; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
-; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
+; CHECK-NEXT: [[ADD0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> splat (i32 65537))
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]]
-; CHECK-NEXT: ret i32 [[OP_RDX3]]
+; CHECK-NEXT: ret i32 [[OP_RDX]]
;
entry:
%or0 = or i64 undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
index 3ac0d01..13b050d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
@@ -6,15 +6,15 @@ define i1 @test(i32 %g, i16 %d) {
; CHECK-SAME: i32 [[G:%.*]], i16 [[D:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = and i16 [[D]], 1
-; CHECK-NEXT: [[XOR_I_I:%.*]] = xor i32 [[G]], 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[G]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[XOR_I_I]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i32> [[TMP2]], <i32 0, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[TMP5]], <i8 -9, i8 -9, i8 -1, i8 -1>
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i8> [[TMP6]], splat (i8 -3)
; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i8>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[TMP10]], [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP12]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
index f07424f..43302f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
@@ -3,32 +3,7 @@
define i32 @test() {
; CHECK-LABEL: define i32 @test() {
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <24 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 3, i32 3, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 7, i32 7, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <64 x i32> [[TMP9]], <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i32> [[TMP10]], <64 x i32> [[TMP12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 64, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <64 x i32> [[TMP13]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 67, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP6]], <24 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <64 x i32> [[TMP16]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i32> [[TMP27]], <64 x i32> [[TMP28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i1> [[TMP19]], <64 x i1> [[TMP20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP22:%.*]] = zext <64 x i1> [[TMP21]] to <64 x i8>
-; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> [[TMP22]])
+; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
; CHECK-NEXT: [[TMP24:%.*]] = sext i8 [[TMP23]] to i32
; CHECK-NEXT: ret i32 [[TMP24]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 1fedde4..3e9bd78 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -3,12 +3,8 @@
define void @test() {
; CHECK-LABEL: define void @test() {
-; CHECK-NEXT: [[XOR108_I_I_I:%.*]] = xor i64 0, 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 10
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <12 x i64> [[TMP2]], <12 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i64> poison, i64 1, i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
index 034fe82..c5442b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
@@ -6,11 +6,10 @@
define void @foo() {
; CHECK-LABEL: define void @foo() {
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
@@ -24,11 +23,10 @@ define void @foo() {
;
; FORCED-LABEL: define void @foo() {
; FORCED-NEXT: bb:
-; FORCED-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; FORCED-NEXT: br label [[BB1:%.*]]
; FORCED: bb1:
; FORCED-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
; FORCED-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
; FORCED-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
index 2612a21..e8078ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
@@ -5,23 +5,22 @@ define i32 @test(i1 %cond) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: i1 [[COND:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[OR92:%.*]] = or i32 1, 0
; CHECK-NEXT: br label %[[BB:.*]]
; CHECK: [[BB]]:
-; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92]], %[[BB]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, <4 x i32> <i32 poison, i32 1, i32 6, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[P1]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT: [[OR92]] = or i32 1, 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[OR92]], i32 0
-; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]]
+; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], <i32 1, i32 0>
; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]]
; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
; CHECK-NEXT: ret i32 [[OP_RDX]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
index 4a5dd2a..b9f8390 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
@@ -8,42 +8,21 @@ define i16 @test() {
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 0
; CHECK-NEXT: [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = shl i32 0, 0
-; CHECK-NEXT: [[UNSCLEAR186_I:%.*]] = and i32 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = shl i32 0, 0
; CHECK-NEXT: [[CALL7_I45:%.*]] = tail call i32 null(i32 0)
; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 0, 0
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <2 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <24 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>, <24 x i32> [[TMP16]], <24 x i32> <i32 0, i32 1, i32 24, i32 25, i32 poison, i32 5, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <24 x i32> [[TMP17]], <24 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 24, i32 5, i32 26, i32 7, i32 28, i32 29, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <24 x i32> [[TMP18]], i32 [[UNSCLEAR186_I]], i32 10
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <24 x i32> [[TMP19]], <24 x i32> [[TMP20]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <24 x i32> [[TMP21]], <24 x i32> [[TMP22]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 25, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <24 x i32> [[TMP23]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 24, i32 25, i32 26, i32 27, i32 22, i32 23>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <24 x i32> [[TMP24]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <24 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 4, i32 30, i32 6, i32 32, i32 33, i32 34, i32 poison, i32 36, i32 37, i32 38, i32 poison, i32 40, i32 poison, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <24 x i32> [[TMP25]], i32 [[UNSCLEAR186_I]], i32 11
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <24 x i32> [[TMP26]], <24 x i32> [[TMP27]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 24, i32 16, i32 26, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <24 x i32> [[TMP24]], [[TMP28]]
-; CHECK-NEXT: [[RDX_OP:%.*]] = shufflevector <24 x i1> [[TMP29]], <24 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <28 x i1> [[RDX_OP]] to i28
-; CHECK-NEXT: [[TMP31:%.*]] = call i28 @llvm.ctpop.i28(i28 [[TMP30]])
-; CHECK-NEXT: [[TMP32:%.*]] = trunc i28 [[TMP31]] to i16
-; CHECK-NEXT: [[TMP33:%.*]] = call i4 @llvm.ctpop.i4(i4 -8)
-; CHECK-NEXT: [[TMP34:%.*]] = zext i4 [[TMP33]] to i16
-; CHECK-NEXT: [[OP_RDX4:%.*]] = add i16 [[TMP34]], [[TMP32]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <28 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>, i32 [[TMP1]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <28 x i32> [[TMP4]], i32 [[TMP2]], i32 5
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <28 x i32> [[TMP5]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 28, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <28 x i32> [[TMP6]], i32 [[TMP8]], i32 12
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <28 x i32> [[TMP7]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <28 x i32> [[TMP16]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 28, i32 29, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <28 x i32> [[TMP9]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 28, i32 29, i32 30, i32 31, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP11:%.*]] = and <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison>, [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <28 x i32> [[TMP11]], <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>, [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x i1> [[TMP13]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP14]])
+; CHECK-NEXT: [[OP_RDX4:%.*]] = trunc i32 [[TMP15]] to i16
; CHECK-NEXT: ret i16 [[OP_RDX4]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
index a7f8629..78708a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
@@ -6,20 +6,12 @@ define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr null, align 2
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 0, 1
; CHECK-NEXT: [[TMP2:%.*]] = and i32 0, 0
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 0, i8 poison, i8 poison, i8 poison>, i8 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> zeroinitializer, zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i1> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 0, [[TMP14]]
; CHECK-NEXT: store i32 [[OP_RDX]], ptr null, align 4