aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@outlook.com>2024-08-06 09:20:50 -0400
committerGitHub <noreply@github.com>2024-08-06 09:20:50 -0400
commitdaf4a06e5c5531005b275b72681e04bd08e58fe4 (patch)
treeba7b93d6451eb6be007fda2cecfe7c765413e1af
parentf9b69a378cb1acfedab7252b4d4dc3d0af282d0b (diff)
downloadllvm-daf4a06e5c5531005b275b72681e04bd08e58fe4.zip
llvm-daf4a06e5c5531005b275b72681e04bd08e58fe4.tar.gz
llvm-daf4a06e5c5531005b275b72681e04bd08e58fe4.tar.bz2
[SLP]Try detect strided loads, if any pointer op require extraction.
If any pointer operand of the non-cosencutive loads is an instructions with the user, which is not part of the current graph, and, thus, requires emission of the extractelement instruction, better to try to detect if the load sequence can be repsented as strided load and extractelement instructions for pointers are not required. Reviewers: preames, RKSimon, topperc Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/101668
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp12
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll635
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll17
3 files changed, 312 insertions, 352 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ed9dfa6..936b074 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4617,7 +4617,17 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// 3. The loads are ordered, or number of unordered loads <=
// MaxProfitableUnorderedLoads, or loads are in reversed order.
// (this check is to avoid extra costs for very expensive shuffles).
- if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
+ // 4. Any pointer operand is an instruction with the users outside of the
+ // current graph (for masked gathers extra extractelement instructions
+ // might be required).
+ auto IsAnyPointerUsedOutGraph =
+ IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
+ return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
+ return !getTreeEntry(U) && !MustGather.contains(U);
+ });
+ });
+ if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
+ ((Sz > MinProfitableStridedLoads ||
(static_cast<unsigned>(std::abs(*Diff)) <=
MaxProfitableLoadStride * Sz &&
isPowerOf2_32(std::abs(*Diff)))) &&
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 3595f77..4978991 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -8,17 +8,12 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1
; CHECK-NEXT: [[CONV1:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 2
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1
+; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5
+; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32
@@ -26,10 +21,12 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
+; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4
; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
-; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
+; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 1
+; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 5
+; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 5
; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP14]] to i32
@@ -37,178 +34,189 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
-; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 1
+; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i8> [[TMP6]] to <2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP16]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32>
+; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP28]] to <2 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP13]], [[TMP12]]
; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP26:%.*]] = add <2 x i32> [[TMP25]], [[TMP19]]
-; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
-; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
-; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
-; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
-; CHECK-NEXT: [[TMP27:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; CHECK-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; CHECK-NEXT: [[TMP31:%.*]] = sub <2 x i32> [[TMP28]], [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
-; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
-; CHECK-NEXT: [[TMP35:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]]
+; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP25]], [[TMP8]]
+; CHECK-NEXT: [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = sub <2 x i32> [[TMP17]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP22]], [[TMP30]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]]
+; CHECK-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP37]], [[TMP20]]
+; CHECK-NEXT: [[TMP26:%.*]] = add <2 x i32> [[TMP27]], [[TMP15]]
+; CHECK-NEXT: [[TMP38:%.*]] = sub <2 x i32> [[TMP15]], [[TMP27]]
; CHECK-NEXT: [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
; CHECK-NEXT: [[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
-; CHECK-NEXT: [[ADD44_3:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]]
; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]]
; CHECK-NEXT: [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
; CHECK-NEXT: [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
-; CHECK-NEXT: [[ADD46_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
+; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
-; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_3]]
-; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
-; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
+; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
+; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
+; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
+; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1
; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP45:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
-; CHECK-NEXT: [[TMP46:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP45]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32>
-; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
-; CHECK-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP48]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
-; CHECK-NEXT: [[TMP51:%.*]] = sub <2 x i32> [[TMP47]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]]
-; CHECK-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP51]]
-; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
-; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; CHECK-NEXT: [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
+; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP33]], [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
+; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP42]], [[TMP59]]
+; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP40]]
+; CHECK-NEXT: [[TMP48:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32>
+; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
+; CHECK-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]]
; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
+; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
+; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
-; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
-; CHECK-NEXT: [[TMP190:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
+; CHECK-NEXT: [[TMP60:%.*]] = add <2 x i32> [[TMP70]], [[TMP52]]
+; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT: [[TMP47:%.*]] = sub <2 x i32> [[TMP62]], [[TMP60]]
; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
-; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
+; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]]
+; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP74]], [[TMP75]]
+; CHECK-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
+; CHECK-NEXT: [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
+; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP79]], [[TMP61]]
+; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP61]], [[TMP79]]
; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
-; CHECK-NEXT: [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
-; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15
-; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
-; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[ADD46_2]], 15
+; CHECK-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0
+; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP63]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
+; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV]], 15
+; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
+; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15
; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
+; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
+; CHECK-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]]
; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15
; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
+; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
+; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15
; CHECK-NEXT: [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537
; CHECK-NEXT: [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535
-; CHECK-NEXT: [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
-; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
-; CHECK-NEXT: [[TMP81:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP80]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
-; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP3]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32>
-; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP86:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP84]], [[TMP86]]
+; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
+; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT: [[TMP73:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32>
+; CHECK-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP81]], [[TMP76]]
; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
-; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP92:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT: [[TMP93:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP9]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
-; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP92]], [[TMP94]]
+; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32>
+; CHECK-NEXT: [[TMP77:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP77]] to <2 x i32>
+; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
+; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]]
; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]]
-; CHECK-NEXT: [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
-; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
-; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]]
-; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
-; CHECK-NEXT: [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]]
+; CHECK-NEXT: [[TMP89:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]]
+; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP89]]
+; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
+; CHECK-NEXT: [[TMP99:%.*]] = sub <2 x i32> [[TMP86]], [[TMP78]]
+; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP99]]
+; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]]
+; CHECK-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]]
-; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
+; CHECK-NEXT: [[SUB51:%.*]] = sub i32 [[TMP238]], [[TMP108]]
+; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
+; CHECK-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1
+; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP94]]
+; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP94]], [[SUB47]]
; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15
; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[TMP142]], 15
+; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[SUB47]], 15
; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
-; CHECK-NEXT: [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i8> poison, i8 [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP112:%.*]] = insertelement <2 x i8> [[TMP111]], i8 [[TMP13]], i32 1
-; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; CHECK-NEXT: [[TMP114:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
-; CHECK-NEXT: [[TMP115:%.*]] = shufflevector <2 x ptr> [[TMP114]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP116:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP116]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32>
-; CHECK-NEXT: [[TMP119:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
-; CHECK-NEXT: [[TMP120:%.*]] = shufflevector <2 x ptr> [[TMP119]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP121:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP121]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32>
-; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP118]], [[TMP123]]
+; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
+; CHECK-NEXT: [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32>
+; CHECK-NEXT: [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32>
+; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
+; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP109]], [[TMP114]]
; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; CHECK-NEXT: [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
-; CHECK-NEXT: [[TMP132:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP133:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP132]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32>
-; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]]
+; CHECK-NEXT: [[TMP113:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32>
+; CHECK-NEXT: [[TMP115:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP115]] to <2 x i32>
+; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP118]], [[TMP128]]
; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP144]]
-; CHECK-NEXT: [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
-; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]]
-; CHECK-NEXT: [[TMP155:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP155]]
-; CHECK-NEXT: [[TMP189:%.*]] = sub <2 x i32> [[TMP155]], [[TMP139]]
+; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP137]], [[TMP111]]
+; CHECK-NEXT: [[TMP120:%.*]] = add <2 x i32> [[TMP136]], [[TMP119]]
+; CHECK-NEXT: [[TMP117:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP122:%.*]] = sub <2 x i32> [[TMP117]], [[TMP103]]
+; CHECK-NEXT: [[TMP123:%.*]] = add <2 x i32> [[TMP125]], [[TMP122]]
+; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP120]], [[TMP123]]
+; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP120]]
; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]]
-; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15
-; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
-; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
+; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP145]], [[TMP146]]
+; CHECK-NEXT: [[TMP126:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
+; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
+; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP127]], [[TMP126]]
+; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP126]], [[TMP127]]
+; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15
+; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
+; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
; CHECK-NEXT: [[TMP147:%.*]] = lshr <2 x i32> [[TMP110]], <i32 15, i32 15>
; CHECK-NEXT: [[TMP148:%.*]] = and <2 x i32> [[TMP147]], <i32 65537, i32 65537>
; CHECK-NEXT: [[TMP149:%.*]] = mul <2 x i32> [[TMP148]], <i32 65535, i32 65535>
@@ -218,76 +226,46 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
-; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]]
-; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]]
-; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
-; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
+; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]]
+; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]]
+; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]]
+; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]]
; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]]
; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP108]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; CHECK-NEXT: [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB59_2]], i32 1
-; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB51_2]], i32 1
-; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
-; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP189]], <2 x i32> [[TMP190]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP189]], <2 x i32> [[TMP190]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP184]]
-; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP153]], i32 1
-; CHECK-NEXT: [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP158]], [[TMP157]]
-; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP153]], i32 0
-; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP161]], [[TMP160]]
-; CHECK-NEXT: [[TMP163:%.*]] = sub <2 x i32> [[TMP153]], [[TMP156]]
-; CHECK-NEXT: [[TMP164:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
-; CHECK-NEXT: [[TMP165:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
-; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[TMP165]], [[TMP164]]
-; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP164]], [[TMP165]]
+; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
+; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
+; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
+; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_1]]
; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]]
-; CHECK-NEXT: [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP189]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP167:%.*]] = lshr <2 x i32> [[TMP166]], <i32 15, i32 15>
-; CHECK-NEXT: [[TMP168:%.*]] = and <2 x i32> [[TMP167]], <i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP169:%.*]] = mul <2 x i32> [[TMP168]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT: [[TMP171:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP208:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP282:%.*]] = add <2 x i32> [[TMP171]], [[TMP209]]
-; CHECK-NEXT: [[TMP211:%.*]] = sub <2 x i32> [[TMP171]], [[TMP209]]
-; CHECK-NEXT: [[TMP283:%.*]] = shufflevector <2 x i32> [[TMP282]], <2 x i32> [[TMP211]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP283]]
-; CHECK-NEXT: [[TMP178:%.*]] = xor <2 x i32> [[TMP177]], [[TMP166]]
+; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP121]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP130:%.*]] = lshr <2 x i32> [[TMP129]], <i32 15, i32 15>
+; CHECK-NEXT: [[TMP131:%.*]] = and <2 x i32> [[TMP130]], <i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP132:%.*]] = mul <2 x i32> [[TMP131]], <i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP133:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
+; CHECK-NEXT: [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP133]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
+; CHECK-NEXT: [[TMP152:%.*]] = shufflevector <2 x i32> [[TMP151]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP144]], [[TMP152]]
+; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP144]], [[TMP152]]
+; CHECK-NEXT: [[TMP139:%.*]] = shufflevector <2 x i32> [[TMP153]], <2 x i32> [[TMP138]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP140:%.*]] = add <2 x i32> [[TMP132]], [[TMP139]]
+; CHECK-NEXT: [[TMP141:%.*]] = xor <2 x i32> [[TMP140]], [[TMP129]]
; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]]
-; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP142]]
+; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]]
; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
-; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP178]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP179]]
-; CHECK-NEXT: [[TMP180:%.*]] = extractelement <2 x i32> [[TMP178]], i32 1
-; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP180]]
+; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP142]]
+; CHECK-NEXT: [[TMP154:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP154]]
; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_3]], i32 0
-; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> [[TMP106]], i32 [[ADD46_2]], i32 0
-; CHECK-NEXT: [[TMP195:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]]
-; CHECK-NEXT: [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP187:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]]
-; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP195]], i32 0
-; CHECK-NEXT: [[TMP196:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
-; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP195]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP196]], [[TMP188]]
-; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP195]], i32 1
-; CHECK-NEXT: [[TMP192:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1
-; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP195]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP192]], [[TMP191]]
-; CHECK-NEXT: [[TMP194:%.*]] = sub <2 x i32> [[TMP195]], [[TMP187]]
+; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_1]], [[SUB51]]
+; CHECK-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB51_1]]
; CHECK-NEXT: [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP244]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
@@ -295,40 +273,25 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP216:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]]
; CHECK-NEXT: [[TMP210:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]]
; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP216]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP215:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
-; CHECK-NEXT: [[TMP203:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
-; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP215]], [[TMP203]]
-; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP215]]
+; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]]
+; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]]
; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_2]]
; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT: [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP221]]
-; CHECK-NEXT: [[TMP267:%.*]] = xor <2 x i32> [[TMP266]], [[TMP110]]
+; CHECK-NEXT: [[TMP134:%.*]] = add <2 x i32> [[TMP149]], [[TMP221]]
+; CHECK-NEXT: [[TMP213:%.*]] = xor <2 x i32> [[TMP134]], [[TMP110]]
; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP238]], 15
; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP238]]
; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP267]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP206]]
-; CHECK-NEXT: [[TMP207:%.*]] = extractelement <2 x i32> [[TMP267]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP207]]
-; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[TMP222:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB51_2]], i32 0
-; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB59_2]], i32 0
-; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP222]], [[TMP225]]
-; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP190]], <2 x i32> [[TMP189]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP190]], <2 x i32> [[TMP189]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP227]], [[TMP212]]
-; CHECK-NEXT: [[TMP214:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT: [[TMP239:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[TMP237]], [[TMP214]]
-; CHECK-NEXT: [[TMP217:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP237]]
; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP226]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP218]], [[TMP217]]
-; CHECK-NEXT: [[TMP240:%.*]] = sub <2 x i32> [[TMP226]], [[TMP213]]
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP218]]
+; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
+; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
+; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
; CHECK-NEXT: [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_5]], i32 0
@@ -336,17 +299,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP224]], [[TMP242]]
; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP224]], [[TMP242]]
; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP240]], i32 0
-; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP240]], i32 1
-; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[TMP228]], [[TMP229]]
-; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[TMP229]], [[TMP228]]
+; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
+; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_6]], [[ADD105_3]]
; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
; CHECK-NEXT: [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
; CHECK-NEXT: [[TMP231:%.*]] = and <2 x i32> [[TMP230]], <i32 65537, i32 65537>
; CHECK-NEXT: [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP233:%.*]] = add <2 x i32> [[TMP232]], [[TMP220]]
-; CHECK-NEXT: [[TMP234:%.*]] = xor <2 x i32> [[TMP233]], [[TMP102]]
+; CHECK-NEXT: [[TMP150:%.*]] = add <2 x i32> [[TMP232]], [[TMP220]]
+; CHECK-NEXT: [[TMP234:%.*]] = xor <2 x i32> [[TMP150]], [[TMP102]]
; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
@@ -368,9 +329,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
; THR15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4
; THR15-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
-; THR15-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 2
-; THR15-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 6
-; THR15-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 6
+; THR15-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1
+; THR15-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5
+; THR15-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5
; THR15-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
; THR15-NEXT: [[CONV33:%.*]] = zext i8 [[TMP1]] to i32
@@ -381,9 +342,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
; THR15-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4
; THR15-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
-; THR15-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; THR15-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 6
-; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 6
+; THR15-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 1
+; THR15-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 5
+; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 5
; THR15-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
; THR15-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
; THR15-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP3]] to i32
@@ -392,10 +353,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; THR15-NEXT: [[TMP4:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; THR15-NEXT: [[TMP5:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
+; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
; THR15-NEXT: [[TMP6:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
; THR15-NEXT: [[TMP7:%.*]] = zext <2 x i8> [[TMP6]] to <2 x i32>
-; THR15-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP5]], [[TMP7]]
+; THR15-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP66]], [[TMP7]]
; THR15-NEXT: [[TMP9:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
; THR15-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32>
; THR15-NEXT: [[TMP11:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
@@ -484,103 +445,101 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15
; THR15-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
; THR15-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
-; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]]
-; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
+; THR15-NEXT: [[ADD55_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
+; THR15-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[TMP66]], i32 0
+; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP64]], 15
; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
-; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
-; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
+; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]]
+; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]]
+; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV_1]], 15
; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; THR15-NEXT: [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; THR15-NEXT: [[TMP65:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32>
-; THR15-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[PIX2]], align 1
-; THR15-NEXT: [[TMP67:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; THR15-NEXT: [[TMP68:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3]], align 1
-; THR15-NEXT: [[TMP69:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
-; THR15-NEXT: [[TMP70:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5]], align 1
-; THR15-NEXT: [[TMP71:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
-; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP69]], [[TMP71]]
+; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
+; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
+; THR15-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV]], 15
+; THR15-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
+; THR15-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
+; THR15-NEXT: [[TMP65:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
+; THR15-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
+; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP70]], [[TMP81]]
; THR15-NEXT: [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; THR15-NEXT: [[TMP75:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
-; THR15-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP75]], [[TMP67]]
-; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP73]], [[TMP76]]
-; THR15-NEXT: [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22]], align 1
-; THR15-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
-; THR15-NEXT: [[TMP80:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25]], align 1
-; THR15-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP80]] to <2 x i32>
-; THR15-NEXT: [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27]], align 1
-; THR15-NEXT: [[TMP83:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
-; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP81]], [[TMP83]]
+; THR15-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; THR15-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
+; THR15-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
+; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]]
; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], <i32 16, i32 16>
; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1
-; THR15-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP86]], [[TMP79]]
+; THR15-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP86]], [[TMP76]]
; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP87]]
+; THR15-NEXT: [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
+; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP92]], [[TMP68]]
+; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP93]]
+; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> <i32 0, i32 2>
+; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]]
+; THR15-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP95]], [[TMP88]]
; THR15-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP77]], i32 0
; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1
-; THR15-NEXT: [[ADD44:%.*]] = add i32 [[TMP90]], [[TMP89]]
-; THR15-NEXT: [[SUB45:%.*]] = sub i32 [[TMP89]], [[TMP90]]
-; THR15-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP88]], i32 0
-; THR15-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP88]], i32 1
-; THR15-NEXT: [[ADD46:%.*]] = add i32 [[TMP92]], [[TMP91]]
-; THR15-NEXT: [[SUB47:%.*]] = sub i32 [[TMP91]], [[TMP92]]
-; THR15-NEXT: [[ADD48:%.*]] = add i32 [[ADD46]], [[ADD44]]
-; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[ADD44]], [[ADD46]]
-; THR15-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[SUB45]]
-; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[SUB45]], [[SUB47]]
-; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[ADD46]], 15
+; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP90]], [[TMP89]]
+; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP89]], [[TMP90]]
+; THR15-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0
+; THR15-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1
+; THR15-NEXT: [[ADD56:%.*]] = add i32 [[SUB47]], [[TMP94]]
+; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP94]], [[SUB47]]
+; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP90]], 15
; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
; THR15-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[SUB47]], 15
; THR15-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; THR15-NEXT: [[TMP93:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; THR15-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
-; THR15-NEXT: [[TMP95:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT: [[TMP96:%.*]] = zext <2 x i8> [[TMP95]] to <2 x i32>
-; THR15-NEXT: [[TMP97:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP97]] to <2 x i32>
-; THR15-NEXT: [[TMP99:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
-; THR15-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP98]], [[TMP100]]
+; THR15-NEXT: [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; THR15-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; THR15-NEXT: [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32>
+; THR15-NEXT: [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP104:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32>
+; THR15-NEXT: [[TMP105:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
+; THR15-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP104]], [[TMP112]]
; THR15-NEXT: [[TMP102:%.*]] = shl <2 x i32> [[TMP101]], <i32 16, i32 16>
-; THR15-NEXT: [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP94]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; THR15-NEXT: [[TMP104:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
-; THR15-NEXT: [[TMP105:%.*]] = sub <2 x i32> [[TMP104]], [[TMP96]]
-; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP102]], [[TMP105]]
-; THR15-NEXT: [[TMP107:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_1]], align 1
-; THR15-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32>
-; THR15-NEXT: [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_1]], align 1
-; THR15-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; THR15-NEXT: [[TMP111:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_1]], align 1
-; THR15-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP110]], [[TMP112]]
+; THR15-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
+; THR15-NEXT: [[TMP108:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP108]] to <2 x i32>
+; THR15-NEXT: [[TMP110:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32>
+; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]]
; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], <i32 16, i32 16>
; THR15-NEXT: [[TMP115:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV33_1]], i32 1
-; THR15-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP115]], [[TMP108]]
+; THR15-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP115]], [[TMP107]]
; THR15-NEXT: [[TMP117:%.*]] = add <2 x i32> [[TMP114]], [[TMP116]]
+; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
+; THR15-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP99]]
+; THR15-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP102]], [[TMP127]]
+; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP117]], [[TMP128]]
+; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP128]], [[TMP117]]
; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
-; THR15-NEXT: [[ADD44_1:%.*]] = add i32 [[TMP119]], [[TMP118]]
-; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP118]], [[TMP119]]
-; THR15-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP117]], i32 0
-; THR15-NEXT: [[TMP121:%.*]] = extractelement <2 x i32> [[TMP117]], i32 1
-; THR15-NEXT: [[ADD46_1:%.*]] = add i32 [[TMP121]], [[TMP120]]
-; THR15-NEXT: [[SUB47_1:%.*]] = sub i32 [[TMP120]], [[TMP121]]
-; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[ADD46_1]], [[ADD44_1]]
-; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[ADD44_1]], [[ADD46_1]]
-; THR15-NEXT: [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
-; THR15-NEXT: [[SUB59_1:%.*]] = sub i32 [[SUB45_1]], [[SUB47_1]]
-; THR15-NEXT: [[SHR_I54:%.*]] = lshr i32 [[ADD46_1]], 15
-; THR15-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
-; THR15-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
+; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP119]], [[TMP118]]
+; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP118]], [[TMP119]]
+; THR15-NEXT: [[TMP129:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
+; THR15-NEXT: [[TMP125:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
+; THR15-NEXT: [[ADD55_4:%.*]] = add i32 [[TMP125]], [[TMP129]]
+; THR15-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP129]], [[TMP125]]
+; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP119]], 15
; THR15-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
; THR15-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; THR15-NEXT: [[TMP122:%.*]] = lshr <2 x i32> [[TMP94]], <i32 15, i32 15>
+; THR15-NEXT: [[TMP122:%.*]] = lshr <2 x i32> [[TMP103]], <i32 15, i32 15>
; THR15-NEXT: [[TMP123:%.*]] = and <2 x i32> [[TMP122]], <i32 65537, i32 65537>
; THR15-NEXT: [[TMP124:%.*]] = mul <2 x i32> [[TMP123]], <i32 65535, i32 65535>
; THR15-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
@@ -593,20 +552,20 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]]
; THR15-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
-; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[ADD46_1]]
+; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]]
+; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP119]]
; THR15-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; THR15-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[ADD46]]
+; THR15-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP90]]
; THR15-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; THR15-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
; THR15-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; THR15-NEXT: [[TMP125:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_2]], i32 0
-; THR15-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP125]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT: [[TMP127:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_3]], i32 0
-; THR15-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP127]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT: [[TMP129:%.*]] = sub <2 x i32> [[TMP126]], [[TMP128]]
-; THR15-NEXT: [[TMP130:%.*]] = add <2 x i32> [[TMP126]], [[TMP128]]
-; THR15-NEXT: [[TMP131:%.*]] = shufflevector <2 x i32> [[TMP129]], <2 x i32> [[TMP130]], <2 x i32> <i32 0, i32 3>
+; THR15-NEXT: [[ADD55:%.*]] = add i32 [[ADD55_4]], [[ADD56]]
+; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD56]], [[ADD55_4]]
+; THR15-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
+; THR15-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
+; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_1]]
+; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP64]]
+; THR15-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP121]], <2 x i32> <i32 1, i32 3>
; THR15-NEXT: [[TMP132:%.*]] = lshr <2 x i32> [[TMP5]], <i32 15, i32 15>
; THR15-NEXT: [[TMP133:%.*]] = and <2 x i32> [[TMP132]], <i32 65537, i32 65537>
; THR15-NEXT: [[TMP134:%.*]] = mul <2 x i32> [[TMP133]], <i32 65535, i32 65535>
@@ -614,28 +573,18 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP136:%.*]] = shufflevector <2 x i32> [[TMP135]], <2 x i32> poison, <2 x i32> zeroinitializer
; THR15-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_1]], i32 0
; THR15-NEXT: [[TMP138:%.*]] = shufflevector <2 x i32> [[TMP137]], <2 x i32> poison, <2 x i32> zeroinitializer
-; THR15-NEXT: [[TMP139:%.*]] = sub <2 x i32> [[TMP136]], [[TMP138]]
; THR15-NEXT: [[TMP140:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
-; THR15-NEXT: [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP139]], <2 x i32> [[TMP140]], <2 x i32> <i32 0, i32 3>
-; THR15-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP131]], i32 1
-; THR15-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; THR15-NEXT: [[SUB104_1:%.*]] = sub i32 [[TMP143]], [[TMP142]]
-; THR15-NEXT: [[TMP144:%.*]] = add <2 x i32> [[TMP131]], [[TMP141]]
-; THR15-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP131]], i32 0
-; THR15-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; THR15-NEXT: [[TMP147:%.*]] = shufflevector <2 x i32> [[TMP141]], <2 x i32> [[TMP131]], <2 x i32> <i32 0, i32 2>
-; THR15-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP146]], [[TMP145]]
+; THR15-NEXT: [[TMP139:%.*]] = sub <2 x i32> [[TMP136]], [[TMP138]]
+; THR15-NEXT: [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP140]], <2 x i32> [[TMP139]], <2 x i32> <i32 0, i32 3>
; THR15-NEXT: [[TMP148:%.*]] = add <2 x i32> [[TMP134]], [[TMP144]]
; THR15-NEXT: [[TMP149:%.*]] = xor <2 x i32> [[TMP148]], [[TMP5]]
-; THR15-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
-; THR15-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
; THR15-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]]
+; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0
-; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP150]], [[ADD113]]
+; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP150]]
; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1
-; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP151]]
-; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
+; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP151]]
; THR15-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
; THR15-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[SUB51]]
; THR15-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB51_1]]
@@ -648,15 +597,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP158:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP157]], <2 x i32> <i32 0, i32 3>
; THR15-NEXT: [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]]
; THR15-NEXT: [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]]
-; THR15-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
+; THR15-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_2]]
; THR15-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
; THR15-NEXT: [[TMP159:%.*]] = add <2 x i32> [[TMP124]], [[TMP158]]
-; THR15-NEXT: [[TMP160:%.*]] = xor <2 x i32> [[TMP159]], [[TMP94]]
-; THR15-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[ADD44]], 15
+; THR15-NEXT: [[TMP160:%.*]] = xor <2 x i32> [[TMP159]], [[TMP103]]
+; THR15-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP89]], 15
; THR15-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
; THR15-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]]
+; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP89]]
; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
; THR15-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0
; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP161]]
@@ -674,13 +623,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP169:%.*]] = shufflevector <2 x i32> [[TMP167]], <2 x i32> [[TMP168]], <2 x i32> <i32 0, i32 3>
; THR15-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
; THR15-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
-; THR15-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]]
+; THR15-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]]
; THR15-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]]
-; THR15-NEXT: [[TMP170:%.*]] = lshr <2 x i32> [[TMP65]], <i32 15, i32 15>
+; THR15-NEXT: [[TMP170:%.*]] = lshr <2 x i32> [[TMP74]], <i32 15, i32 15>
; THR15-NEXT: [[TMP171:%.*]] = and <2 x i32> [[TMP170]], <i32 65537, i32 65537>
; THR15-NEXT: [[TMP172:%.*]] = mul <2 x i32> [[TMP171]], <i32 65535, i32 65535>
; THR15-NEXT: [[TMP173:%.*]] = add <2 x i32> [[TMP172]], [[TMP169]]
-; THR15-NEXT: [[TMP174:%.*]] = xor <2 x i32> [[TMP173]], [[TMP65]]
+; THR15-NEXT: [[TMP174:%.*]] = xor <2 x i32> [[TMP173]], [[TMP74]]
; THR15-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
; THR15-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
; THR15-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index ec152c7..b47168c 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -8,16 +8,17 @@ define i16 @test() {
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[PPREV_058_I:%.*]] = getelementptr [[S:%.*]], ptr null, i64 -1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[PPREV_058_I]], i32 1
; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]]
; CHECK: while.body.i:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i16 [ 0, [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x ptr> [ [[TMP3:%.*]], [[WHILE_BODY_I]] ], [ [[TMP0]], [[ENTRY]] ]
-; CHECK-NEXT: [[TMP3]] = getelementptr [[S]], <2 x ptr> [[TMP2]], <2 x i64> <i64 -1, i64 -1>
-; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP3]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i16 [ 0, [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[PPREV_062_I:%.*]] = phi ptr [ [[PPREV_0_I:%.*]], [[WHILE_BODY_I]] ], [ [[PPREV_058_I]], [[ENTRY]] ]
+; CHECK-NEXT: [[PEDGE_061_I:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ null, [[ENTRY]] ]
+; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr [[S]], ptr [[PEDGE_061_I]], i64 -1
+; CHECK-NEXT: [[PPREV_0_I]] = getelementptr [[S]], ptr [[PPREV_062_I]], i64 -1
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[PPREV_0_I]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
+; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP3]], [[TMP2]]
; CHECK-NEXT: br label [[WHILE_BODY_I]]
;
entry: