author     Bjorn Pettersson <bjorn.a.pettersson@ericsson.com>  2025-05-31 09:37:27 +0200
committer  Bjorn Pettersson <bjorn.a.pettersson@ericsson.com>  2025-07-04 00:38:25 +0200
commit     be4f7432d8f35a8b07dc745736dccac6ae742743 (patch)
tree       b0634208ac354eb98f629608b9f0dd3e438a5899
parent     fe73a97a1ef8c1c2df5999e0b6abecde0e89733b (diff)
[SelectionDAG] Deal with POISON for INSERT_VECTOR_ELT/INSERT_SUBVECTOR (part 3)
Target-specific patches to avoid regressions seen after "part 1", which aims at fixing github issue #141034.

One perhaps controversial change here is that convertToScalableVector now uses POISON instead of UNDEF for any additional elements added when converting to the scalable vector. This avoids ending up with things like

  t31: nxv1f32 = ...
    t32: v2f32 = extract_subvector t31, Constant:i64<0>
  t38: nxv1f32 = insert_subvector undef:nxv1f32, t32, Constant:i64<0>

since if we instead insert into poison we can simply use t31 instead of t38, without the risk that t31 would be more poisonous.
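The same idea, sketched with the SelectionDAG builder API (a minimal illustration only, not code from the patch; the helper name widenToScalable is made up):

    // Widen a fixed-length vector into a scalable register by inserting it at
    // index 0. Padding with POISON (rather than UNDEF) means the extra lanes
    // carry no guarantees at all, so a later extract_subvector/insert_subvector
    // round trip over the low lanes can be folded back to the original wide
    // value (there is no risk that the original is "more poisonous" than the
    // rebuilt node).
    static SDValue widenToScalable(SelectionDAG &DAG, EVT ScalableVT,
                                   SDValue FixedV, const SDLoc &DL) {
      assert(FixedV.getValueType().isFixedLengthVector() &&
             "expected a fixed-length vector");
      return DAG.getInsertSubvector(DL, DAG.getPOISON(ScalableVT), FixedV,
                                    /*Idx=*/0);
    }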
-rw-r--r--  llvm/include/llvm/CodeGen/SelectionDAG.h | 11
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 18
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll | 81
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll | 8
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll | 108
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll | 21
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll | 28
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll | 90
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll | 55
11 files changed, 93 insertions, 342 deletions
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index a98e46c5..3abdafa 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -953,8 +953,17 @@ public:
}
/// Insert \p SubVec at the \p Idx element of \p Vec.
+ /// If \p SkipUndef is true and \p SubVec is UNDEF/POISON, then \p Vec is
+ /// returned.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec,
- unsigned Idx) {
+ unsigned Idx, bool SkipUndef = false) {
+ // Skipping the insert of UNDEF could leave POISON elements in the
+ // resulting vector. SkipUndef is useful in situations where getNode
+ // cannot reason well enough to ignore the insert itself, e.g. with
+ // scalable vectors, when the caller of this method knows that the
+ // subvector being replaced is not POISON.
+ if (SkipUndef && SubVec.isUndef())
+ return Vec;
return getNode(ISD::INSERT_SUBVECTOR, DL, Vec.getValueType(), Vec, SubVec,
getVectorIdxConstant(Idx, DL));
}
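A rough usage sketch of the new flag (the surrounding values WideVT, Pieces, NumOpElts and DL are hypothetical, not from the patch; this mirrors how the RISC-V CONCAT_VECTORS lowering further down uses it):

    // Assemble a wide vector from per-register pieces. UNDEF/POISON pieces are
    // skipped outright: getInsertSubvector then returns Accum unchanged, so no
    // insert_subvector node is created and the affected lanes keep whatever
    // Accum already holds.
    SDValue Accum = DAG.getUNDEF(WideVT);
    for (const auto &P : enumerate(Pieces))
      Accum = DAG.getInsertSubvector(DL, Accum, P.value(),
                                     P.index() * NumOpElts,
                                     /*SkipUndef=*/true);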
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fb8bd81..761f7ea 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15125,11 +15125,14 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (PreferDUPAndInsert) {
// First, build a constant vector with the common element.
- SmallVector<SDValue, 8> Ops(NumElts, Value);
+ // Make sure to freeze the common element first, since it will also be
+ // used for lanes that should be UNDEF (we want to avoid making those
+ // elements more poisonous).
+ SmallVector<SDValue, 8> Ops(NumElts, DAG.getFreeze(Value));
SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
// Next, insert the elements that do not match the common value.
for (unsigned I = 0; I < NumElts; ++I)
- if (Op.getOperand(I) != Value)
+ if (Op.getOperand(I) != Value && !Op.getOperand(I).isUndef())
NewVector =
DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
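For intuition about the freeze (an assumed toy example, not taken from the patch): if the BUILD_VECTOR operands are <A, undef, A, undef>, the splat built here is <A, A, A, A>, so A also lands in lanes that were originally undef. If A happened to be poison, an unfrozen splat would turn those undef lanes into poison; freezing A first pins it to an arbitrary but fixed value, so the rewrite never makes the vector more poisonous:

    // Freeze the common element before reusing it for the originally-undef
    // lanes, then splat it as the starting vector.
    SDValue FrozenA = DAG.getFreeze(A);
    SmallVector<SDValue, 4> Lanes(4, FrozenA);
    SDValue Splat = DAG.getBuildVector(VT, DL, Lanes);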
@@ -28721,7 +28724,7 @@ static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
"Expected a fixed length vector operand!");
SDLoc DL(V);
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getPOISON(VT), V, Zero);
}
// Shrink V so it's just big enough to maintain a VT's worth of data.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4f280c3..55e352a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2859,7 +2859,7 @@ static SDValue convertToScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
assert(V.getValueType().isFixedLengthVector() &&
"Expected a fixed length vector operand!");
SDLoc DL(V);
- return DAG.getInsertSubvector(DL, DAG.getUNDEF(VT), V, 0);
+ return DAG.getInsertSubvector(DL, DAG.getPOISON(VT), V, 0);
}
// Shrink V so it's just big enough to maintain a VT's worth of data.
@@ -4347,7 +4347,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
DAG.getNode(ISD::BUILD_VECTOR, DL, OneRegVT, OneVRegOfOps);
SubBV = convertToScalableVector(M1VT, SubBV, DAG, Subtarget);
unsigned InsertIdx = (i / ElemsPerVReg) * NumOpElts;
- Vec = DAG.getInsertSubvector(DL, Vec, SubBV, InsertIdx);
+ Vec = DAG.getInsertSubvector(DL, Vec, SubBV, InsertIdx,
+ /*SkipUndef=*/true);
}
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
}
@@ -7849,10 +7850,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SDValue Vec = DAG.getUNDEF(VT);
for (const auto &OpIdx : enumerate(Op->ops())) {
SDValue SubVec = OpIdx.value();
- // Don't insert undef subvectors.
- if (SubVec.isUndef())
- continue;
- Vec = DAG.getInsertSubvector(DL, Vec, SubVec, OpIdx.index() * NumOpElts);
+ Vec = DAG.getInsertSubvector(DL, Vec, SubVec, OpIdx.index() * NumOpElts,
+ /*SkipUndef=*/true);
}
return Vec;
}
@@ -12272,9 +12271,10 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, HiVT, Hi);
// Reassemble the low and high pieces reversed.
// FIXME: This is a CONCAT_VECTORS.
- SDValue Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(VecVT), Hi, 0);
- return DAG.getInsertSubvector(DL, Res, Lo,
- LoVT.getVectorMinNumElements());
+ SDValue Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(VecVT), Hi, 0,
+ /*SkipUndef=*/true);
+ return DAG.getInsertSubvector(DL, Res, Lo, LoVT.getVectorMinNumElements(),
+ /*SkipUndef=*/true);
}
// Just promote the int type to i16 which will double the LMUL.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
index 2905d70..9efe0b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -37,10 +37,6 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: and z2.h, z2.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -63,15 +59,8 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h
-; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ptrue p1.h
-; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
-; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1
-; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0
-; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0
-; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h
-; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h
+; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h
+; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -82,10 +71,6 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: ptrue p1.h
-; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
@@ -104,10 +89,6 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: and z2.h, z2.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -126,10 +107,6 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: and z2.h, z2.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -173,10 +150,6 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: and z2.s, z2.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -199,15 +172,8 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s
-; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ptrue p1.s
-; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
-; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1
-; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0
-; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0
-; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s
-; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s
+; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s
+; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -218,10 +184,6 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: ptrue p1.s
-; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
@@ -240,10 +202,6 @@ define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: and z2.s, z2.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -262,10 +220,6 @@ define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: and z2.s, z2.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -310,10 +264,6 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: and z2.d, z2.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
@@ -336,15 +286,8 @@ define void @select_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d
-; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
-; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d
-; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d
+; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d
+; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -355,10 +298,6 @@ define void @select_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: ptrue p1.d
-; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
@@ -377,10 +316,6 @@ define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: and z2.d, z2.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
@@ -399,10 +334,6 @@ define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: and z2.d, z2.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
index 2f76be6..5e94007 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
@@ -12,12 +12,12 @@ define void @foo(ptr %a) #0 {
; CHECK: SelectionDAG has 13 nodes:
; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0
-; CHECK-NEXT: t21: nxv2i64,ch = LDR_ZXI<Mem:(volatile load (<vscale x 1 x s128>) from %ir.a, align 64)> t2, TargetConstant:i64<0>, t0
+; CHECK-NEXT: t22: nxv2i64,ch = LDR_ZXI<Mem:(volatile load (<vscale x 1 x s128>) from %ir.a, align 64)> t2, TargetConstant:i64<0>, t0
; CHECK-NEXT: t8: i64 = ADDXri TargetFrameIndex:i64<1>, TargetConstant:i32<0>, TargetConstant:i32<0>
; CHECK-NEXT: t6: i64 = ADDXri TargetFrameIndex:i64<0>, TargetConstant:i32<0>, TargetConstant:i32<0>
-; CHECK-NEXT: t22: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r0, align 64)> t21, t6, TargetConstant:i64<0>, t21:1
-; CHECK-NEXT: t23: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r1, align 64)> t21, t8, TargetConstant:i64<0>, t22
-; CHECK-NEXT: t10: ch = RET_ReallyLR t23
+; CHECK-NEXT: t23: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r0, align 64)> t22, t6, TargetConstant:i64<0>, t22:1
+; CHECK-NEXT: t24: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r1, align 64)> t22, t8, TargetConstant:i64<0>, t23
+; CHECK-NEXT: t10: ch = RET_ReallyLR t24
; CHECK-EMPTY:
entry:
%r0 = alloca <8 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
index 0e95da3..9cebbc4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -36,10 +36,6 @@ define void @select_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
-; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: and z2.b, z2.b, #0x1
-; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
@@ -62,15 +58,8 @@ define void @select_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z2.b, z3.b
-; VBITS_GE_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ptrue p1.b
-; VBITS_GE_256-NEXT: mov z5.b, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and z4.b, z4.b, #0x1
-; VBITS_GE_256-NEXT: and z5.b, z5.b, #0x1
-; VBITS_GE_256-NEXT: cmpne p2.b, p1/z, z4.b, #0
-; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z5.b, #0
-; VBITS_GE_256-NEXT: sel z0.b, p2, z0.b, z1.b
-; VBITS_GE_256-NEXT: sel z1.b, p1, z2.b, z3.b
+; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z1.b
+; VBITS_GE_256-NEXT: sel z1.b, p2, z2.b, z3.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -81,10 +70,6 @@ define void @select_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
-; VBITS_GE_512-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: ptrue p1.b
-; VBITS_GE_512-NEXT: and z2.b, z2.b, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z2.b, #0
; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
@@ -103,10 +88,6 @@ define void @select_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
-; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: and z2.b, z2.b, #0x1
-; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
@@ -125,10 +106,6 @@ define void @select_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
-; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: and z2.b, z2.b, #0x1
-; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
@@ -172,10 +149,6 @@ define void @select_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: and z2.h, z2.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -198,15 +171,8 @@ define void @select_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z2.h, z3.h
-; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ptrue p1.h
-; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
-; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1
-; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0
-; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0
-; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h
-; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h
+; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h
+; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -217,10 +183,6 @@ define void @select_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: ptrue p1.h
-; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
@@ -239,10 +201,6 @@ define void @select_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: and z2.h, z2.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -261,10 +219,6 @@ define void @select_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: and z2.h, z2.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -308,10 +262,6 @@ define void @select_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: and z2.s, z2.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -334,15 +284,8 @@ define void @select_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z2.s, z3.s
-; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ptrue p1.s
-; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
-; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1
-; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0
-; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0
-; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s
-; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s
+; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s
+; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -353,10 +296,6 @@ define void @select_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: ptrue p1.s
-; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
@@ -375,10 +314,6 @@ define void @select_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: and z2.s, z2.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -397,10 +332,6 @@ define void @select_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: and z2.s, z2.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -445,10 +376,6 @@ define void @select_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: and z2.d, z2.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
@@ -471,15 +398,8 @@ define void @select_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, z3.d
-; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: ptrue p1.d
-; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
-; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1
-; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d
-; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d
+; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d
+; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -490,10 +410,6 @@ define void @select_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: ptrue p1.d
-; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1
-; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
@@ -512,10 +428,6 @@ define void @select_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: and z2.d, z2.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
@@ -534,10 +446,6 @@ define void @select_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: and z2.d, z2.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index ebd32c7..093e6cd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -1198,15 +1198,11 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ptrue p2.d, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1]
; CHECK-NEXT: punpklo p2.h, p1.b
-; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d]
-; CHECK-NEXT: and z1.s, z1.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p1/z, z1.s, #0
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2]
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 8b845df..ec0693a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -199,13 +199,6 @@ define void @select_v16f16(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h
; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: and z4.h, z4.h, #0x1
-; CHECK-NEXT: and z5.h, z5.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p0/z, z4.h, #0
-; CHECK-NEXT: cmpne p0.h, p0/z, z5.h, #0
; CHECK-NEXT: mov z0.h, p1/m, z1.h
; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h
; CHECK-NEXT: stp q0, q1, [x0]
@@ -441,13 +434,6 @@ define void @select_v8f32(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s
; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s
-; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: and z4.s, z4.s, #0x1
-; CHECK-NEXT: and z5.s, z5.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p0/z, z4.s, #0
-; CHECK-NEXT: cmpne p0.s, p0/z, z5.s, #0
; CHECK-NEXT: mov z0.s, p1/m, z1.s
; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s
; CHECK-NEXT: stp q0, q1, [x0]
@@ -572,13 +558,6 @@ define void @select_v4f64(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d
; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d
-; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: and z4.d, z4.d, #0x1
-; CHECK-NEXT: and z5.d, z5.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p0/z, z4.d, #0
-; CHECK-NEXT: cmpne p0.d, p0/z, z5.d, #0
; CHECK-NEXT: mov z0.d, p1/m, z1.d
; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d
; CHECK-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 12b7886..3970113 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -293,13 +293,6 @@ define void @select_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b
; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b
-; CHECK-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z5.b, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: and z4.b, z4.b, #0x1
-; CHECK-NEXT: and z5.b, z5.b, #0x1
-; CHECK-NEXT: cmpne p1.b, p0/z, z4.b, #0
-; CHECK-NEXT: cmpne p0.b, p0/z, z5.b, #0
; CHECK-NEXT: mov z0.b, p1/m, z1.b
; CHECK-NEXT: sel z1.b, p0, z2.b, z3.b
; CHECK-NEXT: stp q0, q1, [x0]
@@ -704,13 +697,6 @@ define void @select_v16i16(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h
; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: and z4.h, z4.h, #0x1
-; CHECK-NEXT: and z5.h, z5.h, #0x1
-; CHECK-NEXT: cmpne p1.h, p0/z, z4.h, #0
-; CHECK-NEXT: cmpne p0.h, p0/z, z5.h, #0
; CHECK-NEXT: mov z0.h, p1/m, z1.h
; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h
; CHECK-NEXT: stp q0, q1, [x0]
@@ -925,13 +911,6 @@ define void @select_v8i32(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s
; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s
-; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: and z4.s, z4.s, #0x1
-; CHECK-NEXT: and z5.s, z5.s, #0x1
-; CHECK-NEXT: cmpne p1.s, p0/z, z4.s, #0
-; CHECK-NEXT: cmpne p0.s, p0/z, z5.s, #0
; CHECK-NEXT: mov z0.s, p1/m, z1.s
; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s
; CHECK-NEXT: stp q0, q1, [x0]
@@ -1065,13 +1044,6 @@ define void @select_v4i64(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d
; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d
-; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: and z4.d, z4.d, #0x1
-; CHECK-NEXT: and z5.d, z5.d, #0x1
-; CHECK-NEXT: cmpne p1.d, p0/z, z4.d, #0
-; CHECK-NEXT: cmpne p0.d, p0/z, z5.d, #0
; CHECK-NEXT: mov z0.d, p1/m, z1.d
; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d
; CHECK-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 5aa3a24..aba9056 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING1
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING2
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN
; Check that the default value enables the web folding and
@@ -8,35 +8,20 @@
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING
define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) {
-; NO_FOLDING1-LABEL: vfwmul_v2f116_multiple_users:
-; NO_FOLDING1: # %bb.0:
-; NO_FOLDING1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING1-NEXT: vfwcvt.f.f.v v11, v8
-; NO_FOLDING1-NEXT: vfwcvt.f.f.v v8, v9
-; NO_FOLDING1-NEXT: vfwcvt.f.f.v v9, v10
-; NO_FOLDING1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; NO_FOLDING1-NEXT: vfmul.vv v10, v11, v8
-; NO_FOLDING1-NEXT: vfadd.vv v11, v11, v9
-; NO_FOLDING1-NEXT: vfsub.vv v8, v8, v9
-; NO_FOLDING1-NEXT: vse32.v v10, (a0)
-; NO_FOLDING1-NEXT: vse32.v v11, (a1)
-; NO_FOLDING1-NEXT: vse32.v v8, (a2)
-; NO_FOLDING1-NEXT: ret
-;
-; NO_FOLDING2-LABEL: vfwmul_v2f116_multiple_users:
-; NO_FOLDING2: # %bb.0:
-; NO_FOLDING2-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING2-NEXT: vfwcvt.f.f.v v11, v8
-; NO_FOLDING2-NEXT: vfwcvt.f.f.v v8, v9
-; NO_FOLDING2-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; NO_FOLDING2-NEXT: vfmul.vv v9, v11, v8
-; NO_FOLDING2-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; NO_FOLDING2-NEXT: vfwadd.wv v11, v11, v10
-; NO_FOLDING2-NEXT: vfwsub.wv v8, v8, v10
-; NO_FOLDING2-NEXT: vse32.v v9, (a0)
-; NO_FOLDING2-NEXT: vse32.v v11, (a1)
-; NO_FOLDING2-NEXT: vse32.v v8, (a2)
-; NO_FOLDING2-NEXT: ret
+; NO_FOLDING-LABEL: vfwmul_v2f116_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8
+; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9
+; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT: vse32.v v10, (a0)
+; NO_FOLDING-NEXT: vse32.v v11, (a1)
+; NO_FOLDING-NEXT: vse32.v v8, (a2)
+; NO_FOLDING-NEXT: ret
;
; ZVFH-LABEL: vfwmul_v2f116_multiple_users:
; ZVFH: # %bb.0:
@@ -76,35 +61,20 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a,
}
define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) {
-; NO_FOLDING1-LABEL: vfwmul_v2f32_multiple_users:
-; NO_FOLDING1: # %bb.0:
-; NO_FOLDING1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; NO_FOLDING1-NEXT: vfwcvt.f.f.v v11, v8
-; NO_FOLDING1-NEXT: vfwcvt.f.f.v v8, v9
-; NO_FOLDING1-NEXT: vfwcvt.f.f.v v9, v10
-; NO_FOLDING1-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; NO_FOLDING1-NEXT: vfmul.vv v10, v11, v8
-; NO_FOLDING1-NEXT: vfadd.vv v11, v11, v9
-; NO_FOLDING1-NEXT: vfsub.vv v8, v8, v9
-; NO_FOLDING1-NEXT: vse64.v v10, (a0)
-; NO_FOLDING1-NEXT: vse64.v v11, (a1)
-; NO_FOLDING1-NEXT: vse64.v v8, (a2)
-; NO_FOLDING1-NEXT: ret
-;
-; NO_FOLDING2-LABEL: vfwmul_v2f32_multiple_users:
-; NO_FOLDING2: # %bb.0:
-; NO_FOLDING2-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; NO_FOLDING2-NEXT: vfwcvt.f.f.v v11, v8
-; NO_FOLDING2-NEXT: vfwcvt.f.f.v v8, v9
-; NO_FOLDING2-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; NO_FOLDING2-NEXT: vfmul.vv v9, v11, v8
-; NO_FOLDING2-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; NO_FOLDING2-NEXT: vfwadd.wv v11, v11, v10
-; NO_FOLDING2-NEXT: vfwsub.wv v8, v8, v10
-; NO_FOLDING2-NEXT: vse64.v v9, (a0)
-; NO_FOLDING2-NEXT: vse64.v v11, (a1)
-; NO_FOLDING2-NEXT: vse64.v v8, (a2)
-; NO_FOLDING2-NEXT: ret
+; NO_FOLDING-LABEL: vfwmul_v2f32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8
+; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9
+; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT: vse64.v v10, (a0)
+; NO_FOLDING-NEXT: vse64.v v11, (a1)
+; NO_FOLDING-NEXT: vse64.v v8, (a2)
+; NO_FOLDING-NEXT: ret
;
; FOLDING-LABEL: vfwmul_v2f32_multiple_users:
; FOLDING: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
index b093e9e3..227a428 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
; Check that the default value enables the web folding and
@@ -16,38 +16,21 @@
; We need the web size to be at least 3 for the folding to happen, because
; %c has 3 uses.
define <2 x i16> @vwmul_v2i16_multiple_users(ptr %x, ptr %y, ptr %z) {
-; NO_FOLDING1-LABEL: vwmul_v2i16_multiple_users:
-; NO_FOLDING1: # %bb.0:
-; NO_FOLDING1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING1-NEXT: vle8.v v8, (a0)
-; NO_FOLDING1-NEXT: vle8.v v9, (a1)
-; NO_FOLDING1-NEXT: vle8.v v10, (a2)
-; NO_FOLDING1-NEXT: vsext.vf2 v11, v8
-; NO_FOLDING1-NEXT: vsext.vf2 v8, v9
-; NO_FOLDING1-NEXT: vsext.vf2 v9, v10
-; NO_FOLDING1-NEXT: vmul.vv v8, v11, v8
-; NO_FOLDING1-NEXT: vadd.vv v10, v11, v9
-; NO_FOLDING1-NEXT: vsub.vv v9, v11, v9
-; NO_FOLDING1-NEXT: vor.vv v8, v8, v10
-; NO_FOLDING1-NEXT: vor.vv v8, v8, v9
-; NO_FOLDING1-NEXT: ret
-;
-; NO_FOLDING2-LABEL: vwmul_v2i16_multiple_users:
-; NO_FOLDING2: # %bb.0:
-; NO_FOLDING2-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING2-NEXT: vle8.v v8, (a0)
-; NO_FOLDING2-NEXT: vle8.v v9, (a1)
-; NO_FOLDING2-NEXT: vle8.v v10, (a2)
-; NO_FOLDING2-NEXT: vsext.vf2 v11, v8
-; NO_FOLDING2-NEXT: vsext.vf2 v8, v9
-; NO_FOLDING2-NEXT: vmul.vv v8, v11, v8
-; NO_FOLDING2-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; NO_FOLDING2-NEXT: vwadd.wv v9, v11, v10
-; NO_FOLDING2-NEXT: vwsub.wv v11, v11, v10
-; NO_FOLDING2-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; NO_FOLDING2-NEXT: vor.vv v8, v8, v9
-; NO_FOLDING2-NEXT: vor.vv v8, v8, v11
-; NO_FOLDING2-NEXT: ret
+; NO_FOLDING-LABEL: vwmul_v2i16_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; NO_FOLDING-NEXT: vle8.v v8, (a0)
+; NO_FOLDING-NEXT: vle8.v v9, (a1)
+; NO_FOLDING-NEXT: vle8.v v10, (a2)
+; NO_FOLDING-NEXT: vsext.vf2 v11, v8
+; NO_FOLDING-NEXT: vsext.vf2 v8, v9
+; NO_FOLDING-NEXT: vsext.vf2 v9, v10
+; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: ret
;
; FOLDING-LABEL: vwmul_v2i16_multiple_users:
; FOLDING: # %bb.0: