11 files changed, 93 insertions, 342 deletions
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index a98e46c5..3abdafa 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -953,8 +953,17 @@ public:
   }
 
   /// Insert \p SubVec at the \p Idx element of \p Vec.
+  /// If \p SkipUndef is true and \p SubVec is UNDEF/POISON, then \p Vec is
+  /// returned.
   SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec,
-                             unsigned Idx) {
+                             unsigned Idx, bool SkipUndef = false) {
+    // Skipping the insert of an UNDEF subvector can leave POISON elements in
+    // the resulting vector. SkipUndef is useful in situations where getNode
+    // cannot prove on its own that ignoring the insert is safe, e.g. with
+    // scalable vectors, and the caller knows that the subvector being
+    // replaced is not POISON.
+    if (SkipUndef && SubVec.isUndef())
+      return Vec;
     return getNode(ISD::INSERT_SUBVECTOR, DL, Vec.getValueType(), Vec, SubVec,
                    getVectorIdxConstant(Idx, DL));
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fb8bd81..761f7ea 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15125,11 +15125,14 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
 
   if (PreferDUPAndInsert) {
     // First, build a constant vector with the common element.
-    SmallVector<SDValue, 8> Ops(NumElts, Value);
+    // Freeze the common element first, since it will also be used for
+    // indices that should be UNDEF (we must avoid making those elements
+    // more poisonous).
+    SmallVector<SDValue, 8> Ops(NumElts, DAG.getFreeze(Value));
     SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
     // Next, insert the elements that do not match the common value.
     for (unsigned I = 0; I < NumElts; ++I)
-      if (Op.getOperand(I) != Value)
+      if (Op.getOperand(I) != Value && !Op.getOperand(I).isUndef())
         NewVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
                                 Op.getOperand(I),
                                 DAG.getConstant(I, DL, MVT::i64));
@@ -28721,7 +28724,7 @@ static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
          "Expected a fixed length vector operand!");
   SDLoc DL(V);
   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
-  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getPOISON(VT), V, Zero);
 }
 
 // Shrink V so it's just big enough to maintain a VT's worth of data.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4f280c3..55e352a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2859,7 +2859,7 @@ static SDValue convertToScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
   assert(V.getValueType().isFixedLengthVector() &&
          "Expected a fixed length vector operand!");
   SDLoc DL(V);
-  return DAG.getInsertSubvector(DL, DAG.getUNDEF(VT), V, 0);
+  return DAG.getInsertSubvector(DL, DAG.getPOISON(VT), V, 0);
 }
 
 // Shrink V so it's just big enough to maintain a VT's worth of data.
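The new SkipUndef flag is what the RISC-V hunks below rely on. As a usage sketch (not code from this patch; the helper name assembleFromParts is invented for illustration, modeled on the CONCAT_VECTORS lowering further down), a caller that assembles a vector from parts, starting from a vector it created as UNDEF itself, can let getInsertSubvector drop UNDEF parts up front, even for scalable types where getNode cannot fold the insert away:

  // A minimal sketch, assuming the usual SelectionDAG includes and
  // 'using namespace llvm;'.
  static SDValue assembleFromParts(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                   ArrayRef<SDValue> Parts,
                                   unsigned NumOpElts) {
    // The lanes being replaced start out as UNDEF created right here, so the
    // caller knows they are not POISON and skipping is safe.
    SDValue Vec = DAG.getUNDEF(VT);
    for (const auto &P : enumerate(Parts))
      // With SkipUndef=true an UNDEF part leaves Vec untouched instead of
      // emitting an INSERT_SUBVECTOR node.
      Vec = DAG.getInsertSubvector(DL, Vec, P.value(), P.index() * NumOpElts,
                                   /*SkipUndef=*/true);
    return Vec;
  }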
@@ -4347,7 +4347,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
           DAG.getNode(ISD::BUILD_VECTOR, DL, OneRegVT, OneVRegOfOps);
       SubBV = convertToScalableVector(M1VT, SubBV, DAG, Subtarget);
       unsigned InsertIdx = (i / ElemsPerVReg) * NumOpElts;
-      Vec = DAG.getInsertSubvector(DL, Vec, SubBV, InsertIdx);
+      Vec = DAG.getInsertSubvector(DL, Vec, SubBV, InsertIdx,
+                                   /*SkipUndef=*/true);
     }
     return convertFromScalableVector(VT, Vec, DAG, Subtarget);
   }
@@ -7849,10 +7850,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     SDValue Vec = DAG.getUNDEF(VT);
     for (const auto &OpIdx : enumerate(Op->ops())) {
       SDValue SubVec = OpIdx.value();
-      // Don't insert undef subvectors.
-      if (SubVec.isUndef())
-        continue;
-      Vec = DAG.getInsertSubvector(DL, Vec, SubVec, OpIdx.index() * NumOpElts);
+      Vec = DAG.getInsertSubvector(DL, Vec, SubVec, OpIdx.index() * NumOpElts,
+                                   /*SkipUndef=*/true);
     }
     return Vec;
   }
@@ -12272,9 +12271,10 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
     Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, HiVT, Hi);
     // Reassemble the low and high pieces reversed.
     // FIXME: This is a CONCAT_VECTORS.
-    SDValue Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(VecVT), Hi, 0);
-    return DAG.getInsertSubvector(DL, Res, Lo,
-                                  LoVT.getVectorMinNumElements());
+    SDValue Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(VecVT), Hi, 0,
+                                         /*SkipUndef=*/true);
+    return DAG.getInsertSubvector(DL, Res, Lo, LoVT.getVectorMinNumElements(),
+                                  /*SkipUndef=*/true);
   }
 
   // Just promote the int type to i16 which will double the LMUL.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
index 2905d70..9efe0b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -37,10 +37,6 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    and z2.h, z2.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
 ; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -63,15 +59,8 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z2.h, z3.h
-; VBITS_GE_256-NEXT:    mov z4.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.h
-; VBITS_GE_256-NEXT:    mov z5.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and z4.h, z4.h, #0x1
-; VBITS_GE_256-NEXT:    and z5.h, z5.h, #0x1
-; VBITS_GE_256-NEXT:    cmpne p2.h, p1/z, z4.h, #0
-; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z5.h, #0
-; VBITS_GE_256-NEXT:    sel z0.h, p2, z0.h, z1.h
-; VBITS_GE_256-NEXT:    sel z1.h, p1, z2.h, z3.h
+; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z1.h
+; VBITS_GE_256-NEXT:    sel z1.h, p2, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -82,10 +71,6 @@ define void @select_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_512-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p1.h
-; VBITS_GE_512-NEXT:    and z2.h, z2.h, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.h, p1/z, z2.h, #0
 ; VBITS_GE_512-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -104,10 +89,6 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    and z2.h, z2.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
 ; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -126,10 +107,6 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    and z2.h, z2.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
 ; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -173,10 +150,6 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    and z2.s, z2.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -199,15 +172,8 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z2.s, z3.s
-; VBITS_GE_256-NEXT:    mov z4.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    mov z5.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and z4.s, z4.s, #0x1
-; VBITS_GE_256-NEXT:    and z5.s, z5.s, #0x1
-; VBITS_GE_256-NEXT:    cmpne p2.s, p1/z, z4.s, #0
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z5.s, #0
-; VBITS_GE_256-NEXT:    sel z0.s, p2, z0.s, z1.s
-; VBITS_GE_256-NEXT:    sel z1.s, p1, z2.s, z3.s
+; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z1.s
+; VBITS_GE_256-NEXT:    sel z1.s, p2, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -218,10 +184,6 @@ define void @select_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_512-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p1.s
-; VBITS_GE_512-NEXT:    and z2.s, z2.s, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.s, p1/z, z2.s, #0
 ; VBITS_GE_512-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -240,10 +202,6 @@ define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    and z2.s, z2.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -262,10 +220,6 @@ define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    and z2.s, z2.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -310,10 +264,6 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -336,15 +286,8 @@ define void @select_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, z3.d
-; VBITS_GE_256-NEXT:    mov z4.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    mov z5.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and z4.d, z4.d, #0x1
-; VBITS_GE_256-NEXT:    and z5.d, z5.d, #0x1
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    sel z0.d, p2, z0.d, z1.d
-; VBITS_GE_256-NEXT:    sel z1.d, p1, z2.d, z3.d
+; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
+; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -355,10 +298,6 @@ define void @select_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_512-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p1.d
-; VBITS_GE_512-NEXT:    and z2.d, z2.d, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -377,10 +316,6 @@ define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -399,10 +334,6 @@ define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
index 2f76be6..5e94007 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll
@@ -12,12 +12,12 @@ define void @foo(ptr %a) #0 {
 ; CHECK: SelectionDAG has 13 nodes:
 ; CHECK-NEXT: t0: ch,glue = EntryToken
 ; CHECK-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0
-; CHECK-NEXT: t21: nxv2i64,ch = LDR_ZXI<Mem:(volatile load (<vscale x 1 x s128>) from %ir.a, align 64)> t2, TargetConstant:i64<0>, t0
+; CHECK-NEXT: t22: nxv2i64,ch = LDR_ZXI<Mem:(volatile load (<vscale x 1 x s128>) from %ir.a, align 64)> t2, TargetConstant:i64<0>, t0
 ; CHECK-NEXT: t8: i64 = ADDXri TargetFrameIndex:i64<1>, TargetConstant:i32<0>, TargetConstant:i32<0>
 ; CHECK-NEXT: t6: i64 = ADDXri TargetFrameIndex:i64<0>, TargetConstant:i32<0>, TargetConstant:i32<0>
-; CHECK-NEXT: t22: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r0, align 64)> t21, t6, TargetConstant:i64<0>, t21:1
-; CHECK-NEXT: t23: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r1, align 64)> t21, t8, TargetConstant:i64<0>, t22
-; CHECK-NEXT: t10: ch = RET_ReallyLR t23
+; CHECK-NEXT: t23: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r0, align 64)> t22, t6, TargetConstant:i64<0>, t22:1
+; CHECK-NEXT: t24: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r1, align 64)> t22, t8, TargetConstant:i64<0>, t23
+; CHECK-NEXT: t10: ch = RET_ReallyLR t24
 ; CHECK-EMPTY:
 entry:
   %r0 = alloca <8 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
index 0e95da3..9cebbc4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -36,10 +36,6 @@ define void @select_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
-; CHECK-NEXT:    mov z2.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    and z2.b, z2.b, #0x1
-; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
 ; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -62,15 +58,8 @@ define void @select_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
 ; VBITS_GE_256-NEXT:    cmpeq p2.b, p0/z, z2.b, z3.b
-; VBITS_GE_256-NEXT:    mov z4.b, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.b
-; VBITS_GE_256-NEXT:    mov z5.b, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and z4.b, z4.b, #0x1
-; VBITS_GE_256-NEXT:    and z5.b, z5.b, #0x1
-; VBITS_GE_256-NEXT:    cmpne p2.b, p1/z, z4.b, #0
-; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z5.b, #0
-; VBITS_GE_256-NEXT:    sel z0.b, p2, z0.b, z1.b
-; VBITS_GE_256-NEXT:    sel z1.b, p1, z2.b, z3.b
+; VBITS_GE_256-NEXT:    sel z0.b, p1, z0.b, z1.b
+; VBITS_GE_256-NEXT:    sel z1.b, p2, z2.b, z3.b
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
 ; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -81,10 +70,6 @@ define void @select_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
-; VBITS_GE_512-NEXT:    mov z2.b, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p1.b
-; VBITS_GE_512-NEXT:    and z2.b, z2.b, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.b, p1/z, z2.b, #0
 ; VBITS_GE_512-NEXT:    sel z0.b, p1, z0.b, z1.b
 ; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -103,10 +88,6 @@ define void @select_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
-; CHECK-NEXT:    mov z2.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    and z2.b, z2.b, #0x1
-; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
 ; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -125,10 +106,6 @@ define void @select_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
-; CHECK-NEXT:    mov z2.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    and z2.b, z2.b, #0x1
-; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
 ; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -172,10 +149,6 @@ define void @select_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    and z2.h, z2.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
 ; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -198,15 +171,8 @@ define void @select_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
 ; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z2.h, z3.h
-; VBITS_GE_256-NEXT:    mov z4.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.h
-; VBITS_GE_256-NEXT:    mov z5.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and z4.h, z4.h, #0x1
-; VBITS_GE_256-NEXT:    and z5.h, z5.h, #0x1
-; VBITS_GE_256-NEXT:    cmpne p2.h, p1/z, z4.h, #0
-; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z5.h, #0
-; VBITS_GE_256-NEXT:    sel z0.h, p2, z0.h, z1.h
-; VBITS_GE_256-NEXT:    sel z1.h, p1, z2.h, z3.h
+; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z1.h
+; VBITS_GE_256-NEXT:    sel z1.h, p2, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -217,10 +183,6 @@ define void @select_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
-; VBITS_GE_512-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p1.h
-; VBITS_GE_512-NEXT:    and z2.h, z2.h, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.h, p1/z, z2.h, #0
 ; VBITS_GE_512-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -239,10 +201,6 @@ define void @select_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    and z2.h, z2.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
 ; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -261,10 +219,6 @@ define void @select_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    and z2.h, z2.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
 ; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -308,10 +262,6 @@ define void @select_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    and z2.s, z2.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -334,15 +284,8 @@ define void @select_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
 ; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z2.s, z3.s
-; VBITS_GE_256-NEXT:    mov z4.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    mov z5.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and z4.s, z4.s, #0x1
-; VBITS_GE_256-NEXT:    and z5.s, z5.s, #0x1
-; VBITS_GE_256-NEXT:    cmpne p2.s, p1/z, z4.s, #0
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z5.s, #0
-; VBITS_GE_256-NEXT:    sel z0.s, p2, z0.s, z1.s
-; VBITS_GE_256-NEXT:    sel z1.s, p1, z2.s, z3.s
+; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z1.s
+; VBITS_GE_256-NEXT:    sel z1.s, p2, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -353,10 +296,6 @@ define void @select_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
-; VBITS_GE_512-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p1.s
-; VBITS_GE_512-NEXT:    and z2.s, z2.s, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.s, p1/z, z2.s, #0
 ; VBITS_GE_512-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -375,10 +314,6 @@ define void @select_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    and z2.s, z2.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -397,10 +332,6 @@ define void @select_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    and z2.s, z2.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -445,10 +376,6 @@ define void @select_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -471,15 +398,8 @@ define void @select_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
 ; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, z3.d
-; VBITS_GE_256-NEXT:    mov z4.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    mov z5.d, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    and z4.d, z4.d, #0x1
-; VBITS_GE_256-NEXT:    and z5.d, z5.d, #0x1
-; VBITS_GE_256-NEXT:    cmpne p2.d, p1/z, z4.d, #0
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z5.d, #0
-; VBITS_GE_256-NEXT:    sel z0.d, p2, z0.d, z1.d
-; VBITS_GE_256-NEXT:    sel z1.d, p1, z2.d, z3.d
+; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
+; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -490,10 +410,6 @@ define void @select_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
-; VBITS_GE_512-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p1.d
-; VBITS_GE_512-NEXT:    and z2.d, z2.d, #0x1
-; VBITS_GE_512-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -512,10 +428,6 @@ define void @select_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -534,10 +446,6 @@ define void @select_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index ebd32c7..093e6cd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -1198,15 +1198,11 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p2.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x2]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    ld1d { z0.d }, p2/z, [x1]
 ; CHECK-NEXT:    punpklo p2.h, p1.b
-; CHECK-NEXT:    mov z1.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    ld1w { z0.d }, p2/z, [z0.d]
-; CHECK-NEXT:    and z1.s, z1.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p1/z, z1.s, #0
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x2]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 8b845df..ec0693a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -199,13 +199,6 @@ define void @select_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmeq p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    mov z4.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z5.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    and z4.h, z4.h, #0x1
-; CHECK-NEXT:    and z5.h, z5.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p0/z, z4.h, #0
-; CHECK-NEXT:    cmpne p0.h, p0/z, z5.h, #0
 ; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -441,13 +434,6 @@ define void @select_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z1.s, z0.s
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z2.s, z3.s
-; CHECK-NEXT:    mov z4.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z5.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    and z4.s, z4.s, #0x1
-; CHECK-NEXT:    and z5.s, z5.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p0/z, z4.s, #0
-; CHECK-NEXT:    cmpne p0.s, p0/z, z5.s, #0
 ; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -572,13 +558,6 @@ define void @select_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z1.d, z0.d
 ; CHECK-NEXT:    fcmeq p0.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    mov z4.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z5.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x1
-; CHECK-NEXT:    and z5.d, z5.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p0/z, z4.d, #0
-; CHECK-NEXT:    cmpne p0.d, p0/z, z5.d, #0
 ; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 12b7886..3970113 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -293,13 +293,6 @@ define void @select_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    cmpeq p1.b, p0/z, z1.b, z0.b
 ; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
-; CHECK-NEXT:    mov z4.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z5.b, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    and z4.b, z4.b, #0x1
-; CHECK-NEXT:    and z5.b, z5.b, #0x1
-; CHECK-NEXT:    cmpne p1.b, p0/z, z4.b, #0
-; CHECK-NEXT:    cmpne p0.b, p0/z, z5.b, #0
 ; CHECK-NEXT:    mov z0.b, p1/m, z1.b
 ; CHECK-NEXT:    sel z1.b, p0, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -704,13 +697,6 @@ define void @select_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    cmpeq p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    mov z4.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z5.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    and z4.h, z4.h, #0x1
-; CHECK-NEXT:    and z5.h, z5.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p0/z, z4.h, #0
-; CHECK-NEXT:    cmpne p0.h, p0/z, z5.h, #0
 ; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -925,13 +911,6 @@ define void @select_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    cmpeq p1.s, p0/z, z1.s, z0.s
 ; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z3.s
-; CHECK-NEXT:    mov z4.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z5.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    and z4.s, z4.s, #0x1
-; CHECK-NEXT:    and z5.s, z5.s, #0x1
-; CHECK-NEXT:    cmpne p1.s, p0/z, z4.s, #0
-; CHECK-NEXT:    cmpne p0.s, p0/z, z5.s, #0
 ; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -1065,13 +1044,6 @@ define void @select_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z1.d, z0.d
 ; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    mov z4.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z5.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x1
-; CHECK-NEXT:    and z5.d, z5.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p0/z, z4.d, #0
-; CHECK-NEXT:    cmpne p0.d, p0/z, z5.d, #0
 ; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 5aa3a24..aba9056 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING1
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING2
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN
 ; Check that the default value enables the web folding and
@@ -8,35 +8,20 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING
 
 define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) {
-; NO_FOLDING1-LABEL: vfwmul_v2f116_multiple_users:
-; NO_FOLDING1:       # %bb.0:
-; NO_FOLDING1-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v11, v8
-; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v8, v9
-; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v9, v10
-; NO_FOLDING1-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; NO_FOLDING1-NEXT:    vfmul.vv v10, v11, v8
-; NO_FOLDING1-NEXT:    vfadd.vv v11, v11, v9
-; NO_FOLDING1-NEXT:    vfsub.vv v8, v8, v9
-; NO_FOLDING1-NEXT:    vse32.v v10, (a0)
-; NO_FOLDING1-NEXT:    vse32.v v11, (a1)
-; NO_FOLDING1-NEXT:    vse32.v v8, (a2)
-; NO_FOLDING1-NEXT:    ret
-;
-; NO_FOLDING2-LABEL: vfwmul_v2f116_multiple_users:
-; NO_FOLDING2:       # %bb.0:
-; NO_FOLDING2-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING2-NEXT:    vfwcvt.f.f.v v11, v8
-; NO_FOLDING2-NEXT:    vfwcvt.f.f.v v8, v9
-; NO_FOLDING2-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; NO_FOLDING2-NEXT:    vfmul.vv v9, v11, v8
-; NO_FOLDING2-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; NO_FOLDING2-NEXT:    vfwadd.wv v11, v11, v10
-; NO_FOLDING2-NEXT:    vfwsub.wv v8, v8, v10
-; NO_FOLDING2-NEXT:    vse32.v v9, (a0)
-; NO_FOLDING2-NEXT:    vse32.v v11, (a1)
-; NO_FOLDING2-NEXT:    vse32.v v8, (a2)
-; NO_FOLDING2-NEXT:    ret
+; NO_FOLDING-LABEL: vfwmul_v2f116_multiple_users:
+; NO_FOLDING:       # %bb.0:
+; NO_FOLDING-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v11, v8
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; NO_FOLDING-NEXT:    vfmul.vv v10, v11, v8
+; NO_FOLDING-NEXT:    vfadd.vv v11, v11, v9
+; NO_FOLDING-NEXT:    vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT:    vse32.v v10, (a0)
+; NO_FOLDING-NEXT:    vse32.v v11, (a1)
+; NO_FOLDING-NEXT:    vse32.v v8, (a2)
+; NO_FOLDING-NEXT:    ret
 ;
 ; ZVFH-LABEL: vfwmul_v2f116_multiple_users:
 ; ZVFH:       # %bb.0:
@@ -76,35 +61,20 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a,
 }
 
 define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) {
-; NO_FOLDING1-LABEL: vfwmul_v2f32_multiple_users:
-; NO_FOLDING1:       # %bb.0:
-; NO_FOLDING1-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v11, v8
-; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v8, v9
-; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v9, v10
-; NO_FOLDING1-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; NO_FOLDING1-NEXT:    vfmul.vv v10, v11, v8
-; NO_FOLDING1-NEXT:    vfadd.vv v11, v11, v9
-; NO_FOLDING1-NEXT:    vfsub.vv v8, v8, v9
-; NO_FOLDING1-NEXT:    vse64.v v10, (a0)
-; NO_FOLDING1-NEXT:    vse64.v v11, (a1)
-; NO_FOLDING1-NEXT:    vse64.v v8, (a2)
-; NO_FOLDING1-NEXT:    ret
-;
-; NO_FOLDING2-LABEL: vfwmul_v2f32_multiple_users:
-; NO_FOLDING2:       # %bb.0:
-; NO_FOLDING2-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; NO_FOLDING2-NEXT:    vfwcvt.f.f.v v11, v8
-; NO_FOLDING2-NEXT:    vfwcvt.f.f.v v8, v9
-; NO_FOLDING2-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; NO_FOLDING2-NEXT:    vfmul.vv v9, v11, v8
-; NO_FOLDING2-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; NO_FOLDING2-NEXT:    vfwadd.wv v11, v11, v10
-; NO_FOLDING2-NEXT:    vfwsub.wv v8, v8, v10
-; NO_FOLDING2-NEXT:    vse64.v v9, (a0)
-; NO_FOLDING2-NEXT:    vse64.v v11, (a1)
-; NO_FOLDING2-NEXT:    vse64.v v8, (a2)
-; NO_FOLDING2-NEXT:    ret
+; NO_FOLDING-LABEL: vfwmul_v2f32_multiple_users:
+; NO_FOLDING:       # %bb.0:
+; NO_FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v11, v8
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT:    vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT:    vfmul.vv v10, v11, v8
+; NO_FOLDING-NEXT:    vfadd.vv v11, v11, v9
+; NO_FOLDING-NEXT:    vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT:    vse64.v v10, (a0)
+; NO_FOLDING-NEXT:    vse64.v v11, (a1)
+; NO_FOLDING-NEXT:    vse64.v v8, (a2)
+; NO_FOLDING-NEXT:    ret
 ;
 ; FOLDING-LABEL: vfwmul_v2f32_multiple_users:
 ; FOLDING:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
index b093e9e3..227a428 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
 ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
 ; Check that the default value enables the web folding and
@@ -16,38 +16,21 @@
 ; We need the web size to be at least 3 for the folding to happen, because
 ; %c has 3 uses.
 define <2 x i16> @vwmul_v2i16_multiple_users(ptr %x, ptr %y, ptr %z) {
-; NO_FOLDING1-LABEL: vwmul_v2i16_multiple_users:
-; NO_FOLDING1:       # %bb.0:
-; NO_FOLDING1-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING1-NEXT:    vle8.v v8, (a0)
-; NO_FOLDING1-NEXT:    vle8.v v9, (a1)
-; NO_FOLDING1-NEXT:    vle8.v v10, (a2)
-; NO_FOLDING1-NEXT:    vsext.vf2 v11, v8
-; NO_FOLDING1-NEXT:    vsext.vf2 v8, v9
-; NO_FOLDING1-NEXT:    vsext.vf2 v9, v10
-; NO_FOLDING1-NEXT:    vmul.vv v8, v11, v8
-; NO_FOLDING1-NEXT:    vadd.vv v10, v11, v9
-; NO_FOLDING1-NEXT:    vsub.vv v9, v11, v9
-; NO_FOLDING1-NEXT:    vor.vv v8, v8, v10
-; NO_FOLDING1-NEXT:    vor.vv v8, v8, v9
-; NO_FOLDING1-NEXT:    ret
-;
-; NO_FOLDING2-LABEL: vwmul_v2i16_multiple_users:
-; NO_FOLDING2:       # %bb.0:
-; NO_FOLDING2-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING2-NEXT:    vle8.v v8, (a0)
-; NO_FOLDING2-NEXT:    vle8.v v9, (a1)
-; NO_FOLDING2-NEXT:    vle8.v v10, (a2)
-; NO_FOLDING2-NEXT:    vsext.vf2 v11, v8
-; NO_FOLDING2-NEXT:    vsext.vf2 v8, v9
-; NO_FOLDING2-NEXT:    vmul.vv v8, v11, v8
-; NO_FOLDING2-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; NO_FOLDING2-NEXT:    vwadd.wv v9, v11, v10
-; NO_FOLDING2-NEXT:    vwsub.wv v11, v11, v10
-; NO_FOLDING2-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; NO_FOLDING2-NEXT:    vor.vv v8, v8, v9
-; NO_FOLDING2-NEXT:    vor.vv v8, v8, v11
-; NO_FOLDING2-NEXT:    ret
+; NO_FOLDING-LABEL: vwmul_v2i16_multiple_users:
+; NO_FOLDING:       # %bb.0:
+; NO_FOLDING-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; NO_FOLDING-NEXT:    vle8.v v8, (a0)
+; NO_FOLDING-NEXT:    vle8.v v9, (a1)
+; NO_FOLDING-NEXT:    vle8.v v10, (a2)
+; NO_FOLDING-NEXT:    vsext.vf2 v11, v8
+; NO_FOLDING-NEXT:    vsext.vf2 v8, v9
+; NO_FOLDING-NEXT:    vsext.vf2 v9, v10
+; NO_FOLDING-NEXT:    vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT:    vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT:    vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT:    vor.vv v8, v8, v10
+; NO_FOLDING-NEXT:    vor.vv v8, v8, v9
+; NO_FOLDING-NEXT:    ret
 ;
 ; FOLDING-LABEL: vwmul_v2i16_multiple_users:
 ; FOLDING:       # %bb.0:
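As a closing note on the AArch64 side, the LowerBUILD_VECTOR change above amounts to the pattern sketched below (a standalone sketch assuming the usual SelectionDAG environment; dupAndInsert is an invented name, since the real code lives inline in AArch64TargetLowering::LowerBUILD_VECTOR). Freezing the common element before broadcasting keeps the UNDEF lanes from turning into poison, which is what makes it legal to skip re-inserting the UNDEF operands afterwards:

  // Broadcast a frozen copy of the common element, then patch up only the
  // lanes that differ and are not UNDEF.
  static SDValue dupAndInsert(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                              SDValue Op, SDValue Value, unsigned NumElts) {
    // getFreeze pins down a concrete (if unknown) value, so lanes left as
    // the broadcast cannot become more poisonous than before.
    SmallVector<SDValue, 8> Ops(NumElts, DAG.getFreeze(Value));
    SDValue NewVector = DAG.getBuildVector(VT, DL, Ops);
    for (unsigned I = 0; I < NumElts; ++I)
      if (Op.getOperand(I) != Value && !Op.getOperand(I).isUndef())
        NewVector =
            DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
                        Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
    return NewVector;
  }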