5 files changed, 258 insertions, 275 deletions
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4caadef..8235b53 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3877,6 +3877,47 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     return convertFromScalableVector(VT, Vec, DAG, Subtarget);
   }
 
+  // For m1 vectors, if we have non-undef values in both halves of our vector,
+  // split the vector into low and high halves, build them separately, then
+  // use a vselect to combine them. For long vectors, this cuts the critical
+  // path of the vslide1down sequence in half, and gives us an opportunity
+  // to special case each half independently. Note that we don't change the
+  // length of the sub-vectors here, so if both fallback to the generic
+  // vslide1down path, we should be able to fold the vselect into the final
+  // vslidedown (for the undef tail) for the first half w/ masking.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumUndefElts =
+      count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
+  unsigned NumDefElts = NumElts - NumUndefElts;
+  if (NumDefElts >= 8 && NumDefElts > NumElts / 2 &&
+      ContainerVT.bitsLE(getLMUL1VT(ContainerVT))) {
+    SmallVector<SDValue> SubVecAOps, SubVecBOps;
+    SmallVector<SDValue> MaskVals;
+    SDValue UndefElem = DAG.getUNDEF(Op->getOperand(0)->getValueType(0));
+    SubVecAOps.reserve(NumElts);
+    SubVecBOps.reserve(NumElts);
+    for (unsigned i = 0; i < NumElts; i++) {
+      SDValue Elem = Op->getOperand(i);
+      if (i < NumElts / 2) {
+        SubVecAOps.push_back(Elem);
+        SubVecBOps.push_back(UndefElem);
+      } else {
+        SubVecAOps.push_back(UndefElem);
+        SubVecBOps.push_back(Elem);
+      }
+      bool SelectMaskVal = (i < NumElts / 2);
+      MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
+    }
+    assert(SubVecAOps.size() == NumElts && SubVecBOps.size() == NumElts &&
+           MaskVals.size() == NumElts);
+
+    SDValue SubVecA = DAG.getBuildVector(VT, DL, SubVecAOps);
+    SDValue SubVecB = DAG.getBuildVector(VT, DL, SubVecBOps);
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+    SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
+    return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, SubVecA, SubVecB);
+  }
+
   // Cap the cost at a value linear to the number of elements in the vector.
   // The default lowering is to use the stack. The vector store + scalar loads
   // is linear in VL. However, at high lmuls vslide1down and vslidedown end up
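To make the intent of the new path concrete, here is a minimal IR sketch of the kind of build_vector it targets, adapted from the buildvec_v8f32_zvl256 test updated below. The function name is illustrative, and the vscale_range(4, 128) attribute is an assumption standing in for a zvl256b-class target so that <8 x float> fits in a single m1 register; it is not part of the patch itself.

; Illustrative sketch only: eight defined elements satisfy NumDefElts >= 8 and
; the container fits within LMUL 1, so the split path above applies when
; compiled with llc -mattr=+v for such a target.
define <8 x float> @buildvec_split_sketch(float %e0, float %e1, float %e2, float %e3, float %e4, float %e5, float %e6, float %e7) vscale_range(4, 128) {
  %v0 = insertelement <8 x float> poison, float %e0, i64 0
  %v1 = insertelement <8 x float> %v0, float %e1, i64 1
  %v2 = insertelement <8 x float> %v1, float %e2, i64 2
  %v3 = insertelement <8 x float> %v2, float %e3, i64 3
  %v4 = insertelement <8 x float> %v3, float %e4, i64 4
  %v5 = insertelement <8 x float> %v4, float %e5, i64 5
  %v6 = insertelement <8 x float> %v5, float %e6, i64 6
  %v7 = insertelement <8 x float> %v6, float %e7, i64 7
  ret <8 x float> %v7
}

Because both halves keep the full <8 x float> length, the second half can start from a fresh vfmv.v.f of %e4 while the first half's chain completes independently, and the vselect folds into the final masked vslidedown.vi (mask 15 covering the low four lanes). That is the shape checked in the buildvec_v8f32_zvl256 and buildvec_v8f64_zvl512 diffs below.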
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index a2bd862..8e214e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1399,15 +1399,17 @@ define <2 x double> @vid_step2_v2f64() {
 define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float %e3, float %e4, float %e5, float %e6, float %e7) vscale_range(4, 128) {
 ; CHECK-LABEL: buildvec_v8f32_zvl256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa3
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT:    vfslide1down.vf v9, v8, fa3
+; CHECK-NEXT:    vfmv.v.f v8, fa4
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa6
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x float> poison, float %e0, i64 0
   %v1 = insertelement <8 x float> %v0, float %e1, i64 1
@@ -1448,15 +1450,17 @@ define <8 x double> @buildvec_v8f64_zvl256(double %e0, double %e1, double %e2, d
 define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7) vscale_range(8, 128) {
 ; CHECK-LABEL: buildvec_v8f64_zvl512:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa3
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT:    vfslide1down.vf v9, v8, fa3
+; CHECK-NEXT:    vfmv.v.f v8, fa4
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa6
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x double> poison, double %e0, i64 0
   %v1 = insertelement <8 x double> %v0, double %e1, i64 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
index ed0b15c..85b8490 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
@@ -359,28 +359,28 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT:    feq.d a0, fa3, fa3
 ; RV32-NEXT:    fmax.d fa3, fa3, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; RV32-NEXT:    fld fa2, 40(sp)
 ; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    fld fa3, 32(sp)
 ; RV32-NEXT:    neg a0, a0
 ; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
-; RV32-NEXT:    feq.d a0, fa3, fa3
-; RV32-NEXT:    fmax.d fa3, fa3, fa5
+; RV32-NEXT:    feq.d a2, fa2, fa2
+; RV32-NEXT:    fmax.d fa3, fa2, fa5
 ; RV32-NEXT:    fmin.d fa3, fa3, fa4
-; RV32-NEXT:    fcvt.w.d a2, fa3, rtz
-; RV32-NEXT:    fld fa3, 40(sp)
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    feq.d a0, fa3, fa3
+; RV32-NEXT:    fcvt.w.d a3, fa3, rtz
+; RV32-NEXT:    fld fa3, 32(sp)
+; RV32-NEXT:    vslide1down.vx v8, v10, a0
+;
RV32-NEXT: neg a0, a2 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: feq.d a2, fa3, fa3 +; RV32-NEXT: neg a2, a2 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 -; RV32-NEXT: fcvt.w.d a2, fa3, rtz +; RV32-NEXT: fcvt.w.d a3, fa3, rtz ; RV32-NEXT: fld fa3, 48(sp) -; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: and a2, a2, a3 +; RV32-NEXT: vmv.v.x v9, a2 +; RV32-NEXT: vslide1down.vx v9, v9, a0 ; RV32-NEXT: feq.d a0, fa3, fa3 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 @@ -388,15 +388,17 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: fld fa3, 56(sp) ; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: vslide1down.vx v9, v9, a0 ; RV32-NEXT: feq.d a0, fa3, fa3 ; RV32-NEXT: neg a0, a0 ; RV32-NEXT: fmax.d fa5, fa3, fa5 ; RV32-NEXT: fmin.d fa5, fa5, fa4 ; RV32-NEXT: fcvt.w.d a2, fa5, rtz ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: vmv.v.i v0, 15 +; RV32-NEXT: vslide1down.vx v9, v9, a0 +; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; RV32-NEXT: vse8.v v9, (a1) ; RV32-NEXT: addi sp, s0, -128 ; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload @@ -458,28 +460,28 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: feq.d a0, fa3, fa3 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: fld fa2, 40(sp) ; RV64-NEXT: fcvt.l.d a2, fa3, rtz -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: fld fa3, 32(sp) ; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vslide1down.vx v8, v10, a0 -; RV64-NEXT: feq.d a0, fa3, fa3 -; RV64-NEXT: fmax.d fa3, fa3, fa5 +; RV64-NEXT: feq.d a2, fa2, fa2 +; RV64-NEXT: fmax.d fa3, fa2, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 -; RV64-NEXT: fcvt.l.d a2, fa3, rtz -; RV64-NEXT: fld fa3, 40(sp) -; RV64-NEXT: neg a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: feq.d a0, fa3, fa3 +; RV64-NEXT: fcvt.l.d a3, fa3, rtz +; RV64-NEXT: fld fa3, 32(sp) +; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: neg a0, a2 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: feq.d a2, fa3, fa3 +; RV64-NEXT: negw a2, a2 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 -; RV64-NEXT: fcvt.l.d a2, fa3, rtz +; RV64-NEXT: fcvt.l.d a3, fa3, rtz ; RV64-NEXT: fld fa3, 48(sp) -; RV64-NEXT: neg a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: and a2, a2, a3 +; RV64-NEXT: vmv.v.x v9, a2 +; RV64-NEXT: vslide1down.vx v9, v9, a0 ; RV64-NEXT: feq.d a0, fa3, fa3 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 @@ -487,15 +489,17 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: fld fa3, 56(sp) ; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: vslide1down.vx v9, v9, a0 ; RV64-NEXT: feq.d a0, fa3, fa3 ; RV64-NEXT: neg a0, a0 ; RV64-NEXT: fmax.d fa5, fa3, fa5 ; RV64-NEXT: fmin.d fa5, fa5, fa4 ; RV64-NEXT: fcvt.l.d a2, fa5, rtz ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: vmv.v.i v0, 15 +; RV64-NEXT: vslide1down.vx v9, v9, a0 +; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; RV64-NEXT: vse8.v v9, (a1) ; RV64-NEXT: addi sp, s0, -128 ; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload ; 
RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload @@ -553,11 +557,11 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa4, v8 ; RV32-NEXT: fmax.d fa4, fa4, fa3 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: fld fa2, 32(sp) +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: fld fa2, 40(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz -; RV32-NEXT: fld fa4, 40(sp) +; RV32-NEXT: fld fa4, 32(sp) ; RV32-NEXT: fmax.d fa2, fa2, fa3 ; RV32-NEXT: fmin.d fa2, fa2, fa5 ; RV32-NEXT: fcvt.wu.d a2, fa2, rtz @@ -570,14 +574,16 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz ; RV32-NEXT: fld fa4, 56(sp) -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: vmv.v.x v9, a3 +; RV32-NEXT: vslide1down.vx v9, v9, a2 +; RV32-NEXT: vslide1down.vx v9, v9, a0 ; RV32-NEXT: fmax.d fa4, fa4, fa3 ; RV32-NEXT: fmin.d fa5, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa5, rtz -; RV32-NEXT: vslide1down.vx v8, v8, a0 -; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: vmv.v.i v0, 15 +; RV32-NEXT: vslide1down.vx v9, v9, a0 +; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t +; RV32-NEXT: vse8.v v9, (a1) ; RV32-NEXT: addi sp, s0, -128 ; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload @@ -627,11 +633,11 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vfmv.f.s fa4, v8 ; RV64-NEXT: fmax.d fa4, fa4, fa3 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: fld fa2, 32(sp) +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: fld fa2, 40(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz -; RV64-NEXT: fld fa4, 40(sp) +; RV64-NEXT: fld fa4, 32(sp) ; RV64-NEXT: fmax.d fa2, fa2, fa3 ; RV64-NEXT: fmin.d fa2, fa2, fa5 ; RV64-NEXT: fcvt.lu.d a2, fa2, rtz @@ -644,14 +650,16 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz ; RV64-NEXT: fld fa4, 56(sp) -; RV64-NEXT: vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a3 -; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: vmv.v.x v9, a3 +; RV64-NEXT: vslide1down.vx v9, v9, a2 +; RV64-NEXT: vslide1down.vx v9, v9, a0 ; RV64-NEXT: fmax.d fa4, fa4, fa3 ; RV64-NEXT: fmin.d fa5, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa5, rtz -; RV64-NEXT: vslide1down.vx v8, v8, a0 -; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: vmv.v.i v0, 15 +; RV64-NEXT: vslide1down.vx v9, v9, a0 +; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t +; RV64-NEXT: vse8.v v9, (a1) ; RV64-NEXT: addi sp, s0, -128 ; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index e691e63..ed6c01a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1181,89 +1181,46 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { -; RV32-LABEL: buildvec_v16i8_loads_contigous: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: lbu 
a1, 1(a0) -; RV32-NEXT: lbu a2, 2(a0) -; RV32-NEXT: lbu a3, 3(a0) -; RV32-NEXT: lbu a4, 4(a0) -; RV32-NEXT: lbu a5, 5(a0) -; RV32-NEXT: lbu a6, 6(a0) -; RV32-NEXT: lbu a7, 7(a0) -; RV32-NEXT: lbu t0, 8(a0) -; RV32-NEXT: lbu t1, 9(a0) -; RV32-NEXT: lbu t2, 10(a0) -; RV32-NEXT: lbu t3, 11(a0) -; RV32-NEXT: lbu t4, 12(a0) -; RV32-NEXT: lbu t5, 13(a0) -; RV32-NEXT: lbu t6, 14(a0) -; RV32-NEXT: lbu s0, 15(a0) -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vlse8.v v8, (a0), zero -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: vslide1down.vx v8, v8, a4 -; RV32-NEXT: vslide1down.vx v8, v8, a5 -; RV32-NEXT: vslide1down.vx v8, v8, a6 -; RV32-NEXT: vslide1down.vx v8, v8, a7 -; RV32-NEXT: vslide1down.vx v8, v8, t0 -; RV32-NEXT: vslide1down.vx v8, v8, t1 -; RV32-NEXT: vslide1down.vx v8, v8, t2 -; RV32-NEXT: vslide1down.vx v8, v8, t3 -; RV32-NEXT: vslide1down.vx v8, v8, t4 -; RV32-NEXT: vslide1down.vx v8, v8, t5 -; RV32-NEXT: vslide1down.vx v8, v8, t6 -; RV32-NEXT: vslide1down.vx v8, v8, s0 -; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_v16i8_loads_contigous: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset s0, -8 -; RV64-NEXT: lbu a1, 1(a0) -; RV64-NEXT: lbu a2, 2(a0) -; RV64-NEXT: lbu a3, 3(a0) -; RV64-NEXT: lbu a4, 4(a0) -; RV64-NEXT: lbu a5, 5(a0) -; RV64-NEXT: lbu a6, 6(a0) -; RV64-NEXT: lbu a7, 7(a0) -; RV64-NEXT: lbu t0, 8(a0) -; RV64-NEXT: lbu t1, 9(a0) -; RV64-NEXT: lbu t2, 10(a0) -; RV64-NEXT: lbu t3, 11(a0) -; RV64-NEXT: lbu t4, 12(a0) -; RV64-NEXT: lbu t5, 13(a0) -; RV64-NEXT: lbu t6, 14(a0) -; RV64-NEXT: lbu s0, 15(a0) -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vlse8.v v8, (a0), zero -; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a3 -; RV64-NEXT: vslide1down.vx v8, v8, a4 -; RV64-NEXT: vslide1down.vx v8, v8, a5 -; RV64-NEXT: vslide1down.vx v8, v8, a6 -; RV64-NEXT: vslide1down.vx v8, v8, a7 -; RV64-NEXT: vslide1down.vx v8, v8, t0 -; RV64-NEXT: vslide1down.vx v8, v8, t1 -; RV64-NEXT: vslide1down.vx v8, v8, t2 -; RV64-NEXT: vslide1down.vx v8, v8, t3 -; RV64-NEXT: vslide1down.vx v8, v8, t4 -; RV64-NEXT: vslide1down.vx v8, v8, t5 -; RV64-NEXT: vslide1down.vx v8, v8, t6 -; RV64-NEXT: vslide1down.vx v8, v8, s0 -; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_v16i8_loads_contigous: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a1, 1(a0) +; CHECK-NEXT: lbu a2, 2(a0) +; CHECK-NEXT: lbu a3, 3(a0) +; CHECK-NEXT: lbu a4, 4(a0) +; CHECK-NEXT: lbu a5, 5(a0) +; CHECK-NEXT: lbu a6, 6(a0) +; CHECK-NEXT: lbu a7, 7(a0) +; CHECK-NEXT: lbu t0, 9(a0) +; CHECK-NEXT: lbu t1, 10(a0) +; CHECK-NEXT: lbu t2, 11(a0) +; CHECK-NEXT: lbu t3, 12(a0) +; CHECK-NEXT: lbu t4, 13(a0) +; CHECK-NEXT: lbu t5, 14(a0) +; CHECK-NEXT: lbu t6, 15(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: addi a0, a0, 8 +; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vlse8.v v9, (a0), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v10, v8, a7 +; 
CHECK-NEXT: vslide1down.vx v8, v9, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: vslide1down.vx v8, v8, t2 +; CHECK-NEXT: vslide1down.vx v8, v8, t3 +; CHECK-NEXT: vslide1down.vx v8, v8, t4 +; CHECK-NEXT: vslide1down.vx v8, v8, t5 +; CHECK-NEXT: vslide1down.vx v8, v8, t6 +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v10, 8, v0.t +; CHECK-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 %p3 = getelementptr i8, ptr %p, i32 2 %p4 = getelementptr i8, ptr %p, i32 3 @@ -1318,89 +1275,46 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { -; RV32-LABEL: buildvec_v16i8_loads_gather: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: lbu a1, 1(a0) -; RV32-NEXT: lbu a2, 22(a0) -; RV32-NEXT: lbu a3, 31(a0) -; RV32-NEXT: lbu a4, 44(a0) -; RV32-NEXT: lbu a5, 55(a0) -; RV32-NEXT: lbu a6, 623(a0) -; RV32-NEXT: lbu a7, 75(a0) -; RV32-NEXT: lbu t0, 82(a0) -; RV32-NEXT: lbu t1, 93(a0) -; RV32-NEXT: lbu t2, 105(a0) -; RV32-NEXT: lbu t3, 161(a0) -; RV32-NEXT: lbu t4, 124(a0) -; RV32-NEXT: lbu t5, 163(a0) -; RV32-NEXT: lbu t6, 144(a0) -; RV32-NEXT: lbu s0, 154(a0) -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vlse8.v v8, (a0), zero -; RV32-NEXT: vslide1down.vx v8, v8, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: vslide1down.vx v8, v8, a4 -; RV32-NEXT: vslide1down.vx v8, v8, a5 -; RV32-NEXT: vslide1down.vx v8, v8, a6 -; RV32-NEXT: vslide1down.vx v8, v8, a7 -; RV32-NEXT: vslide1down.vx v8, v8, t0 -; RV32-NEXT: vslide1down.vx v8, v8, t1 -; RV32-NEXT: vslide1down.vx v8, v8, t2 -; RV32-NEXT: vslide1down.vx v8, v8, t3 -; RV32-NEXT: vslide1down.vx v8, v8, t4 -; RV32-NEXT: vslide1down.vx v8, v8, t5 -; RV32-NEXT: vslide1down.vx v8, v8, t6 -; RV32-NEXT: vslide1down.vx v8, v8, s0 -; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: buildvec_v16i8_loads_gather: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset s0, -8 -; RV64-NEXT: lbu a1, 1(a0) -; RV64-NEXT: lbu a2, 22(a0) -; RV64-NEXT: lbu a3, 31(a0) -; RV64-NEXT: lbu a4, 44(a0) -; RV64-NEXT: lbu a5, 55(a0) -; RV64-NEXT: lbu a6, 623(a0) -; RV64-NEXT: lbu a7, 75(a0) -; RV64-NEXT: lbu t0, 82(a0) -; RV64-NEXT: lbu t1, 93(a0) -; RV64-NEXT: lbu t2, 105(a0) -; RV64-NEXT: lbu t3, 161(a0) -; RV64-NEXT: lbu t4, 124(a0) -; RV64-NEXT: lbu t5, 163(a0) -; RV64-NEXT: lbu t6, 144(a0) -; RV64-NEXT: lbu s0, 154(a0) -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vlse8.v v8, (a0), zero -; RV64-NEXT: vslide1down.vx v8, v8, a1 -; RV64-NEXT: vslide1down.vx v8, v8, a2 -; RV64-NEXT: vslide1down.vx v8, v8, a3 -; RV64-NEXT: vslide1down.vx v8, v8, a4 -; RV64-NEXT: vslide1down.vx v8, v8, a5 -; RV64-NEXT: vslide1down.vx v8, v8, a6 -; RV64-NEXT: vslide1down.vx v8, v8, a7 -; RV64-NEXT: vslide1down.vx v8, v8, t0 -; RV64-NEXT: vslide1down.vx v8, v8, t1 -; RV64-NEXT: vslide1down.vx v8, v8, t2 -; RV64-NEXT: vslide1down.vx v8, v8, t3 -; RV64-NEXT: vslide1down.vx v8, v8, t4 -; RV64-NEXT: vslide1down.vx v8, v8, t5 -; RV64-NEXT: vslide1down.vx v8, v8, t6 -; RV64-NEXT: vslide1down.vx v8, v8, s0 -; RV64-NEXT: ld 
s0, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: ret +; CHECK-LABEL: buildvec_v16i8_loads_gather: +; CHECK: # %bb.0: +; CHECK-NEXT: lbu a1, 1(a0) +; CHECK-NEXT: lbu a2, 22(a0) +; CHECK-NEXT: lbu a3, 31(a0) +; CHECK-NEXT: lbu a4, 44(a0) +; CHECK-NEXT: lbu a5, 55(a0) +; CHECK-NEXT: lbu a6, 623(a0) +; CHECK-NEXT: lbu a7, 75(a0) +; CHECK-NEXT: lbu t0, 93(a0) +; CHECK-NEXT: lbu t1, 105(a0) +; CHECK-NEXT: lbu t2, 161(a0) +; CHECK-NEXT: lbu t3, 124(a0) +; CHECK-NEXT: lbu t4, 163(a0) +; CHECK-NEXT: lbu t5, 144(a0) +; CHECK-NEXT: lbu t6, 154(a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: addi a0, a0, 82 +; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vslide1down.vx v8, v8, a4 +; CHECK-NEXT: vlse8.v v9, (a0), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v10, v8, a7 +; CHECK-NEXT: vslide1down.vx v8, v9, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: vslide1down.vx v8, v8, t2 +; CHECK-NEXT: vslide1down.vx v8, v8, t3 +; CHECK-NEXT: vslide1down.vx v8, v8, t4 +; CHECK-NEXT: vslide1down.vx v8, v8, t5 +; CHECK-NEXT: vslide1down.vx v8, v8, t6 +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v10, 8, v0.t +; CHECK-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 %p3 = getelementptr i8, ptr %p, i32 22 %p4 = getelementptr i8, ptr %p, i32 31 @@ -1560,21 +1474,26 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; CHECK-NEXT: lbu a3, 55(a0) ; CHECK-NEXT: lbu a4, 623(a0) ; CHECK-NEXT: lbu a5, 75(a0) -; CHECK-NEXT: lbu a6, 82(a0) -; CHECK-NEXT: lbu a7, 93(a0) -; CHECK-NEXT: lbu t0, 105(a0) -; CHECK-NEXT: lbu a0, 161(a0) +; CHECK-NEXT: lbu a6, 93(a0) +; CHECK-NEXT: lbu a7, 105(a0) +; CHECK-NEXT: lbu t0, 161(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: addi a0, a0, 82 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vlse8.v v9, (a0), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vslide1down.vx v8, v8, a5 -; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v10, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v9, a6 ; CHECK-NEXT: vslide1down.vx v8, v8, a7 ; CHECK-NEXT: vslide1down.vx v8, v8, t0 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v10, 8, v0.t ; CHECK-NEXT: ret %p4 = getelementptr i8, ptr %p, i32 31 %p5 = getelementptr i8, ptr %p, i32 44 @@ -1615,26 +1534,31 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; CHECK-NEXT: lbu a2, 44(a0) ; CHECK-NEXT: lbu a3, 55(a0) ; CHECK-NEXT: lbu a4, 75(a0) -; CHECK-NEXT: lbu a5, 82(a0) -; CHECK-NEXT: lbu a6, 93(a0) -; CHECK-NEXT: lbu a7, 124(a0) -; CHECK-NEXT: lbu t0, 144(a0) -; CHECK-NEXT: lbu t1, 154(a0) +; CHECK-NEXT: lbu a5, 93(a0) +; CHECK-NEXT: lbu a6, 124(a0) +; CHECK-NEXT: lbu a7, 144(a0) +; CHECK-NEXT: lbu t0, 154(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: addi a0, a0, 82 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslidedown.vi 
v8, v8, 2 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vlse8.v v9, (a0), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vslide1down.vx v8, v8, a5 -; CHECK-NEXT: vslide1down.vx v8, v8, a6 +; CHECK-NEXT: vslide1down.vx v10, v8, a4 +; CHECK-NEXT: vslide1down.vx v8, v9, a5 ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v8, v8, a6 ; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 ; CHECK-NEXT: vslide1down.vx v8, v8, t0 -; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v10, 8, v0.t ; CHECK-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 %p3 = getelementptr i8, ptr %p, i32 22 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll index dd0fc5a..c295fed 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -278,32 +278,34 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) { define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) { ; CHECK-LABEL: buildvec_mask_nonconst_v8i1_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 -; CHECK-NEXT: li a4, 1 -; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vslide1down.vx v9, v8, a0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vslide1down.vx v9, v9, a0 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, zero +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_nonconst_v8i1_2: ; ZVE32F: # %bb.0: -; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; ZVE32F-NEXT: li a4, 1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 +; ZVE32F-NEXT: li a0, 1 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; ZVE32F-NEXT: vslide1down.vx v8, v8, zero +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 ; ZVE32F-NEXT: ret @@ -321,32 +323,34 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) { define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) optsize { ; CHECK-LABEL: buildvec_mask_optsize_nonconst_v8i1_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 -; CHECK-NEXT: li a4, 1 -; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: 
vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vslide1down.vx v9, v8, a0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vslide1down.vx v9, v9, a0 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, zero +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_optsize_nonconst_v8i1_2: ; ZVE32F: # %bb.0: -; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; ZVE32F-NEXT: li a4, 1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 +; ZVE32F-NEXT: li a0, 1 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; ZVE32F-NEXT: vslide1down.vx v8, v8, zero +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 ; ZVE32F-NEXT: ret @@ -364,30 +368,32 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 % define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize { ; CHECK-LABEL: buildvec_mask_optsize_nonconst_v8i1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vslide1down.vx v9, v8, a0 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 +; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_optsize_nonconst_v8i1: ; ZVE32F: # %bb.0: -; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 ; ZVE32F-NEXT: ret |