Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 844 |
1 file changed, 645 insertions, 199 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2970cf4..fbd875a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -130,7 +130,7 @@ static cl::opt<bool> MulConstantOptimization( X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) - : TargetLowering(TM), Subtarget(STI) { + : TargetLowering(TM, STI), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); @@ -635,6 +635,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FROUNDEVEN, VT, Action); setOperationAction(ISD::FTRUNC, VT, Action); setOperationAction(ISD::FLDEXP, VT, Action); + setOperationAction(ISD::FSINCOSPI, VT, Action); }; if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -2072,8 +2073,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasVBMI2()) { for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { - setOperationAction(ISD::FSHL, VT, Custom); - setOperationAction(ISD::FSHR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Legal); + setOperationAction(ISD::FSHR, VT, Legal); } setOperationAction(ISD::ROTL, MVT::v32i16, Custom); @@ -2088,8 +2089,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) { for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32, MVT::v4i64}) { - setOperationAction(ISD::FSHL, VT, Custom); - setOperationAction(ISD::FSHR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::FSHR, VT, Subtarget.hasVLX() ? Legal : Custom); } } @@ -2097,9 +2098,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // pre-AVX512 equivalents. Without VLX we use 512-bit operations for // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { + for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32, + MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16, + MVT::v16f32, MVT::v8f64}) + setOperationAction(ISD::FLDEXP, VT, Custom); + // These operations are handled on non-VLX by artificially widening in // isel patterns. - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); @@ -2150,6 +2155,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget.hasCDI()) { + for (auto VT : {MVT::i256, MVT::i512}) { + if (VT == MVT::i512 && !Subtarget.useAVX512Regs()) + continue; + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); + } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); } @@ -2572,8 +2585,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. 
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -2655,6 +2668,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::AVGFLOORU, ISD::BITREVERSE, ISD::ADD, + ISD::SADDSAT, + ISD::SSUBSAT, ISD::FADD, ISD::FSUB, ISD::FNEG, @@ -2694,6 +2709,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::STRICT_FP_EXTEND, ISD::FP_ROUND, ISD::STRICT_FP_ROUND, + ISD::FSHL, + ISD::FSHR, ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN}); @@ -2871,6 +2888,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VZEXT_MOVL: + case X86ISD::COMPRESS: + case X86ISD::EXPAND: return true; } } @@ -3087,7 +3106,7 @@ static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) { } bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, + const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const { Info.flags = MachineMemOperand::MONone; @@ -3454,6 +3473,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) return true; + // If we have a large vector type (even if illegal), don't bitcast to large + // (illegal) scalar types. Better to load fewer vectors and extract. + if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() && + BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0) + return false; + return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } @@ -5358,12 +5383,12 @@ bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) { int getRoundingModeX86(unsigned RM) { switch (static_cast<::llvm::RoundingMode>(RM)) { // clang-format off - case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break; - case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break; - case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break; - case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break; - default: - return X86::rmInvalid; // Invalid rounding mode + case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; + case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; + case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; + case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; + default: return X86::rmInvalid; + // clang-format on } } @@ -5816,6 +5841,48 @@ static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, } return false; } + case X86ISD::COMPRESS: { + SDValue CmpVec = N.getOperand(0); + SDValue PassThru = N.getOperand(1); + SDValue CmpMask = N.getOperand(2); + APInt UndefElts; + SmallVector<APInt> EltBits; + if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits)) + return false; + assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems && + "Illegal compression mask"); + for (unsigned I = 0; I != NumElems; ++I) { + if (!EltBits[I].isZero()) + Mask.push_back(I); + } + while (Mask.size() != NumElems) { + Mask.push_back(NumElems + Mask.size()); + } + Ops.push_back(CmpVec); + Ops.push_back(PassThru); + return true; + } + case X86ISD::EXPAND: { + SDValue ExpVec = N.getOperand(0); + SDValue PassThru = N.getOperand(1); + SDValue ExpMask = N.getOperand(2); 
+ APInt UndefElts; + SmallVector<APInt> EltBits; + if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits)) + return false; + assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems && + "Illegal expansion mask"); + unsigned ExpIndex = 0; + for (unsigned I = 0; I != NumElems; ++I) { + if (EltBits[I].isZero()) + Mask.push_back(I + NumElems); + else + Mask.push_back(ExpIndex++); + } + Ops.push_back(ExpVec); + Ops.push_back(PassThru); + return true; + } default: llvm_unreachable("unknown target shuffle node"); } @@ -7270,7 +7337,10 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, - bool IsAfterLegalize) { + bool IsAfterLegalize, + unsigned Depth = 0) { + if (Depth >= SelectionDAG::MaxRecursionDepth) + return SDValue(); // Limit search depth. if ((VT.getScalarSizeInBits() % 8) != 0) return SDValue(); @@ -7444,7 +7514,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); SDValue HalfLD = EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, - DAG, Subtarget, IsAfterLegalize); + DAG, Subtarget, IsAfterLegalize, Depth + 1); if (HalfLD) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), HalfLD, DAG.getVectorIdxConstant(0, DL)); @@ -7521,7 +7591,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, VT.getSizeInBits() / ScalarSize); if (TLI.isTypeLegal(BroadcastVT)) { if (SDValue RepeatLoad = EltsFromConsecutiveLoads( - RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) { + RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize, + Depth + 1)) { SDValue Broadcast = RepeatLoad; if (RepeatSize > ScalarSize) { while (Broadcast.getValueSizeInBits() < VT.getSizeInBits()) @@ -7542,6 +7613,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, } } + // REVERSE - attempt to match the loads in reverse and then shuffle back. + // TODO: Do this for any permute or mismatching element counts. + if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() && + TLI.isTypeLegal(VT) && VT.isVector() && + NumElems == VT.getVectorNumElements()) { + SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend()); + if (SDValue RevLd = EltsFromConsecutiveLoads( + VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) { + SmallVector<int, 16> ReverseMask(NumElems); + std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0); + return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask); + } + } + return SDValue(); } @@ -7948,7 +8033,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); - if (Opc == ISD::UNDEF) + if (Opc == ISD::POISON || Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { @@ -7991,7 +8076,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, if (!VecIn1.getNode()) return SDValue(); - VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + VecIn2 = VecIn2.getNode() ? 
VecIn2 : DAG.getPOISON(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); for (unsigned Idx : InsertIndices) @@ -8115,6 +8200,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, case X86ISD::FHSUB: case X86ISD::HADD: case X86ISD::HSUB: + case X86ISD::HADDS: + case X86ISD::HSUBS: return true; } return false; @@ -8426,9 +8513,7 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, // DAGCombiner::visitFADDForFMACombine. It would be good to have one // function that would answer if it is Ok to fuse MUL + ADD to FMADD // or MUL + ADDSUB to FMADDSUB. - const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = - Options.AllowFPOpFusion == FPOpFusion::Fast || (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract()); if (!AllowFusion) return false; @@ -8856,6 +8941,56 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, return SDValue(); } +/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable. +static SDValue widenBuildVector(BuildVectorSDNode *BVOp, SDLoc const &DL, + X86Subtarget const &Subtarget, + SelectionDAG &DAG) { + using namespace SDPatternMatch; + MVT VT = BVOp->getSimpleValueType(0); + MVT SVT = VT.getScalarType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltBits = SVT.getSizeInBits(); + + if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) + return SDValue(); + + unsigned WideBits = 2 * EltBits; + MVT WideSVT = MVT::getIntegerVT(WideBits); + MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2); + if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT)) + return SDValue(); + + SmallVector<SDValue, 8> WideOps; + for (unsigned I = 0; I != NumElts; I += 2) { + SDValue Op0 = BVOp->getOperand(I + 0); + SDValue Op1 = BVOp->getOperand(I + 1); + + if (Op0.isUndef() && Op1.isUndef()) { + WideOps.push_back(DAG.getUNDEF(WideSVT)); + continue; + } + + // TODO: Constant repacking? + + // Merge scalars that have been split from the same source. + SDValue X, Y; + if (sd_match(Op0, m_Trunc(m_Value(X))) && + sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) && + peekThroughTruncates(X) == peekThroughTruncates(Y) && + X.getValueType().bitsGE(WideSVT)) { + if (X.getValueType().bitsGT(WideSVT)) + X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X); + WideOps.push_back(X); + continue; + } + + return SDValue(); + } + + assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector"); + return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps)); +} + /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. @@ -9326,6 +9461,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return BitOp; if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG)) return Blend; + if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG)) + return WideBV; unsigned NumZero = ZeroMask.popcount(); unsigned NumNonZero = NonZeroMask.popcount(); @@ -18370,16 +18507,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, SmallVector<int> Mask(OrigMask); // Canonicalize the shuffle with any horizontal ops inputs. + // Don't attempt this if the shuffle can still be widened as we may lose + // whole lane shuffle patterns. // NOTE: This may update Ops and Mask. 
- if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( - Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget)) - return DAG.getBitcast(VT, HOp); + if (!canWidenShuffleElements(Mask)) { + if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( + Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget)) + return DAG.getBitcast(VT, HOp); - V1 = DAG.getBitcast(VT, Ops[0]); - V2 = DAG.getBitcast(VT, Ops[1]); - assert(NumElements == (int)Mask.size() && - "canonicalizeShuffleMaskWithHorizOp " - "shouldn't alter the shuffle mask size"); + V1 = DAG.getBitcast(VT, Ops[0]); + V2 = DAG.getBitcast(VT, Ops[1]); + assert(NumElements == (int)Mask.size() && + "canonicalizeShuffleMaskWithHorizOp " + "shouldn't alter the shuffle mask size"); + } // Canonicalize zeros/ones/fp splat constants to ensure no undefs. // These will be materialized uniformly anyway, so make splat matching easier. @@ -19142,6 +19283,72 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return SDValue(); } +static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue X = Op.getOperand(0); + MVT XTy = X.getSimpleValueType(); + SDValue Exp = Op.getOperand(1); + + switch (XTy.SimpleTy) { + default: + return SDValue(); + case MVT::f16: + if (!Subtarget.hasFP16()) + X = DAG.getFPExtendOrRound(X, DL, MVT::f32); + [[fallthrough]]; + case MVT::f32: + case MVT::f64: { + MVT VT = MVT::getVectorVT(X.getSimpleValueType(), + 128 / X.getSimpleValueType().getSizeInBits()); + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); + SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X); + SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp); + SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp); + SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0); + return DAG.getFPExtendOrRound(Final, DL, XTy); + } + case MVT::v4f32: + case MVT::v2f64: + case MVT::v8f32: + case MVT::v4f64: + case MVT::v16f32: + case MVT::v8f64: + if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); + } + break; + case MVT::v8f16: + case MVT::v16f16: + if (Subtarget.hasFP16()) { + if (Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); + } + break; + } + X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32)); + Exp = DAG.getSExtOrTrunc(Exp, DL, + X.getSimpleValueType().changeTypeToInteger()); + break; + case MVT::v32f16: + if (Subtarget.hasFP16()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); + } + return splitVectorOp(Op, DAG, DL); + } + SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512); + SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512); + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, WideExp.getSimpleValueType(), Exp); + SDValue Scalef = + DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp); + SDValue Final = + DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0); + return DAG.getFPExtendOrRound(Final, DL, XTy); +} + static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -29568,9 +29775,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, } if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) { SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, 
ExVT)); - SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B); SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B); - SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo); + SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, DAG.getBitcast(ExVT, A), + DAG.getBitcast(ExVT, B)); SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi); RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask); RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi, @@ -29586,26 +29793,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SDValue Undef = DAG.getUNDEF(VT); SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef)); SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef)); - - SDValue BLo, BHi; - if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { - // If the RHS is a constant, manually unpackl/unpackh. - SmallVector<SDValue, 16> LoOps, HiOps; - for (unsigned i = 0; i != NumElts; i += 16) { - for (unsigned j = 0; j != 8; ++j) { - LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, - MVT::i16)); - HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, - MVT::i16)); - } - } - - BLo = DAG.getBuildVector(ExVT, dl, LoOps); - BHi = DAG.getBuildVector(ExVT, dl, HiOps); - } else { - BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); - BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); - } + SDValue BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); + SDValue BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); // Multiply, mask the lower 8bits of the lo/hi results and pack. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); @@ -30908,6 +31097,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. + // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. + MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. 
We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. + SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { @@ -31127,19 +31373,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt); unsigned NumElts = VT.getVectorNumElements(); - if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { - - if (IsCstSplat) { - if (IsFSHR) - std::swap(Op0, Op1); - uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); - SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); - return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, - {Op0, Op1, Imm}, DAG, Subtarget); - } + // For non-VLX VBMI2 targets, widen 128/256-bit to 512-bit so + // the rest of the lowering/isel can select the VBMI2 forms. + // Only Custom types (v8i16, v4i32, v2i64, v16i16, v8i32, v4i64) can + // reach LowerFunnelShift with VBMI2 but no VLX, so no type check needed. + if (Subtarget.hasVBMI2() && !Subtarget.hasVLX() && EltSizeInBits > 8) { return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT, {Op0, Op1, Amt}, DAG, Subtarget); } + assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 || VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) && @@ -33004,60 +33246,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? 
(Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33662,7 +33850,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); @@ -33672,7 +33859,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG); - // clang-format on + case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG); + // clang-format on } } @@ -33756,6 +33944,59 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::CTLZ: + case ISD::CTTZ: + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF: { + // Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512 + // vXi64 CTLZ/CTTZ and VECTOR_COMPRESS. + // Compute the CTLZ/CTTZ of each element, add the element's bit offset, + // compress the result to remove all zero elements (passthru is set to + // scalar bitwidth if all elements are zero) and extract the lowest + // compressed element. + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + assert(Subtarget.hasCDI() && "AVX512CD required"); + assert((VT == MVT::i256 || VT == MVT::i512) && "Unexpected VT!"); + if (VT == MVT::i256 && !X86::mayFoldLoad(N0, Subtarget)) + return; + + unsigned SizeInBits = VT.getSizeInBits(); + MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64); + MVT BoolVT = VecVT.changeVectorElementType(MVT::i1); + SDValue Vec = DAG.getBitcast(VecVT, N0); + + SmallVector<int, 8> RevMask; + SmallVector<SDValue, 8> Offsets; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) { + RevMask.push_back((int)((E - 1) - I)); + Offsets.push_back(DAG.getConstant(I * 64, dl, MVT::i64)); + } + + // CTLZ - reverse the elements as we want the top non-zero element at the + // bottom for compression. 
+ unsigned VecOpc = ISD::CTTZ; + if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) { + VecOpc = ISD::CTLZ; + Vec = DAG.getVectorShuffle(VecVT, dl, Vec, Vec, RevMask); + } + + SDValue PassThrough = DAG.getUNDEF(VecVT); + if (Opc == ISD::CTLZ || Opc == ISD::CTTZ) + PassThrough = DAG.getConstant(SizeInBits, dl, VecVT); + + SDValue IsNonZero = DAG.getSetCC(dl, BoolVT, Vec, + DAG.getConstant(0, dl, VecVT), ISD::SETNE); + SDValue Cnt = DAG.getNode(VecOpc, dl, VecVT, Vec); + Cnt = DAG.getNode(ISD::ADD, dl, VecVT, Cnt, + DAG.getBuildVector(VecVT, dl, Offsets)); + Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, dl, VecVT, Cnt, IsNonZero, + PassThrough); + Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cnt, + DAG.getVectorIdxConstant(0, dl)); + Results.push_back(DAG.getZExtOrTrunc(Cnt, dl, VT)); + return; + } case ISD::MUL: { EVT VT = N->getValueType(0); assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && @@ -34931,6 +35172,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BLENDV) NODE_NAME_CASE(HADD) NODE_NAME_CASE(HSUB) + NODE_NAME_CASE(HADDS) + NODE_NAME_CASE(HSUBS) NODE_NAME_CASE(FHADD) NODE_NAME_CASE(FHSUB) NODE_NAME_CASE(CONFLICT) @@ -38168,22 +38411,22 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, default: llvm_unreachable("Unexpected instruction!"); case X86::PTCVTROWD2PSrri: - Opc = X86::TCVTROWD2PSrri; + Opc = X86::TCVTROWD2PSrti; break; case X86::PTCVTROWPS2BF16Hrri: - Opc = X86::TCVTROWPS2BF16Hrri; + Opc = X86::TCVTROWPS2BF16Hrti; break; case X86::PTCVTROWPS2PHHrri: - Opc = X86::TCVTROWPS2PHHrri; + Opc = X86::TCVTROWPS2PHHrti; break; case X86::PTCVTROWPS2BF16Lrri: - Opc = X86::TCVTROWPS2BF16Lrri; + Opc = X86::TCVTROWPS2BF16Lrti; break; case X86::PTCVTROWPS2PHLrri: - Opc = X86::TCVTROWPS2PHLrri; + Opc = X86::TCVTROWPS2PHLrti; break; case X86::PTILEMOVROWrri: - Opc = X86::TILEMOVROWrri; + Opc = X86::TILEMOVROWrti; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); @@ -38206,22 +38449,22 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, default: llvm_unreachable("Unexpected instruction!"); case X86::PTCVTROWD2PSrre: - Opc = X86::TCVTROWD2PSrre; + Opc = X86::TCVTROWD2PSrte; break; case X86::PTCVTROWPS2BF16Hrre: - Opc = X86::TCVTROWPS2BF16Hrre; + Opc = X86::TCVTROWPS2BF16Hrte; break; case X86::PTCVTROWPS2BF16Lrre: - Opc = X86::TCVTROWPS2BF16Lrre; + Opc = X86::TCVTROWPS2BF16Lrte; break; case X86::PTCVTROWPS2PHHrre: - Opc = X86::TCVTROWPS2PHHrre; + Opc = X86::TCVTROWPS2PHHrte; break; case X86::PTCVTROWPS2PHLrre: - Opc = X86::TCVTROWPS2PHLrre; + Opc = X86::TCVTROWPS2PHLrte; break; case X86::PTILEMOVROWrre: - Opc = X86::TILEMOVROWrre; + Opc = X86::TILEMOVROWrte; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); @@ -40707,8 +40950,9 @@ static SDValue canonicalizeShuffleMaskWithHorizOp( })) return SDValue(); - bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || - Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB); + bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::FHSUB || + Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::HSUB || + Opcode0 == X86ISD::HADDS || Opcode0 == X86ISD::HSUBS); bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS); if (!isHoriz && !isPack) return SDValue(); @@ -45014,11 +45258,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( case X86ISD::INSERTPS: case X86ISD::BLENDI: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case 
X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: { SmallVector<int, 8> Mask; @@ -45044,6 +45293,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( } break; } + case X86ISD::VBROADCAST: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + if (SrcVT.isVector()) { + APInt DemandedSrc = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrc, PoisonOnly, + Depth + 1); + } + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, PoisonOnly, Depth + 1); + } } return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, Depth); @@ -45088,13 +45347,19 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( // SSE target shuffles. case X86ISD::INSERTPS: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: + case X86ISD::VBROADCAST: return false; // SSE comparisons handle all icmp/fcmp cases. // TODO: Add CMPM/MM with test coverage. @@ -53307,18 +53572,48 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, if (Mst->isCompressingStore()) return SDValue(); - EVT VT = Mst->getValue().getValueType(); + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget)) + return ScalarStore; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(N); - if (Mst->isTruncatingStore()) - return SDValue(); + SDValue Mask = Mst->getMask(); + SDValue Value = Mst->getValue(); + EVT MemVT = Mst->getMemoryVT(); + EVT VT = Value.getValueType(); - if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget)) - return ScalarStore; + // See if the truncating store can be a saturating truncated store. + if (Mst->isTruncatingStore()) { + if (VT.isVector() && MemVT.isVector() && VT.getScalarType().isInteger() && + MemVT.getScalarType().isInteger() && + VT.getVectorNumElements() == MemVT.getVectorNumElements() && + Subtarget.hasBWI() && Subtarget.hasVLX()) { + + SDValue SatSrc; + unsigned Opc; + if (SDValue SVal = detectSSatPattern(Value, MemVT)) { + SatSrc = SVal; + Opc = X86ISD::VMTRUNCSTORES; + } else if (SDValue UVal = detectUSatPattern(Value, MemVT, DAG, DL)) { + SatSrc = UVal; + Opc = X86ISD::VMTRUNCSTOREUS; + } else { + return SDValue(); + } + + SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Ops[] = {Mst->getChain(), SatSrc, Mst->getBasePtr(), Mask}; + MachineMemOperand *MMO = Mst->getMemOperand(); + return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO); + } + + // Otherwise don't combine if this store already truncates. + return SDValue(); + } // If the mask value has been legalized to a non-boolean vector, try to // simplify ops leading up to it. We only demand the MSB of each lane. 
- SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) { @@ -53334,54 +53629,57 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, Mst->getAddressingMode()); } - SDValue Value = Mst->getValue(); if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && - TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), - Mst->getMemoryVT())) { - return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mst->getOffset(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), - Mst->getAddressingMode(), true); + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), MemVT)) { + return DAG.getMaskedStore(Mst->getChain(), DL, Value.getOperand(0), + Mst->getBasePtr(), Mst->getOffset(), Mask, MemVT, + Mst->getMemOperand(), Mst->getAddressingMode(), + true); } return SDValue(); } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) return SDValue(); // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; - if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue SrcVal, InsertBit, ShAmt; + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(peekThroughBitcasts(SrcVal)); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) return SDValue(); // Ensure the shift amount is in bounds. 
@@ -53389,6 +53687,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. EVT AmtVT = ShAmt.getValueType(); @@ -53396,6 +53701,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. @@ -53407,18 +53713,41 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDNodeFlags::NoUnsignedWrap); // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. - SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); - return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), - Align(), St->getMemOperand()->getFlags()); + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } + + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, + MachinePointerInfo(St->getPointerInfo().getAddrSpace()), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. 
+ if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + for (SDNode *User : StoredVal->users()) + DCI.AddToWorklist(User); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -53647,7 +53976,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } - if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + if (SDValue R = narrowBitOpRMW(St, dl, DAG, DCI, Subtarget)) return R; // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) @@ -53984,7 +54313,9 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); unsigned Opcode = N->getOpcode(); - bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD); + bool IsAdd = + (Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT); + bool IsSat = (Opcode == ISD::SADDSAT) || (Opcode == ISD::SSUBSAT); SmallVector<int, 8> PostShuffleMask; auto MergableHorizOp = [N](unsigned HorizOpcode) { @@ -54014,11 +54345,17 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, break; case ISD::ADD: case ISD::SUB: - if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v16i16 || VT == MVT::v8i32)) { + case ISD::SADDSAT: + case ISD::SSUBSAT: + if (!Subtarget.hasSSSE3()) + break; + if (VT == MVT::v8i16 || VT == MVT::v16i16 || + (!IsSat && (VT == MVT::v4i32 || VT == MVT::v8i32))) { + SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB; + auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS) + : (IsAdd ? X86ISD::HADD : X86ISD::HSUB); if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd, PostShuffleMask, MergableHorizOp(HorizOpcode))) { auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL, @@ -54095,11 +54432,6 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, // FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A) static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - auto AllowContract = [&DAG](const SDNodeFlags &Flags) { - return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || - Flags.hasAllowContract(); - }; - auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) { return DAG.getTarget().Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros(); @@ -54112,7 +54444,7 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, }; if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() || - !AllowContract(N->getFlags())) + !N->getFlags().hasAllowContract()) return SDValue(); EVT VT = N->getValueType(0); @@ -54123,14 +54455,13 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); bool IsConj; SDValue FAddOp1, MulOp0, MulOp1; - auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract, - &IsVectorAllNegativeZero, + auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &IsVectorAllNegativeZero, &HasNoSignedZero](SDValue N) -> bool { if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST) return false; SDValue Op0 = N.getOperand(0); unsigned Opcode = Op0.getOpcode(); - if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) { + if (Op0.hasOneUse() && Op0->getFlags().hasAllowContract()) { if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) { MulOp0 = Op0.getOperand(0); MulOp1 = Op0.getOperand(1); @@ -54592,11 +54923,14 @@ 
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); // Check the shift amount is byte aligned. // Check the truncation doesn't use any shifted in (zero) top bits. - // Check the shift amount doesn't depend on the original load. + // Check the shift amount doesn't depend on the original load chain. if (KnownAmt.countMinTrailingZeros() >= 3 && KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - VT.getSizeInBits()) && - !Ld->isPredecessorOf(ShAmt.getNode())) { + none_of(Ld->uses(), [&ShAmt](SDUse &Use) { + return Use.getResNo() == 1 && + Use.getUser()->isPredecessorOf(ShAmt.getNode()); + })) { EVT PtrVT = Ld->getBasePtr().getValueType(); SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); SDValue PtrByteOfs = @@ -54605,7 +54939,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()), Align(), Ld->getMemOperand()->getFlags()); DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; @@ -57377,6 +57712,40 @@ static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Combiner: turn uniform-constant splat funnel shifts into VSHLD/VSHRD +static SDValue combineFunnelShift(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + SDLoc DL(N); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Amt = N->getOperand(2); + EVT VT = Op0.getValueType(); + + if (!VT.isVector()) + return SDValue(); + + // Only combine if the operation is legal for this type. + // This ensures we don't try to convert types that need to be + // widened/promoted. + if (!DAG.getTargetLoweringInfo().isOperationLegal(N->getOpcode(), VT)) + return SDValue(); + + unsigned EltSize = VT.getScalarSizeInBits(); + APInt ShiftVal; + if (!X86::isConstantSplat(Amt, ShiftVal)) + return SDValue(); + + uint64_t ModAmt = ShiftVal.urem(EltSize); + SDValue Imm = DAG.getTargetConstant(ModAmt, DL, MVT::i8); + bool IsFSHR = N->getOpcode() == ISD::FSHR; + + if (IsFSHR) + std::swap(Op0, Op1); + unsigned Opcode = IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD; + return DAG.getNode(Opcode, DL, VT, {Op0, Op1, Imm}); +} + static bool needCarryOrOverflowFlag(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); @@ -59063,7 +59432,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, case X86ISD::ANDNP: // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2. if (!IsSplat && (VT.is256BitVector() || - (VT.is512BitVector() && Subtarget.useAVX512Regs()))) { + (VT.is512BitVector() && Subtarget.useAVX512Regs()) || + (EltSizeInBits == 1 && TLI.isTypeLegal(VT)))) { // Don't concatenate root AVX1 NOT patterns. // TODO: Allow NOT folding if Concat0 succeeds. if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() && @@ -59073,7 +59443,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, break; SDValue Concat0 = CombineSubOperand(VT, Ops, 0); SDValue Concat1 = CombineSubOperand(VT, Ops, 1); - if (Concat0 || Concat1 || Subtarget.useAVX512Regs()) + if (Concat0 || Concat1 || + (EltSizeInBits != 1 && Subtarget.useAVX512Regs())) return DAG.getNode(Opcode, DL, VT, Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0), Concat1 ? 
Concat1 : ConcatSubOperand(VT, Ops, 1)); @@ -59133,6 +59504,31 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } break; + case ISD::SETCC: + if (!IsSplat && EltSizeInBits == 1 && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(0).getValueType() == + Op.getOperand(0).getValueType() && + Op0.getOperand(2) == Op.getOperand(2); + })) { + EVT SrcVT = Op0.getOperand(0).getValueType(); + EVT NewSrcVT = EVT::getVectorVT(Ctx, SrcVT.getScalarType(), + NumOps * SrcVT.getVectorNumElements()); + unsigned SrcSizeInBits = SrcVT.getScalarSizeInBits(); + if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(NewSrcVT) && + (NewSrcVT.is256BitVector() || + (NewSrcVT.is512BitVector() && Subtarget.useAVX512Regs() && + (SrcSizeInBits >= 32 || Subtarget.useBWIRegs())))) { + SDValue LHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 0); + SDValue RHS = CombineSubOperand(NewSrcVT.getSimpleVT(), Ops, 1); + if (LHS || RHS) + return DAG.getNode(Opcode, DL, VT, + LHS ? LHS : ConcatSubOperand(NewSrcVT, Ops, 0), + RHS ? RHS : ConcatSubOperand(NewSrcVT, Ops, 1), + Op0.getOperand(2)); + } + } + break; case ISD::CTPOP: case ISD::CTTZ: case ISD::CTLZ: @@ -59196,6 +59592,36 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ConcatSubOperand(VT, Ops, 1)); } break; + case ISD::FSQRT: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FROUNDEVEN: + case ISD::FFLOOR: + if (!IsSplat && (VT.is256BitVector() || + (VT.is512BitVector() && Subtarget.useAVX512Regs()))) { + return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0)); + } + break; + case X86ISD::FRCP: + case X86ISD::FRSQRT: + if (!IsSplat && VT.is256BitVector()) { + return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0)); + } + break; + case X86ISD::VRNDSCALE: + if (!IsSplat && + (VT.is256BitVector() || + (VT.is512BitVector() && Subtarget.useAVX512Regs())) && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(1) == Op.getOperand(1); + })) { + return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0), + Op0.getOperand(1)); + } + break; case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: @@ -59327,8 +59753,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT, *FirstLd->getMemOperand(), &Fast) && Fast) { - if (SDValue Ld = - EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) + if (SDValue Ld = EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, + false, Depth + 1)) return Ld; } } @@ -59467,6 +59893,17 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, } } + // Attempt to merge comparison/logic ops if the type is legal. + if (TLI.isTypeLegal(VT) && + (all_of(Ops, [](SDValue Op) { return Op.getOpcode() == ISD::SETCC; }) || + all_of(Ops, [](SDValue Op) { + return ISD::isBitwiseLogicOp(Op.getOpcode()); + }))) { + if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, + DAG, Subtarget)) + return R; + } + // Don't do anything else for i1 vectors. 
return SDValue(); } @@ -60807,6 +61244,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget); case X86ISD::ADD: case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget); + case ISD::SADDSAT: + case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget); case X86ISD::CLOAD: case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG); case X86ISD::SBB: return combineSBB(N, DAG); @@ -60930,6 +61369,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VZEXT_MOVL: + case X86ISD::COMPRESS: + case X86ISD::EXPAND: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMSUB: @@ -60977,6 +61418,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI); case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget); + case ISD::FSHL: + case ISD::FSHR: return combineFunnelShift(N, DAG, DCI, Subtarget); // clang-format on } @@ -61531,8 +61974,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (auto *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { - Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), - Op.getValueType()); + Result = DAG.getSignedTargetConstant(C->getSExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -61570,7 +62013,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. - Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); + Result = + DAG.getSignedTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain @@ -61619,9 +62063,11 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, BooleanContent BCont = getBooleanContents(MVT::i64); ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND; - int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue() - : CST->getSExtValue(); - Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64); + SDLoc DL(Op); + Result = + ExtOpc == ISD::ZERO_EXTEND + ? DAG.getTargetConstant(CST->getZExtValue(), DL, MVT::i64) + : DAG.getSignedTargetConstant(CST->getSExtValue(), DL, MVT::i64); break; } |
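
The new v64i8 variable-shift lowering above rests on the fact that a 16-bit shift can emulate two independent 8-bit shifts as long as cross-byte spill is masked away before or after the shift. A minimal standalone sketch of that even/odd-lane trick, using a plain uint16_t as a stand-in for one 16-bit vector lane holding two packed byte lanes (the helper name and values are illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>

// Emulate two independent 8-bit left shifts with one 16-bit lane.
// 'packed' holds the even byte lane in bits 0..7 and the odd byte lane in
// bits 8..15; amtLo/amtHi are the per-lane shift amounts (0..7).
static uint16_t shlTwoByteLanes(uint16_t packed, unsigned amtLo, unsigned amtHi) {
  // Even lane: shift the whole 16-bit value; bits that spill into the high
  // byte are discarded when the lanes are re-selected below.
  uint16_t lo = static_cast<uint16_t>(packed << amtLo);
  // Odd lane: mask first so low-byte bits cannot spill upward, then shift.
  uint16_t hi = static_cast<uint16_t>((packed & 0xFF00u) << amtHi);
  // Merge: even lane from 'lo', odd lane from 'hi' - the scalar equivalent
  // of the masked move / lane select used in the lowering.
  return static_cast<uint16_t>((hi & 0xFF00u) | (lo & 0x00FFu));
}

int main() {
  uint16_t packed = 0x84C3; // odd lane 0x84, even lane 0xC3
  uint16_t out = shlTwoByteLanes(packed, 1, 3);
  std::printf("0x%04X\n", out); // 0xC3<<1 -> 0x86, 0x84<<3 -> 0x20 => 0x2086
  return 0;
}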
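
The ReplaceNodeResults changes handle i256/i512 CTLZ/CTTZ by taking per-64-bit-element counts, adding each element's bit offset, compressing away the elements that were zero, and reading the first surviving value (with the scalar bit width as the pass-through when everything is zero). A scalar sketch of that idea for CTTZ of an i256, assuming the GCC/Clang __builtin_ctzll builtin; the function name is illustrative:

#include <cstdint>
#include <cstdio>

static unsigned cttz256(const uint64_t elts[4]) {
  // "Compress" keeps the count of the first non-zero element; each count is
  // biased by that element's bit offset within the wide integer.
  for (unsigned i = 0; i != 4; ++i)
    if (elts[i] != 0)
      return i * 64 + __builtin_ctzll(elts[i]);
  return 256; // pass-through value: every element was zero
}

int main() {
  uint64_t v[4] = {0, 0x8, 0, 0};    // bit 67 is the lowest set bit
  std::printf("%u\n", cttz256(v));   // prints 67
  return 0;
}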
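
The narrowBitOpRMW change extends the BTC/BTR/BTS narrowing to single-bit insertions; the underlying observation is that updating one bit of a wider-than-legal integer in memory only needs a read-modify-write of the aligned 32-bit block containing that bit. A scalar sketch of that observation (plain C++, not LLVM code; function names are illustrative):

#include <cstdint>
#include <cstdio>

// Set bit 'bitIdx' of a wide integer stored as 32-bit words, touching only
// the one i32 block that actually changes (the BTS case).
static void setBitNarrow(uint32_t *words, unsigned bitIdx) {
  unsigned wordIdx = bitIdx / 32;   // byte offset = (bitIdx & ~31u) / 8
  unsigned modAmt = bitIdx & 31u;   // shift amount within the i32 block
  words[wordIdx] |= (1u << modAmt); // one i32 RMW instead of a wide RMW
}

// Insert a single bit value (0 or 1): clear then OR, mirroring the new
// "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)" pattern on one i32 block.
static void insertBitNarrow(uint32_t *words, unsigned bitIdx, uint32_t bit) {
  unsigned wordIdx = bitIdx / 32;
  unsigned modAmt = bitIdx & 31u;
  words[wordIdx] = (words[wordIdx] & ~(1u << modAmt)) | ((bit & 1u) << modAmt);
}

int main() {
  uint32_t wide[8] = {};           // stands in for an i256 in memory
  setBitNarrow(wide, 100);         // only wide[3] changes
  insertBitNarrow(wide, 100, 0);   // clears the same bit again
  std::printf("%08X\n", wide[3]);  // prints 00000000
  return 0;
}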
