Diffstat (limited to 'llvm/lib')
27 files changed, 1403 insertions, 415 deletions
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 0e5bc48..df75999 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -947,9 +947,8 @@ LazyValueInfoImpl::solveBlockValueSelect(SelectInst *SI, BasicBlock *BB) { /*UseBlockValue*/ false)); } - ValueLatticeElement Result = TrueVal; - Result.mergeIn(FalseVal); - return Result; + TrueVal.mergeIn(FalseVal); + return TrueVal; } std::optional<ConstantRange> @@ -1778,9 +1777,8 @@ ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB, assert(OptResult && "Value not available after solving"); } - ValueLatticeElement Result = *OptResult; - LLVM_DEBUG(dbgs() << " Result = " << Result << "\n"); - return Result; + LLVM_DEBUG(dbgs() << " Result = " << *OptResult << "\n"); + return *OptResult; } ValueLatticeElement LazyValueInfoImpl::getValueAt(Value *V, Instruction *CxtI) { diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index e06b095..425420f 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15740,19 +15740,26 @@ void ScalarEvolution::LoopGuards::collectFromBlock( GetNextSCEVDividesByDivisor(One, DividesBy); To = SE.getUMaxExpr(FromRewritten, OneAlignedUp); } else { + // LHS != RHS can be rewritten as (LHS - RHS) = UMax(1, LHS - RHS), + // but creating the subtraction eagerly is expensive. Track the + // inequalities in a separate map, and materialize the rewrite lazily + // when encountering a suitable subtraction while rewriting. if (LHS->getType()->isPointerTy()) { LHS = SE.getLosslessPtrToIntExpr(LHS); RHS = SE.getLosslessPtrToIntExpr(RHS); if (isa<SCEVCouldNotCompute>(LHS) || isa<SCEVCouldNotCompute>(RHS)) break; } - auto AddSubRewrite = [&](const SCEV *A, const SCEV *B) { - const SCEV *Sub = SE.getMinusSCEV(A, B); - AddRewrite(Sub, Sub, - SE.getUMaxExpr(Sub, SE.getOne(From->getType()))); - }; - AddSubRewrite(LHS, RHS); - AddSubRewrite(RHS, LHS); + const SCEVConstant *C; + const SCEV *A, *B; + if (match(RHS, m_scev_Add(m_SCEVConstant(C), m_SCEV(A))) && + match(LHS, m_scev_Add(m_scev_Specific(C), m_SCEV(B)))) { + RHS = A; + LHS = B; + } + if (LHS > RHS) + std::swap(LHS, RHS); + Guards.NotEqual.insert({LHS, RHS}); continue; } break; @@ -15886,13 +15893,15 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { class SCEVLoopGuardRewriter : public SCEVRewriteVisitor<SCEVLoopGuardRewriter> { const DenseMap<const SCEV *, const SCEV *> &Map; + const SmallDenseSet<std::pair<const SCEV *, const SCEV *>> &NotEqual; SCEV::NoWrapFlags FlagMask = SCEV::FlagAnyWrap; public: SCEVLoopGuardRewriter(ScalarEvolution &SE, const ScalarEvolution::LoopGuards &Guards) - : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap) { + : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap), + NotEqual(Guards.NotEqual) { if (Guards.PreserveNUW) FlagMask = ScalarEvolution::setFlags(FlagMask, SCEV::FlagNUW); if (Guards.PreserveNSW) @@ -15947,14 +15956,36 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { } const SCEV *visitAddExpr(const SCEVAddExpr *Expr) { + // Helper to check if S is a subtraction (A - B) where A != B, and if so, + // return UMax(S, 1). 
+ auto RewriteSubtraction = [&](const SCEV *S) -> const SCEV * { + const SCEV *LHS, *RHS; + if (MatchBinarySub(S, LHS, RHS)) { + if (LHS > RHS) + std::swap(LHS, RHS); + if (NotEqual.contains({LHS, RHS})) + return SE.getUMaxExpr(S, SE.getOne(S->getType())); + } + return nullptr; + }; + + // Check if Expr itself is a subtraction pattern with guard info. + if (const SCEV *Rewritten = RewriteSubtraction(Expr)) + return Rewritten; + // Trip count expressions sometimes consist of adding 3 operands, i.e. // (Const + A + B). There may be guard info for A + B, and if so, apply // it. // TODO: Could more generally apply guards to Add sub-expressions. if (isa<SCEVConstant>(Expr->getOperand(0)) && Expr->getNumOperands() == 3) { - if (const SCEV *S = Map.lookup( - SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2)))) + const SCEV *Add = + SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2)); + if (const SCEV *Rewritten = RewriteSubtraction(Add)) + return SE.getAddExpr( + Expr->getOperand(0), Rewritten, + ScalarEvolution::maskFlags(Expr->getNoWrapFlags(), FlagMask)); + if (const SCEV *S = Map.lookup(Add)) return SE.getAddExpr(Expr->getOperand(0), S); } SmallVector<const SCEV *, 2> Operands; @@ -15989,7 +16020,7 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { } }; - if (RewriteMap.empty()) + if (RewriteMap.empty() && NotEqual.empty()) return Expr; SCEVLoopGuardRewriter Rewriter(SE, *this); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c97300d..6bf9008 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -26876,6 +26876,8 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, // TODO: handle more extension/truncation cases as cases arise. if (EltSizeInBits != ExtSrcSizeInBits) return SDValue(); + if (VT.getSizeInBits() != N00.getValueSizeInBits()) + return SDValue(); // We can remove *extend_vector_inreg only if the truncation happens at // the same scale as the extension. 
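[Editor's note] The ScalarEvolution hunks above replace the eager AddSubRewrite lambda, which built both LHS - RHS and RHS - LHS up front, with a deferred scheme: the guard LHS != RHS is stored as a canonically ordered pair in Guards.NotEqual, and the UMax(Sub, 1) strengthening is materialized only when visitAddExpr meets a matching subtraction. The standalone C++ sketch below models that canonicalize-then-look-up flow; it does not use the LLVM API, and LoopGuardsSketch with its string-based expressions is invented purely for illustration.

    // Minimal model of the deferred "LHS != RHS" rewrite from the patch above.
    #include <cstdio>
    #include <set>
    #include <string>
    #include <utility>

    using Expr = std::string; // stand-in for interned `const SCEV *`

    struct LoopGuardsSketch {
      // Pairs are stored with the two sides in a fixed order, mirroring the
      // `if (LHS > RHS) std::swap(LHS, RHS)` canonicalization in the patch.
      std::set<std::pair<Expr, Expr>> NotEqual;

      void recordNotEqual(Expr L, Expr R) {
        if (L > R)
          std::swap(L, R);
        NotEqual.insert({std::move(L), std::move(R)});
      }

      // Applied lazily while rewriting: if S = A - B and A != B is known,
      // then S is nonzero, so it can be strengthened to umax(S, 1).
      Expr rewriteSub(const Expr &A, const Expr &B) {
        Expr L = A, R = B;
        if (L > R)
          std::swap(L, R);
        Expr Sub = "(" + A + " - " + B + ")";
        return NotEqual.count({L, R}) ? "umax(" + Sub + ", 1)" : Sub;
      }
    };

    int main() {
      LoopGuardsSketch G;
      G.recordNotEqual("%a", "%b");                // guard: %a != %b
      std::puts(G.rewriteSub("%a", "%b").c_str()); // umax((%a - %b), 1)
      std::puts(G.rewriteSub("%b", "%a").c_str()); // umax((%b - %a), 1)
      std::puts(G.rewriteSub("%a", "%c").c_str()); // (%a - %c): no guard info
    }

Because the pair is canonicalized once at insertion and once at lookup, a single NotEqual entry covers both subtraction directions, which is what lets the patch drop the second eager AddSubRewrite(RHS, LHS) call.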
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 8623c06..4787604 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -130,44 +130,46 @@ struct fltSemantics { bool hasSignBitInMSB = true; }; -static constexpr fltSemantics semIEEEhalf = {15, -14, 11, 16}; -static constexpr fltSemantics semBFloat = {127, -126, 8, 16}; -static constexpr fltSemantics semIEEEsingle = {127, -126, 24, 32}; -static constexpr fltSemantics semIEEEdouble = {1023, -1022, 53, 64}; -static constexpr fltSemantics semIEEEquad = {16383, -16382, 113, 128}; -static constexpr fltSemantics semFloat8E5M2 = {15, -14, 3, 8}; -static constexpr fltSemantics semFloat8E5M2FNUZ = { +constexpr fltSemantics APFloatBase::semIEEEhalf = {15, -14, 11, 16}; +constexpr fltSemantics APFloatBase::semBFloat = {127, -126, 8, 16}; +constexpr fltSemantics APFloatBase::semIEEEsingle = {127, -126, 24, 32}; +constexpr fltSemantics APFloatBase::semIEEEdouble = {1023, -1022, 53, 64}; +constexpr fltSemantics APFloatBase::semIEEEquad = {16383, -16382, 113, 128}; +constexpr fltSemantics APFloatBase::semFloat8E5M2 = {15, -14, 3, 8}; +constexpr fltSemantics APFloatBase::semFloat8E5M2FNUZ = { 15, -15, 3, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; -static constexpr fltSemantics semFloat8E4M3 = {7, -6, 4, 8}; -static constexpr fltSemantics semFloat8E4M3FN = { +constexpr fltSemantics APFloatBase::semFloat8E4M3 = {7, -6, 4, 8}; +constexpr fltSemantics APFloatBase::semFloat8E4M3FN = { 8, -6, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::AllOnes}; -static constexpr fltSemantics semFloat8E4M3FNUZ = { +constexpr fltSemantics APFloatBase::semFloat8E4M3FNUZ = { 7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; -static constexpr fltSemantics semFloat8E4M3B11FNUZ = { +constexpr fltSemantics APFloatBase::semFloat8E4M3B11FNUZ = { 4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; -static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8}; -static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19}; -static constexpr fltSemantics semFloat8E8M0FNU = {127, - -127, - 1, - 8, - fltNonfiniteBehavior::NanOnly, - fltNanEncoding::AllOnes, - false, - false, - false}; - -static constexpr fltSemantics semFloat6E3M2FN = { +constexpr fltSemantics APFloatBase::semFloat8E3M4 = {3, -2, 5, 8}; +constexpr fltSemantics APFloatBase::semFloatTF32 = {127, -126, 11, 19}; +constexpr fltSemantics APFloatBase::semFloat8E8M0FNU = { + 127, + -127, + 1, + 8, + fltNonfiniteBehavior::NanOnly, + fltNanEncoding::AllOnes, + false, + false, + false}; + +constexpr fltSemantics APFloatBase::semFloat6E3M2FN = { 4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly}; -static constexpr fltSemantics semFloat6E2M3FN = { +constexpr fltSemantics APFloatBase::semFloat6E2M3FN = { 2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly}; -static constexpr fltSemantics semFloat4E2M1FN = { +constexpr fltSemantics APFloatBase::semFloat4E2M1FN = { 2, 0, 2, 4, fltNonfiniteBehavior::FiniteOnly}; -static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80}; -static constexpr fltSemantics semBogus = {0, 0, 0, 0}; -static constexpr fltSemantics semPPCDoubleDouble = {-1, 0, 0, 128}; -static constexpr fltSemantics semPPCDoubleDoubleLegacy = {1023, -1022 + 53, - 53 + 53, 128}; +constexpr fltSemantics APFloatBase::semX87DoubleExtended = {16383, -16382, 64, + 80}; +constexpr fltSemantics APFloatBase::semBogus = {0, 0, 0, 0}; +constexpr fltSemantics APFloatBase::semPPCDoubleDouble = {-1, 0, 0, 
128}; +constexpr fltSemantics APFloatBase::semPPCDoubleDoubleLegacy = { + 1023, -1022 + 53, 53 + 53, 128}; const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) { switch (S) { @@ -261,36 +263,6 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) { llvm_unreachable("Unknown floating semantics"); } -const fltSemantics &APFloatBase::IEEEhalf() { return semIEEEhalf; } -const fltSemantics &APFloatBase::BFloat() { return semBFloat; } -const fltSemantics &APFloatBase::IEEEsingle() { return semIEEEsingle; } -const fltSemantics &APFloatBase::IEEEdouble() { return semIEEEdouble; } -const fltSemantics &APFloatBase::IEEEquad() { return semIEEEquad; } -const fltSemantics &APFloatBase::PPCDoubleDouble() { - return semPPCDoubleDouble; -} -const fltSemantics &APFloatBase::PPCDoubleDoubleLegacy() { - return semPPCDoubleDoubleLegacy; -} -const fltSemantics &APFloatBase::Float8E5M2() { return semFloat8E5M2; } -const fltSemantics &APFloatBase::Float8E5M2FNUZ() { return semFloat8E5M2FNUZ; } -const fltSemantics &APFloatBase::Float8E4M3() { return semFloat8E4M3; } -const fltSemantics &APFloatBase::Float8E4M3FN() { return semFloat8E4M3FN; } -const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; } -const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() { - return semFloat8E4M3B11FNUZ; -} -const fltSemantics &APFloatBase::Float8E3M4() { return semFloat8E3M4; } -const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; } -const fltSemantics &APFloatBase::Float8E8M0FNU() { return semFloat8E8M0FNU; } -const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; } -const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; } -const fltSemantics &APFloatBase::Float4E2M1FN() { return semFloat4E2M1FN; } -const fltSemantics &APFloatBase::x87DoubleExtended() { - return semX87DoubleExtended; -} -const fltSemantics &APFloatBase::Bogus() { return semBogus; } - bool APFloatBase::isRepresentableBy(const fltSemantics &A, const fltSemantics &B) { return A.maxExponent <= B.maxExponent && A.minExponent >= B.minExponent && @@ -1029,7 +1001,7 @@ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) { // For x87 extended precision, we want to make a NaN, not a // pseudo-NaN. Maybe we should expose the ability to make // pseudo-NaNs? 
- if (semantics == &semX87DoubleExtended) + if (semantics == &APFloatBase::semX87DoubleExtended) APInt::tcSetBit(significand, QNaNBit + 1); } @@ -1054,7 +1026,7 @@ IEEEFloat &IEEEFloat::operator=(IEEEFloat &&rhs) { category = rhs.category; sign = rhs.sign; - rhs.semantics = &semBogus; + rhs.semantics = &APFloatBase::semBogus; return *this; } @@ -1247,7 +1219,7 @@ IEEEFloat::IEEEFloat(const IEEEFloat &rhs) { assign(rhs); } -IEEEFloat::IEEEFloat(IEEEFloat &&rhs) : semantics(&semBogus) { +IEEEFloat::IEEEFloat(IEEEFloat &&rhs) : semantics(&APFloatBase::semBogus) { *this = std::move(rhs); } @@ -2607,8 +2579,8 @@ APFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, shift = toSemantics.precision - fromSemantics.precision; bool X86SpecialNan = false; - if (&fromSemantics == &semX87DoubleExtended && - &toSemantics != &semX87DoubleExtended && category == fcNaN && + if (&fromSemantics == &APFloatBase::semX87DoubleExtended && + &toSemantics != &APFloatBase::semX87DoubleExtended && category == fcNaN && (!(*significandParts() & 0x8000000000000000ULL) || !(*significandParts() & 0x4000000000000000ULL))) { // x86 has some unusual NaNs which cannot be represented in any other @@ -2628,8 +2600,7 @@ APFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, int exponentChange = omsb - fromSemantics.precision; if (exponent + exponentChange < toSemantics.minExponent) exponentChange = toSemantics.minExponent - exponent; - if (exponentChange < shift) - exponentChange = shift; + exponentChange = std::max(exponentChange, shift); if (exponentChange < 0) { shift -= exponentChange; exponent += exponentChange; @@ -2694,7 +2665,7 @@ APFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, // For x87 extended precision, we want to make a NaN, not a special NaN if // the input wasn't special either. - if (!X86SpecialNan && semantics == &semX87DoubleExtended) + if (!X86SpecialNan && semantics == &APFloatBase::semX87DoubleExtended) APInt::tcSetBit(significandParts(), semantics->precision - 1); // Convert of sNaN creates qNaN and raises an exception (invalid op). @@ -3071,8 +3042,7 @@ IEEEFloat::roundSignificandWithExponent(const integerPart *decSigParts, if (decSig.exponent < semantics->minExponent) { excessPrecision += (semantics->minExponent - decSig.exponent); truncatedBits = excessPrecision; - if (excessPrecision > calcSemantics.precision) - excessPrecision = calcSemantics.precision; + excessPrecision = std::min(excessPrecision, calcSemantics.precision); } /* Extra half-ulp lost in reciprocal of exponent. */ powHUerr = (powStatus == opOK && calcLostFraction == lfExactlyZero) ? 0:2; @@ -3469,8 +3439,7 @@ char *IEEEFloat::convertNormalToHexString(char *dst, unsigned int hexDigits, /* Convert as much of "part" to hexdigits as we can. */ unsigned int curDigits = integerPartWidth / 4; - if (curDigits > outputDigits) - curDigits = outputDigits; + curDigits = std::min(curDigits, outputDigits); dst += partAsHex (dst, part, curDigits, hexDigitChars); outputDigits -= curDigits; } @@ -3530,7 +3499,8 @@ hash_code hash_value(const IEEEFloat &Arg) { // the actual IEEE respresentations. We compensate for that here. 
APInt IEEEFloat::convertF80LongDoubleAPFloatToAPInt() const { - assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended); + assert(semantics == + (const llvm::fltSemantics *)&APFloatBase::semX87DoubleExtended); assert(partCount()==2); uint64_t myexponent, mysignificand; @@ -3560,7 +3530,8 @@ APInt IEEEFloat::convertF80LongDoubleAPFloatToAPInt() const { } APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const { - assert(semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy); + assert(semantics == + (const llvm::fltSemantics *)&APFloatBase::semPPCDoubleDoubleLegacy); assert(partCount()==2); uint64_t words[2]; @@ -3574,14 +3545,14 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const { // Declare fltSemantics before APFloat that uses it (and // saves pointer to it) to ensure correct destruction order. fltSemantics extendedSemantics = *semantics; - extendedSemantics.minExponent = semIEEEdouble.minExponent; + extendedSemantics.minExponent = APFloatBase::semIEEEdouble.minExponent; IEEEFloat extended(*this); fs = extended.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo); assert(fs == opOK && !losesInfo); (void)fs; IEEEFloat u(extended); - fs = u.convert(semIEEEdouble, rmNearestTiesToEven, &losesInfo); + fs = u.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &losesInfo); assert(fs == opOK || fs == opInexact); (void)fs; words[0] = *u.convertDoubleAPFloatToAPInt().getRawData(); @@ -3597,7 +3568,7 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const { IEEEFloat v(extended); v.subtract(u, rmNearestTiesToEven); - fs = v.convert(semIEEEdouble, rmNearestTiesToEven, &losesInfo); + fs = v.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &losesInfo); assert(fs == opOK && !losesInfo); (void)fs; words[1] = *v.convertDoubleAPFloatToAPInt().getRawData(); @@ -3611,8 +3582,9 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const { template <const fltSemantics &S> APInt IEEEFloat::convertIEEEFloatToAPInt() const { assert(semantics == &S); - const int bias = - (semantics == &semFloat8E8M0FNU) ? -S.minExponent : -(S.minExponent - 1); + const int bias = (semantics == &APFloatBase::semFloat8E8M0FNU) + ? 
-S.minExponent + : -(S.minExponent - 1); constexpr unsigned int trailing_significand_bits = S.precision - 1; constexpr int integer_bit_part = trailing_significand_bits / integerPartWidth; constexpr integerPart integer_bit = @@ -3677,87 +3649,87 @@ APInt IEEEFloat::convertIEEEFloatToAPInt() const { APInt IEEEFloat::convertQuadrupleAPFloatToAPInt() const { assert(partCount() == 2); - return convertIEEEFloatToAPInt<semIEEEquad>(); + return convertIEEEFloatToAPInt<APFloatBase::semIEEEquad>(); } APInt IEEEFloat::convertDoubleAPFloatToAPInt() const { assert(partCount()==1); - return convertIEEEFloatToAPInt<semIEEEdouble>(); + return convertIEEEFloatToAPInt<APFloatBase::semIEEEdouble>(); } APInt IEEEFloat::convertFloatAPFloatToAPInt() const { assert(partCount()==1); - return convertIEEEFloatToAPInt<semIEEEsingle>(); + return convertIEEEFloatToAPInt<APFloatBase::semIEEEsingle>(); } APInt IEEEFloat::convertBFloatAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semBFloat>(); + return convertIEEEFloatToAPInt<APFloatBase::semBFloat>(); } APInt IEEEFloat::convertHalfAPFloatToAPInt() const { assert(partCount()==1); - return convertIEEEFloatToAPInt<semIEEEhalf>(); + return convertIEEEFloatToAPInt<APFloatBase::semIEEEhalf>(); } APInt IEEEFloat::convertFloat8E5M2APFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E5M2>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E5M2>(); } APInt IEEEFloat::convertFloat8E5M2FNUZAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E5M2FNUZ>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E5M2FNUZ>(); } APInt IEEEFloat::convertFloat8E4M3APFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E4M3>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3>(); } APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E4M3FN>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3FN>(); } APInt IEEEFloat::convertFloat8E4M3FNUZAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E4M3FNUZ>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3FNUZ>(); } APInt IEEEFloat::convertFloat8E4M3B11FNUZAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E4M3B11FNUZ>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3B11FNUZ>(); } APInt IEEEFloat::convertFloat8E3M4APFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E3M4>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E3M4>(); } APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloatTF32>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloatTF32>(); } APInt IEEEFloat::convertFloat8E8M0FNUAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E8M0FNU>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E8M0FNU>(); } APInt IEEEFloat::convertFloat6E3M2FNAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat6E3M2FN>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat6E3M2FN>(); } APInt IEEEFloat::convertFloat6E2M3FNAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat6E2M3FN>(); + return 
convertIEEEFloatToAPInt<APFloatBase::semFloat6E2M3FN>(); } APInt IEEEFloat::convertFloat4E2M1FNAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat4E2M1FN>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat4E2M1FN>(); } // This function creates an APInt that is just a bit map of the floating @@ -3765,74 +3737,77 @@ APInt IEEEFloat::convertFloat4E2M1FNAPFloatToAPInt() const { // and treating the result as a normal integer is unlikely to be useful. APInt IEEEFloat::bitcastToAPInt() const { - if (semantics == (const llvm::fltSemantics*)&semIEEEhalf) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEhalf) return convertHalfAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semBFloat) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semBFloat) return convertBFloatAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics*)&semIEEEsingle) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle) return convertFloatAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics*)&semIEEEdouble) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble) return convertDoubleAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics*)&semIEEEquad) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad) return convertQuadrupleAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy) + if (semantics == + (const llvm::fltSemantics *)&APFloatBase::semPPCDoubleDoubleLegacy) return convertPPCDoubleDoubleLegacyAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E5M2) return convertFloat8E5M2APFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E5M2FNUZ) return convertFloat8E5M2FNUZAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3) return convertFloat8E4M3APFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3FN) return convertFloat8E4M3FNAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3FNUZ) return convertFloat8E4M3FNUZAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3B11FNUZ) + if (semantics == + (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3B11FNUZ) return convertFloat8E4M3B11FNUZAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E3M4) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E3M4) return convertFloat8E3M4APFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloatTF32) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloatTF32) return convertFloatTF32APFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E8M0FNU) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E8M0FNU) return convertFloat8E8M0FNUAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat6E3M2FN) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat6E3M2FN) return convertFloat6E3M2FNAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics 
*)&semFloat6E2M3FN) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat6E2M3FN) return convertFloat6E2M3FNAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat4E2M1FN) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat4E2M1FN) return convertFloat4E2M1FNAPFloatToAPInt(); - assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended && + assert(semantics == + (const llvm::fltSemantics *)&APFloatBase::semX87DoubleExtended && "unknown format!"); return convertF80LongDoubleAPFloatToAPInt(); } float IEEEFloat::convertToFloat() const { - assert(semantics == (const llvm::fltSemantics*)&semIEEEsingle && + assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle && "Float semantics are not IEEEsingle"); APInt api = bitcastToAPInt(); return api.bitsToFloat(); } double IEEEFloat::convertToDouble() const { - assert(semantics == (const llvm::fltSemantics*)&semIEEEdouble && + assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble && "Float semantics are not IEEEdouble"); APInt api = bitcastToAPInt(); return api.bitsToDouble(); @@ -3840,7 +3815,7 @@ double IEEEFloat::convertToDouble() const { #ifdef HAS_IEE754_FLOAT128 float128 IEEEFloat::convertToQuad() const { - assert(semantics == (const llvm::fltSemantics *)&semIEEEquad && + assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad && "Float semantics are not IEEEquads"); APInt api = bitcastToAPInt(); return api.bitsToQuad(); @@ -3861,7 +3836,7 @@ void IEEEFloat::initFromF80LongDoubleAPInt(const APInt &api) { uint64_t mysignificand = i1; uint8_t myintegerbit = mysignificand >> 63; - initialize(&semX87DoubleExtended); + initialize(&APFloatBase::semX87DoubleExtended); assert(partCount()==2); sign = static_cast<unsigned int>(i2>>15); @@ -3893,14 +3868,16 @@ void IEEEFloat::initFromPPCDoubleDoubleLegacyAPInt(const APInt &api) { // Get the first double and convert to our format. initFromDoubleAPInt(APInt(64, i1)); - fs = convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo); + fs = convert(APFloatBase::semPPCDoubleDoubleLegacy, rmNearestTiesToEven, + &losesInfo); assert(fs == opOK && !losesInfo); (void)fs; // Unless we have a special case, add in second double. 
if (isFiniteNonZero()) { - IEEEFloat v(semIEEEdouble, APInt(64, i2)); - fs = v.convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo); + IEEEFloat v(APFloatBase::semIEEEdouble, APInt(64, i2)); + fs = v.convert(APFloatBase::semPPCDoubleDoubleLegacy, rmNearestTiesToEven, + &losesInfo); assert(fs == opOK && !losesInfo); (void)fs; @@ -3918,7 +3895,7 @@ void IEEEFloat::initFromFloat8E8M0FNUAPInt(const APInt &api) { uint64_t val = api.getRawData()[0]; uint64_t myexponent = (val & exponent_mask); - initialize(&semFloat8E8M0FNU); + initialize(&APFloatBase::semFloat8E8M0FNU); assert(partCount() == 1); // This format has unsigned representation only @@ -4025,109 +4002,109 @@ void IEEEFloat::initFromIEEEAPInt(const APInt &api) { } void IEEEFloat::initFromQuadrupleAPInt(const APInt &api) { - initFromIEEEAPInt<semIEEEquad>(api); + initFromIEEEAPInt<APFloatBase::semIEEEquad>(api); } void IEEEFloat::initFromDoubleAPInt(const APInt &api) { - initFromIEEEAPInt<semIEEEdouble>(api); + initFromIEEEAPInt<APFloatBase::semIEEEdouble>(api); } void IEEEFloat::initFromFloatAPInt(const APInt &api) { - initFromIEEEAPInt<semIEEEsingle>(api); + initFromIEEEAPInt<APFloatBase::semIEEEsingle>(api); } void IEEEFloat::initFromBFloatAPInt(const APInt &api) { - initFromIEEEAPInt<semBFloat>(api); + initFromIEEEAPInt<APFloatBase::semBFloat>(api); } void IEEEFloat::initFromHalfAPInt(const APInt &api) { - initFromIEEEAPInt<semIEEEhalf>(api); + initFromIEEEAPInt<APFloatBase::semIEEEhalf>(api); } void IEEEFloat::initFromFloat8E5M2APInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E5M2>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E5M2>(api); } void IEEEFloat::initFromFloat8E5M2FNUZAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E5M2FNUZ>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E5M2FNUZ>(api); } void IEEEFloat::initFromFloat8E4M3APInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E4M3>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E4M3>(api); } void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E4M3FN>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E4M3FN>(api); } void IEEEFloat::initFromFloat8E4M3FNUZAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E4M3FNUZ>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E4M3FNUZ>(api); } void IEEEFloat::initFromFloat8E4M3B11FNUZAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E4M3B11FNUZ>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E4M3B11FNUZ>(api); } void IEEEFloat::initFromFloat8E3M4APInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E3M4>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E3M4>(api); } void IEEEFloat::initFromFloatTF32APInt(const APInt &api) { - initFromIEEEAPInt<semFloatTF32>(api); + initFromIEEEAPInt<APFloatBase::semFloatTF32>(api); } void IEEEFloat::initFromFloat6E3M2FNAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat6E3M2FN>(api); + initFromIEEEAPInt<APFloatBase::semFloat6E3M2FN>(api); } void IEEEFloat::initFromFloat6E2M3FNAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat6E2M3FN>(api); + initFromIEEEAPInt<APFloatBase::semFloat6E2M3FN>(api); } void IEEEFloat::initFromFloat4E2M1FNAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat4E2M1FN>(api); + initFromIEEEAPInt<APFloatBase::semFloat4E2M1FN>(api); } /// Treat api as containing the bits of a floating point number. 
void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { assert(api.getBitWidth() == Sem->sizeInBits); - if (Sem == &semIEEEhalf) + if (Sem == &APFloatBase::semIEEEhalf) return initFromHalfAPInt(api); - if (Sem == &semBFloat) + if (Sem == &APFloatBase::semBFloat) return initFromBFloatAPInt(api); - if (Sem == &semIEEEsingle) + if (Sem == &APFloatBase::semIEEEsingle) return initFromFloatAPInt(api); - if (Sem == &semIEEEdouble) + if (Sem == &APFloatBase::semIEEEdouble) return initFromDoubleAPInt(api); - if (Sem == &semX87DoubleExtended) + if (Sem == &APFloatBase::semX87DoubleExtended) return initFromF80LongDoubleAPInt(api); - if (Sem == &semIEEEquad) + if (Sem == &APFloatBase::semIEEEquad) return initFromQuadrupleAPInt(api); - if (Sem == &semPPCDoubleDoubleLegacy) + if (Sem == &APFloatBase::semPPCDoubleDoubleLegacy) return initFromPPCDoubleDoubleLegacyAPInt(api); - if (Sem == &semFloat8E5M2) + if (Sem == &APFloatBase::semFloat8E5M2) return initFromFloat8E5M2APInt(api); - if (Sem == &semFloat8E5M2FNUZ) + if (Sem == &APFloatBase::semFloat8E5M2FNUZ) return initFromFloat8E5M2FNUZAPInt(api); - if (Sem == &semFloat8E4M3) + if (Sem == &APFloatBase::semFloat8E4M3) return initFromFloat8E4M3APInt(api); - if (Sem == &semFloat8E4M3FN) + if (Sem == &APFloatBase::semFloat8E4M3FN) return initFromFloat8E4M3FNAPInt(api); - if (Sem == &semFloat8E4M3FNUZ) + if (Sem == &APFloatBase::semFloat8E4M3FNUZ) return initFromFloat8E4M3FNUZAPInt(api); - if (Sem == &semFloat8E4M3B11FNUZ) + if (Sem == &APFloatBase::semFloat8E4M3B11FNUZ) return initFromFloat8E4M3B11FNUZAPInt(api); - if (Sem == &semFloat8E3M4) + if (Sem == &APFloatBase::semFloat8E3M4) return initFromFloat8E3M4APInt(api); - if (Sem == &semFloatTF32) + if (Sem == &APFloatBase::semFloatTF32) return initFromFloatTF32APInt(api); - if (Sem == &semFloat8E8M0FNU) + if (Sem == &APFloatBase::semFloat8E8M0FNU) return initFromFloat8E8M0FNUAPInt(api); - if (Sem == &semFloat6E3M2FN) + if (Sem == &APFloatBase::semFloat6E3M2FN) return initFromFloat6E3M2FNAPInt(api); - if (Sem == &semFloat6E2M3FN) + if (Sem == &APFloatBase::semFloat6E2M3FN) return initFromFloat6E2M3FNAPInt(api); - if (Sem == &semFloat4E2M1FN) + if (Sem == &APFloatBase::semFloat4E2M1FN) return initFromFloat4E2M1FNAPInt(api); llvm_unreachable("unsupported semantics"); @@ -4202,11 +4179,11 @@ IEEEFloat::IEEEFloat(const fltSemantics &Sem, const APInt &API) { } IEEEFloat::IEEEFloat(float f) { - initFromAPInt(&semIEEEsingle, APInt::floatToBits(f)); + initFromAPInt(&APFloatBase::semIEEEsingle, APInt::floatToBits(f)); } IEEEFloat::IEEEFloat(double d) { - initFromAPInt(&semIEEEdouble, APInt::doubleToBits(d)); + initFromAPInt(&APFloatBase::semIEEEdouble, APInt::doubleToBits(d)); } namespace { @@ -4815,38 +4792,40 @@ IEEEFloat frexp(const IEEEFloat &Val, int &Exp, roundingMode RM) { DoubleAPFloat::DoubleAPFloat(const fltSemantics &S) : Semantics(&S), - Floats(new APFloat[2]{APFloat(semIEEEdouble), APFloat(semIEEEdouble)}) { - assert(Semantics == &semPPCDoubleDouble); + Floats(new APFloat[2]{APFloat(APFloatBase::semIEEEdouble), + APFloat(APFloatBase::semIEEEdouble)}) { + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, uninitializedTag) - : Semantics(&S), - Floats(new APFloat[2]{APFloat(semIEEEdouble, uninitialized), - APFloat(semIEEEdouble, uninitialized)}) { - assert(Semantics == &semPPCDoubleDouble); + : Semantics(&S), Floats(new APFloat[2]{ + APFloat(APFloatBase::semIEEEdouble, uninitialized), + APFloat(APFloatBase::semIEEEdouble, 
uninitialized)}) { + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, integerPart I) - : Semantics(&S), Floats(new APFloat[2]{APFloat(semIEEEdouble, I), - APFloat(semIEEEdouble)}) { - assert(Semantics == &semPPCDoubleDouble); + : Semantics(&S), + Floats(new APFloat[2]{APFloat(APFloatBase::semIEEEdouble, I), + APFloat(APFloatBase::semIEEEdouble)}) { + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, const APInt &I) : Semantics(&S), Floats(new APFloat[2]{ - APFloat(semIEEEdouble, APInt(64, I.getRawData()[0])), - APFloat(semIEEEdouble, APInt(64, I.getRawData()[1]))}) { - assert(Semantics == &semPPCDoubleDouble); + APFloat(APFloatBase::semIEEEdouble, APInt(64, I.getRawData()[0])), + APFloat(APFloatBase::semIEEEdouble, APInt(64, I.getRawData()[1]))}) { + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, APFloat &&First, APFloat &&Second) : Semantics(&S), Floats(new APFloat[2]{std::move(First), std::move(Second)}) { - assert(Semantics == &semPPCDoubleDouble); - assert(&Floats[0].getSemantics() == &semIEEEdouble); - assert(&Floats[1].getSemantics() == &semIEEEdouble); + assert(Semantics == &APFloatBase::semPPCDoubleDouble); + assert(&Floats[0].getSemantics() == &APFloatBase::semIEEEdouble); + assert(&Floats[1].getSemantics() == &APFloatBase::semIEEEdouble); } DoubleAPFloat::DoubleAPFloat(const DoubleAPFloat &RHS) @@ -4854,14 +4833,14 @@ DoubleAPFloat::DoubleAPFloat(const DoubleAPFloat &RHS) Floats(RHS.Floats ? new APFloat[2]{APFloat(RHS.Floats[0]), APFloat(RHS.Floats[1])} : nullptr) { - assert(Semantics == &semPPCDoubleDouble); + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(DoubleAPFloat &&RHS) : Semantics(RHS.Semantics), Floats(RHS.Floats) { - RHS.Semantics = &semBogus; + RHS.Semantics = &APFloatBase::semBogus; RHS.Floats = nullptr; - assert(Semantics == &semPPCDoubleDouble); + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat &DoubleAPFloat::operator=(const DoubleAPFloat &RHS) { @@ -5009,12 +4988,12 @@ APFloat::opStatus DoubleAPFloat::addWithSpecial(const DoubleAPFloat &LHS, APFloat A(LHS.Floats[0]), AA(LHS.Floats[1]), C(RHS.Floats[0]), CC(RHS.Floats[1]); - assert(&A.getSemantics() == &semIEEEdouble); - assert(&AA.getSemantics() == &semIEEEdouble); - assert(&C.getSemantics() == &semIEEEdouble); - assert(&CC.getSemantics() == &semIEEEdouble); - assert(&Out.Floats[0].getSemantics() == &semIEEEdouble); - assert(&Out.Floats[1].getSemantics() == &semIEEEdouble); + assert(&A.getSemantics() == &APFloatBase::semIEEEdouble); + assert(&AA.getSemantics() == &APFloatBase::semIEEEdouble); + assert(&C.getSemantics() == &APFloatBase::semIEEEdouble); + assert(&CC.getSemantics() == &APFloatBase::semIEEEdouble); + assert(&Out.Floats[0].getSemantics() == &APFloatBase::semIEEEdouble); + assert(&Out.Floats[1].getSemantics() == &APFloatBase::semIEEEdouble); return Out.addImpl(A, AA, C, CC, RM); } @@ -5119,28 +5098,32 @@ APFloat::opStatus DoubleAPFloat::multiply(const DoubleAPFloat &RHS, APFloat::opStatus DoubleAPFloat::divide(const DoubleAPFloat &RHS, APFloat::roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); - auto Ret = - Tmp.divide(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()), RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + assert(Semantics == 
&APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()); + auto Ret = Tmp.divide( + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()), RM); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } APFloat::opStatus DoubleAPFloat::remainder(const DoubleAPFloat &RHS) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); - auto Ret = - Tmp.remainder(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt())); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()); + auto Ret = Tmp.remainder( + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt())); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } APFloat::opStatus DoubleAPFloat::mod(const DoubleAPFloat &RHS) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); - auto Ret = Tmp.mod(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt())); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()); + auto Ret = Tmp.mod( + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt())); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } @@ -5148,17 +5131,21 @@ APFloat::opStatus DoubleAPFloat::fusedMultiplyAdd(const DoubleAPFloat &Multiplicand, const DoubleAPFloat &Addend, APFloat::roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()); auto Ret = Tmp.fusedMultiplyAdd( - APFloat(semPPCDoubleDoubleLegacy, Multiplicand.bitcastToAPInt()), - APFloat(semPPCDoubleDoubleLegacy, Addend.bitcastToAPInt()), RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, + Multiplicand.bitcastToAPInt()), + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, Addend.bitcastToAPInt()), + RM); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } APFloat::opStatus DoubleAPFloat::roundToIntegral(APFloat::roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); const APFloat &Hi = getFirst(); const APFloat &Lo = getSecond(); @@ -5309,22 +5296,28 @@ void DoubleAPFloat::makeZero(bool Neg) { } void DoubleAPFloat::makeLargest(bool Neg) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x7fefffffffffffffull)); - Floats[1] = APFloat(semIEEEdouble, APInt(64, 0x7c8ffffffffffffeull)); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + Floats[0] = + APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x7fefffffffffffffull)); + Floats[1] = + APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x7c8ffffffffffffeull)); if 
(Neg) changeSign(); } void DoubleAPFloat::makeSmallest(bool Neg) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); Floats[0].makeSmallest(Neg); Floats[1].makeZero(/* Neg = */ false); } void DoubleAPFloat::makeSmallestNormalized(bool Neg) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x0360000000000000ull)); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + Floats[0] = + APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x0360000000000000ull)); if (Neg) Floats[0].changeSign(); Floats[1].makeZero(/* Neg = */ false); @@ -5355,7 +5348,8 @@ hash_code hash_value(const DoubleAPFloat &Arg) { } APInt DoubleAPFloat::bitcastToAPInt() const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); uint64_t Data[] = { Floats[0].bitcastToAPInt().getRawData()[0], Floats[1].bitcastToAPInt().getRawData()[0], @@ -5365,10 +5359,11 @@ APInt DoubleAPFloat::bitcastToAPInt() const { Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S, roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy); auto Ret = Tmp.convertFromString(S, RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } @@ -5379,7 +5374,8 @@ Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S, // nextUp must choose the smallest output > input that follows these rules. // nexDown must choose the largest output < input that follows these rules. APFloat::opStatus DoubleAPFloat::next(bool nextDown) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); // nextDown(x) = -nextUp(-x) if (nextDown) { changeSign(); @@ -5481,7 +5477,8 @@ APFloat::opStatus DoubleAPFloat::next(bool nextDown) { APFloat::opStatus DoubleAPFloat::convertToSignExtendedInteger( MutableArrayRef<integerPart> Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); // If Hi is not finite, or Lo is zero, the value is entirely represented // by Hi. Delegate to the simpler single-APFloat conversion. 
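[Editor's note] The makeLargest hunk earlier in this file's diff only requalifies semIEEEdouble, but the two magic constants it carries are easy to sanity-check: the high double decodes to DBL_MAX, and the low double must stay strictly below half an ulp of DBL_MAX (2^970) so that rounding Hi + Lo back to a single double yields Hi rather than infinity. A quick standalone check, not part of the patch and assuming C++20 for std::bit_cast:

    #include <bit>
    #include <cfloat>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Raw encodings taken from DoubleAPFloat::makeLargest() above.
      const double Hi = std::bit_cast<double>(UINT64_C(0x7fefffffffffffff));
      const double Lo = std::bit_cast<double>(UINT64_C(0x7c8ffffffffffffe));
      std::printf("Hi == DBL_MAX: %s\n", Hi == DBL_MAX ? "yes" : "no");
      // ulp(DBL_MAX) is 2^971; keeping Lo below 2^970 makes Hi + Lo round
      // back to Hi instead of overflowing to infinity.
      std::printf("Lo < 2^970:    %s (Lo = %.17g)\n",
                  Lo < std::ldexp(1.0, 970) ? "yes" : "no", Lo);
    }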
@@ -5761,8 +5758,9 @@ unsigned int DoubleAPFloat::convertToHexString(char *DST, unsigned int HexDigits, bool UpperCase, roundingMode RM) const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt()) + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + return APFloat(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()) .convertToHexString(DST, HexDigits, UpperCase, RM); } @@ -5799,7 +5797,8 @@ bool DoubleAPFloat::isLargest() const { } bool DoubleAPFloat::isInteger() const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); return Floats[0].isInteger() && Floats[1].isInteger(); } @@ -5807,8 +5806,9 @@ void DoubleAPFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision, unsigned FormatMaxPadding, bool TruncateZero) const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt()) + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()) .toString(Str, FormatPrecision, FormatMaxPadding, TruncateZero); } @@ -5840,14 +5840,17 @@ int ilogb(const DoubleAPFloat &Arg) { DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp, APFloat::roundingMode RM) { - assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - return DoubleAPFloat(semPPCDoubleDouble, scalbn(Arg.Floats[0], Exp, RM), + assert(Arg.Semantics == &APFloatBase::PPCDoubleDouble() && + "Unexpected Semantics"); + return DoubleAPFloat(APFloatBase::PPCDoubleDouble(), + scalbn(Arg.Floats[0], Exp, RM), scalbn(Arg.Floats[1], Exp, RM)); } DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp, APFloat::roundingMode RM) { - assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Arg.Semantics == &APFloatBase::PPCDoubleDouble() && + "Unexpected Semantics"); // Get the unbiased exponent e of the number, where |Arg| = m * 2^e for m in // [1.0, 2.0). 
@@ -5943,7 +5946,8 @@ DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp, } APFloat First = scalbn(Hi, -Exp, RM); - return DoubleAPFloat(semPPCDoubleDouble, std::move(First), std::move(Second)); + return DoubleAPFloat(APFloatBase::PPCDoubleDouble(), std::move(First), + std::move(Second)); } } // namespace detail @@ -5955,9 +5959,8 @@ APFloat::Storage::Storage(IEEEFloat F, const fltSemantics &Semantics) { } if (usesLayout<DoubleAPFloat>(Semantics)) { const fltSemantics& S = F.getSemantics(); - new (&Double) - DoubleAPFloat(Semantics, APFloat(std::move(F), S), - APFloat(semIEEEdouble)); + new (&Double) DoubleAPFloat(Semantics, APFloat(std::move(F), S), + APFloat(APFloatBase::IEEEdouble())); return; } llvm_unreachable("Unexpected semantics"); @@ -6065,8 +6068,9 @@ APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics, return U.IEEE.convert(ToSemantics, RM, losesInfo); if (usesLayout<IEEEFloat>(getSemantics()) && usesLayout<DoubleAPFloat>(ToSemantics)) { - assert(&ToSemantics == &semPPCDoubleDouble); - auto Ret = U.IEEE.convert(semPPCDoubleDoubleLegacy, RM, losesInfo); + assert(&ToSemantics == &APFloatBase::semPPCDoubleDouble); + auto Ret = + U.IEEE.convert(APFloatBase::semPPCDoubleDoubleLegacy, RM, losesInfo); *this = APFloat(ToSemantics, U.IEEE.bitcastToAPInt()); return Ret; } @@ -6113,13 +6117,15 @@ APFloat::opStatus APFloat::convertToInteger(APSInt &result, } double APFloat::convertToDouble() const { - if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEdouble) + if (&getSemantics() == + (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble) return getIEEE().convertToDouble(); assert(isRepresentableBy(getSemantics(), semIEEEdouble) && "Float semantics is not representable by IEEEdouble"); APFloat Temp = *this; bool LosesInfo; - opStatus St = Temp.convert(semIEEEdouble, rmNearestTiesToEven, &LosesInfo); + opStatus St = + Temp.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &LosesInfo); assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); (void)St; return Temp.getIEEE().convertToDouble(); @@ -6127,13 +6133,14 @@ double APFloat::convertToDouble() const { #ifdef HAS_IEE754_FLOAT128 float128 APFloat::convertToQuad() const { - if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad) + if (&getSemantics() == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad) return getIEEE().convertToQuad(); assert(isRepresentableBy(getSemantics(), semIEEEquad) && "Float semantics is not representable by IEEEquad"); APFloat Temp = *this; bool LosesInfo; - opStatus St = Temp.convert(semIEEEquad, rmNearestTiesToEven, &LosesInfo); + opStatus St = + Temp.convert(APFloatBase::semIEEEquad, rmNearestTiesToEven, &LosesInfo); assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); (void)St; return Temp.getIEEE().convertToQuad(); @@ -6141,18 +6148,84 @@ float128 APFloat::convertToQuad() const { #endif float APFloat::convertToFloat() const { - if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle) + if (&getSemantics() == + (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle) return getIEEE().convertToFloat(); assert(isRepresentableBy(getSemantics(), semIEEEsingle) && "Float semantics is not representable by IEEEsingle"); APFloat Temp = *this; bool LosesInfo; - opStatus St = Temp.convert(semIEEEsingle, rmNearestTiesToEven, &LosesInfo); + opStatus St = + Temp.convert(APFloatBase::semIEEEsingle, rmNearestTiesToEven, &LosesInfo); assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); (void)St; return 
Temp.getIEEE().convertToFloat(); } +APFloat::Storage::~Storage() { + if (usesLayout<IEEEFloat>(*semantics)) { + IEEE.~IEEEFloat(); + return; + } + if (usesLayout<DoubleAPFloat>(*semantics)) { + Double.~DoubleAPFloat(); + return; + } + llvm_unreachable("Unexpected semantics"); +} + +APFloat::Storage::Storage(const APFloat::Storage &RHS) { + if (usesLayout<IEEEFloat>(*RHS.semantics)) { + new (this) IEEEFloat(RHS.IEEE); + return; + } + if (usesLayout<DoubleAPFloat>(*RHS.semantics)) { + new (this) DoubleAPFloat(RHS.Double); + return; + } + llvm_unreachable("Unexpected semantics"); +} + +APFloat::Storage::Storage(APFloat::Storage &&RHS) { + if (usesLayout<IEEEFloat>(*RHS.semantics)) { + new (this) IEEEFloat(std::move(RHS.IEEE)); + return; + } + if (usesLayout<DoubleAPFloat>(*RHS.semantics)) { + new (this) DoubleAPFloat(std::move(RHS.Double)); + return; + } + llvm_unreachable("Unexpected semantics"); +} + +APFloat::Storage &APFloat::Storage::operator=(const APFloat::Storage &RHS) { + if (usesLayout<IEEEFloat>(*semantics) && + usesLayout<IEEEFloat>(*RHS.semantics)) { + IEEE = RHS.IEEE; + } else if (usesLayout<DoubleAPFloat>(*semantics) && + usesLayout<DoubleAPFloat>(*RHS.semantics)) { + Double = RHS.Double; + } else if (this != &RHS) { + this->~Storage(); + new (this) Storage(RHS); + } + return *this; +} + +APFloat::Storage &APFloat::Storage::operator=(APFloat::Storage &&RHS) { + if (usesLayout<IEEEFloat>(*semantics) && + usesLayout<IEEEFloat>(*RHS.semantics)) { + IEEE = std::move(RHS.IEEE); + } else if (usesLayout<DoubleAPFloat>(*semantics) && + usesLayout<DoubleAPFloat>(*RHS.semantics)) { + Double = std::move(RHS.Double); + } else if (this != &RHS) { + this->~Storage(); + new (this) Storage(std::move(RHS)); + } + return *this; +} + } // namespace llvm #undef APFLOAT_DISPATCH_ON_SEMANTICS diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index 573ad82..78d6540 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -868,8 +868,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) { nwidth = strlen(name) - 1; } - if (nwidth > width) - width = nwidth; + width = std::max(nwidth, width); } for (int i = 0; i < depth; ++i) { diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp index 9801627..e9660ac1 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp @@ -585,7 +585,7 @@ void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize, uint64_t ShiftedMask = (0xFFFFULL << Shift); uint64_t ZeroChunk = UImm & ~ShiftedMask; uint64_t OneChunk = UImm | ShiftedMask; - uint64_t RotatedImm = (UImm << 32) | (UImm >> 32); + uint64_t RotatedImm = llvm::rotl(UImm, 32); uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask); if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) || AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) || diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 979a8b0..4b22c68 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/RegisterPressure.h" #include <algorithm> +#include <array> namespace llvm { @@ -45,7 +46,7 @@ struct GCNRegPressure { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; } - void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + void clear() 
{ Value.fill(0); } unsigned getNumRegs(RegKind Kind) const { assert(Kind < TOTAL_KINDS); @@ -127,9 +128,7 @@ struct GCNRegPressure { bool less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; - bool operator==(const GCNRegPressure &O) const { - return std::equal(&Value[0], &Value[ValueArraySize], O.Value); - } + bool operator==(const GCNRegPressure &O) const { return Value == O.Value; } bool operator!=(const GCNRegPressure &O) const { return !(*this == O); @@ -160,7 +159,7 @@ private: /// Pressure for all register kinds (first all regular registers kinds, then /// all tuple register kinds). - unsigned Value[ValueArraySize]; + std::array<unsigned, ValueArraySize> Value; static unsigned getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..09ef6ac 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -45,6 +45,9 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom); + // 32-bit ABS is legal for AMDGPU except for R600 + setOperationAction(ISD::ABS, MVT::i32, Expand); + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 1b7cb9b..636e31c 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -699,7 +699,8 @@ public: "Can't encode VTYPE for uninitialized or unknown"); if (TWiden != 0) return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt); - return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic); + return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, + AltFmt); } bool hasSEWLMULRatioOnly() const { return SEWLMULRatioOnly; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index ddb53a2..12f776b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3775,11 +3775,13 @@ std::string RISCVInstrInfo::createMIROperandComment( #define CASE_VFMA_OPCODE_VV(OP) \ CASE_VFMA_OPCODE_LMULS_MF4(OP, VV, E16): \ + case CASE_VFMA_OPCODE_LMULS_MF4(OP##_ALT, VV, E16): \ case CASE_VFMA_OPCODE_LMULS_MF2(OP, VV, E32): \ case CASE_VFMA_OPCODE_LMULS_M1(OP, VV, E64) #define CASE_VFMA_SPLATS(OP) \ CASE_VFMA_OPCODE_LMULS_MF4(OP, VFPR16, E16): \ + case CASE_VFMA_OPCODE_LMULS_MF4(OP##_ALT, VFPR16, E16): \ case CASE_VFMA_OPCODE_LMULS_MF2(OP, VFPR32, E32): \ case CASE_VFMA_OPCODE_LMULS_M1(OP, VFPR64, E64) // clang-format on @@ -4003,11 +4005,13 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, #define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP##_ALT, NEWOP##_ALT, VV, E16) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32) \ CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64) #define CASE_VFMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VFPR16, E16) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP##_ALT, NEWOP##_ALT, 
VFPR16, E16) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VFPR32, E32) \ CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VFPR64, E64) // clang-format on @@ -4469,6 +4473,20 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const { CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2, E32) \ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E16) \ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E32) \ + +#define CASE_FP_WIDEOP_OPCODE_LMULS_ALT(OP) \ + CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF4, E16): \ + case CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF2, E16): \ + case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M1, E16): \ + case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M2, E16): \ + case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M4, E16) + +#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(OP) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4, E16) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2, E16) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M1, E16) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2, E16) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E16) // clang-format on MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, @@ -4478,6 +4496,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, switch (MI.getOpcode()) { default: return nullptr; + case CASE_FP_WIDEOP_OPCODE_LMULS_ALT(FWADD_ALT_WV): + case CASE_FP_WIDEOP_OPCODE_LMULS_ALT(FWSUB_ALT_WV): case CASE_FP_WIDEOP_OPCODE_LMULS(FWADD_WV): case CASE_FP_WIDEOP_OPCODE_LMULS(FWSUB_WV): { assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags) && @@ -4494,6 +4514,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, llvm_unreachable("Unexpected opcode"); CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWADD_WV) CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWSUB_WV) + CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(FWADD_ALT_WV) + CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(FWSUB_ALT_WV) } // clang-format on diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index c9c1246..85700ae 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -44,6 +44,336 @@ let Predicates = [HasStdExtZvfbfmin] in { let mayRaiseFPException = true, Predicates = [HasStdExtZvfbfwma] in defm PseudoVFWMACCBF16 : VPseudoVWMAC_VV_VF_BF_RM; +defset list<VTypeInfoToWide> AllWidenableIntToBF16Vectors = { + def : VTypeInfoToWide<VI8MF8, VBF16MF4>; + def : VTypeInfoToWide<VI8MF4, VBF16MF2>; + def : VTypeInfoToWide<VI8MF2, VBF16M1>; + def : VTypeInfoToWide<VI8M1, VBF16M2>; + def : VTypeInfoToWide<VI8M2, VBF16M4>; + def : VTypeInfoToWide<VI8M4, VBF16M8>; +} + +multiclass VPseudoVALU_VV_VF_RM_BF16 { + foreach m = MxListF in { + defm "" : VPseudoBinaryFV_VV_RM<m, 16/*sew*/>, + SchedBinary<"WriteVFALUV", "ReadVFALUV", "ReadVFALUV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>, + SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVALU_VF_RM_BF16 { + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>, + SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVFWALU_VV_VF_RM_BF16 { + foreach m = MxListFW in { + defm "" : VPseudoBinaryW_VV_RM<m, sew=16>, + SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxListFW in { + 
defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>, + SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVFWALU_WV_WF_RM_BF16 { + foreach m = MxListFW in { + defm "" : VPseudoBinaryW_WV_RM<m, sew=16>, + SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + defvar f = SCALAR_F16; + foreach m = f.MxListFW in { + defm "" : VPseudoBinaryW_WF_RM<m, f, sew=f.SEW>, + SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVFMUL_VV_VF_RM_BF16 { + foreach m = MxListF in { + defm "" : VPseudoBinaryFV_VV_RM<m, 16/*sew*/>, + SchedBinary<"WriteVFMulV", "ReadVFMulV", "ReadVFMulV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>, + SchedBinary<"WriteVFMulF", "ReadVFMulV", "ReadVFMulF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVWMUL_VV_VF_RM_BF16 { + foreach m = MxListFW in { + defm "" : VPseudoBinaryW_VV_RM<m, sew=16>, + SchedBinary<"WriteVFWMulV", "ReadVFWMulV", "ReadVFWMulV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxListFW in { + defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>, + SchedBinary<"WriteVFWMulF", "ReadVFWMulV", "ReadVFWMulF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVMAC_VV_VF_AAXA_RM_BF16 { + foreach m = MxListF in { + defm "" : VPseudoTernaryV_VV_AAXA_RM<m, 16/*sew*/>, + SchedTernary<"WriteVFMulAddV", "ReadVFMulAddV", "ReadVFMulAddV", + "ReadVFMulAddV", m.MX, 16/*sew*/>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoTernaryV_VF_AAXA_RM<m, f, f.SEW>, + SchedTernary<"WriteVFMulAddF", "ReadVFMulAddV", "ReadVFMulAddF", + "ReadVFMulAddV", m.MX, f.SEW>; + } +} + +multiclass VPseudoVWMAC_VV_VF_RM_BF16 { + foreach m = MxListFW in { + defm "" : VPseudoTernaryW_VV_RM<m, sew=16>, + SchedTernary<"WriteVFWMulAddV", "ReadVFWMulAddV", + "ReadVFWMulAddV", "ReadVFWMulAddV", m.MX, 16/*sew*/>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxListFW in { + defm "" : VPseudoTernaryW_VF_RM<m, f, sew=f.SEW>, + SchedTernary<"WriteVFWMulAddF", "ReadVFWMulAddV", + "ReadVFWMulAddF", "ReadVFWMulAddV", m.MX, f.SEW>; + } +} + +multiclass VPseudoVRCP_V_BF16 { + foreach m = MxListF in { + defvar mx = m.MX; + let VLMul = m.value in { + def "_V_" # mx # "_E16" + : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/, + forcePassthruRead=true>; + def "_V_" # mx # "_E16_MASK" + : VPseudoUnaryMask<m.vrclass, m.vrclass>, + RISCVMaskedPseudo<MaskIdx = 2>, + SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/, + forcePassthruRead=true>; + } + } +} + +multiclass VPseudoVRCP_V_RM_BF16 { + foreach m = MxListF in { + defvar mx = m.MX; + let VLMul = m.value in { + def "_V_" # mx # "_E16" + : VPseudoUnaryNoMaskRoundingMode<m.vrclass, m.vrclass>, + SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/, + forcePassthruRead=true>; + def "_V_" # mx # "_E16_MASK" + : VPseudoUnaryMaskRoundingMode<m.vrclass, m.vrclass>, + RISCVMaskedPseudo<MaskIdx = 2>, + SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/, + forcePassthruRead=true>; + } + } +} + +multiclass VPseudoVMAX_VV_VF_BF16 { + foreach m = MxListF in { + defm "" : VPseudoBinaryV_VV<m, sew=16>, + SchedBinary<"WriteVFMinMaxV", "ReadVFMinMaxV", "ReadVFMinMaxV", + m.MX, 16/*sew*/, forcePassthruRead=true>; 
+ } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF<m, f, f.SEW>, + SchedBinary<"WriteVFMinMaxF", "ReadVFMinMaxV", "ReadVFMinMaxF", + m.MX, f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVSGNJ_VV_VF_BF16 { + foreach m = MxListF in { + defm "" : VPseudoBinaryV_VV<m, sew=16>, + SchedBinary<"WriteVFSgnjV", "ReadVFSgnjV", "ReadVFSgnjV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF<m, f, f.SEW>, + SchedBinary<"WriteVFSgnjF", "ReadVFSgnjV", "ReadVFSgnjF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVWCVTF_V_BF16 { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxListW in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=8, + TargetConstraintType=3>, + SchedUnary<"WriteVFWCvtIToFV", "ReadVFWCvtIToFV", m.MX, 8/*sew*/, + forcePassthruRead=true>; +} + +multiclass VPseudoVWCVTD_V_BF16 { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxListFW in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=16, + TargetConstraintType=3>, + SchedUnary<"WriteVFWCvtFToFV", "ReadVFWCvtFToFV", m.MX, 16/*sew*/, + forcePassthruRead=true>; +} + +multiclass VPseudoVNCVTD_W_BF16 { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxListFW in + defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint, sew=16, + TargetConstraintType=2>, + SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, 16/*sew*/, + forcePassthruRead=true>; +} + +multiclass VPseudoVNCVTD_W_RM_BF16 { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxListFW in + defm _W : VPseudoConversionRoundingMode<m.vrclass, m.wvrclass, m, + constraint, sew=16, + TargetConstraintType=2>, + SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, 16/*sew*/, + forcePassthruRead=true>; +} + +let Predicates = [HasStdExtZvfbfa], AltFmtType = IS_ALTFMT in { +let mayRaiseFPException = true in { +defm PseudoVFADD_ALT : VPseudoVALU_VV_VF_RM_BF16; +defm PseudoVFSUB_ALT : VPseudoVALU_VV_VF_RM_BF16; +defm PseudoVFRSUB_ALT : VPseudoVALU_VF_RM_BF16; +} + +let mayRaiseFPException = true in { +defm PseudoVFWADD_ALT : VPseudoVFWALU_VV_VF_RM_BF16; +defm PseudoVFWSUB_ALT : VPseudoVFWALU_VV_VF_RM_BF16; +defm PseudoVFWADD_ALT : VPseudoVFWALU_WV_WF_RM_BF16; +defm PseudoVFWSUB_ALT : VPseudoVFWALU_WV_WF_RM_BF16; +} + +let mayRaiseFPException = true in +defm PseudoVFMUL_ALT : VPseudoVFMUL_VV_VF_RM_BF16; + +let mayRaiseFPException = true in +defm PseudoVFWMUL_ALT : VPseudoVWMUL_VV_VF_RM_BF16; + +let mayRaiseFPException = true in { +defm PseudoVFMACC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFNMACC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFMSAC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFNMSAC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFMADD_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFNMADD_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFMSUB_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFNMSUB_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +} + +let mayRaiseFPException = true in { +defm PseudoVFWMACC_ALT : VPseudoVWMAC_VV_VF_RM_BF16; +defm PseudoVFWNMACC_ALT : VPseudoVWMAC_VV_VF_RM_BF16; +defm PseudoVFWMSAC_ALT : VPseudoVWMAC_VV_VF_RM_BF16; +defm PseudoVFWNMSAC_ALT : VPseudoVWMAC_VV_VF_RM_BF16; +} + +let mayRaiseFPException = true in +defm PseudoVFRSQRT7_ALT : VPseudoVRCP_V_BF16; + +let mayRaiseFPException = true in +defm PseudoVFREC7_ALT : VPseudoVRCP_V_RM_BF16; + +let mayRaiseFPException = true in { 
+defm PseudoVFMIN_ALT : VPseudoVMAX_VV_VF_BF16; +defm PseudoVFMAX_ALT : VPseudoVMAX_VV_VF_BF16; +} + +defm PseudoVFSGNJ_ALT : VPseudoVSGNJ_VV_VF_BF16; +defm PseudoVFSGNJN_ALT : VPseudoVSGNJ_VV_VF_BF16; +defm PseudoVFSGNJX_ALT : VPseudoVSGNJ_VV_VF_BF16; + +let mayRaiseFPException = true in { +defm PseudoVMFEQ_ALT : VPseudoVCMPM_VV_VF; +defm PseudoVMFNE_ALT : VPseudoVCMPM_VV_VF; +defm PseudoVMFLT_ALT : VPseudoVCMPM_VV_VF; +defm PseudoVMFLE_ALT : VPseudoVCMPM_VV_VF; +defm PseudoVMFGT_ALT : VPseudoVCMPM_VF; +defm PseudoVMFGE_ALT : VPseudoVCMPM_VF; +} + +defm PseudoVFCLASS_ALT : VPseudoVCLS_V; + +defm PseudoVFMERGE_ALT : VPseudoVMRG_FM; + +defm PseudoVFMV_V_ALT : VPseudoVMV_F; + +let mayRaiseFPException = true in { +defm PseudoVFWCVT_F_XU_ALT : VPseudoVWCVTF_V_BF16; +defm PseudoVFWCVT_F_X_ALT : VPseudoVWCVTF_V_BF16; + +defm PseudoVFWCVT_F_F_ALT : VPseudoVWCVTD_V_BF16; +} // mayRaiseFPException = true + +let mayRaiseFPException = true in { +let hasSideEffects = 0, hasPostISelHook = 1 in { +defm PseudoVFNCVT_XU_F_ALT : VPseudoVNCVTI_W_RM; +defm PseudoVFNCVT_X_F_ALT : VPseudoVNCVTI_W_RM; +} + +defm PseudoVFNCVT_RTZ_XU_F_ALT : VPseudoVNCVTI_W; +defm PseudoVFNCVT_RTZ_X_F_ALT : VPseudoVNCVTI_W; + +defm PseudoVFNCVT_F_F_ALT : VPseudoVNCVTD_W_RM_BF16; + +defm PseudoVFNCVT_ROD_F_F_ALT : VPseudoVNCVTD_W_BF16; +} // mayRaiseFPException = true + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + defvar f = SCALAR_F16; + let HasSEWOp = 1, BaseInstr = VFMV_F_S in + def "PseudoVFMV_" # f.FX # "_S_ALT" : + RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew)>, + Sched<[WriteVMovFS, ReadVMovFS]>; + let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1, + Constraints = "$rd = $passthru" in + def "PseudoVFMV_S_" # f.FX # "_ALT" : + RISCVVPseudo<(outs VR:$rd), + (ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew)>, + Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>; +} + +defm PseudoVFSLIDE1UP_ALT : VPseudoVSLD1_VF<"@earlyclobber $rd">; +defm PseudoVFSLIDE1DOWN_ALT : VPseudoVSLD1_VF; +} // Predicates = [HasStdExtZvfbfa], AltFmtType = IS_ALTFMT + //===----------------------------------------------------------------------===// // Patterns //===----------------------------------------------------------------------===// @@ -108,6 +438,43 @@ let Predicates = [HasStdExtZvfbfmin] in { FRM_DYN, fvti.AVL, fvti.Log2SEW, TA_MA)>; } + + foreach fvti = AllBF16Vectors in { + defvar ivti = GetIntVTypeInfo<fvti>.Vti; + + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), + (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), + fvti.RegClass:$rs2)), + (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) + (fvti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), + (SplatFPOp (fvti.Scalar fpimm0)), + fvti.RegClass:$rs2)), + (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) + (fvti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), + (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), + fvti.RegClass:$rs2, + fvti.RegClass:$passthru, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) + fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), + GPR:$vl, fvti.Log2SEW)>; + + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), + (SplatFPOp (fvti.Scalar fpimm0)), + fvti.RegClass:$rs2, + fvti.RegClass:$passthru, + 
VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) + fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), + GPR:$vl, fvti.Log2SEW)>; + } } let Predicates = [HasStdExtZvfbfwma] in { @@ -118,3 +485,224 @@ let Predicates = [HasStdExtZvfbfwma] in { defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16", AllWidenableBF16ToFloatVectors>; } + +multiclass VPatConversionVI_VF_BF16<string intrinsic, string instruction> { + foreach fvti = AllBF16Vectors in { + defvar ivti = GetIntVTypeInfo<fvti>.Vti; + let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, + GetVTypePredicates<ivti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "V", + ivti.Vector, fvti.Vector, ivti.Mask, fvti.Log2SEW, + fvti.LMul, ivti.RegClass, fvti.RegClass>; + } +} + +multiclass VPatConversionWF_VI_BF16<string intrinsic, string instruction, + bit isSEWAware = 0> { + foreach vtiToWti = AllWidenableIntToBF16Vectors in { + defvar vti = vtiToWti.Vti; + defvar fwti = vtiToWti.Wti; + let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates, + GetVTypePredicates<fwti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "V", + fwti.Vector, vti.Vector, fwti.Mask, vti.Log2SEW, + vti.LMul, fwti.RegClass, vti.RegClass, isSEWAware>; + } +} + +multiclass VPatConversionWF_VF_BF16<string intrinsic, string instruction, + bit isSEWAware = 0> { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates, + GetVTypeMinimalPredicates<fwti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "V", + fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW, + fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>; + } +} + +multiclass VPatConversionVI_WF_BF16<string intrinsic, string instruction> { + foreach vtiToWti = AllWidenableIntToBF16Vectors in { + defvar vti = vtiToWti.Vti; + defvar fwti = vtiToWti.Wti; + let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates, + GetVTypePredicates<fwti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "W", + vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW, + vti.LMul, vti.RegClass, fwti.RegClass>; + } +} + +multiclass VPatConversionVI_WF_RM_BF16<string intrinsic, string instruction> { + foreach vtiToWti = AllWidenableIntToBF16Vectors in { + defvar vti = vtiToWti.Vti; + defvar fwti = vtiToWti.Wti; + let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates, + GetVTypePredicates<fwti>.Predicates) in + defm : VPatConversionRoundingMode<intrinsic, instruction, "W", + vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW, + vti.LMul, vti.RegClass, fwti.RegClass>; + } +} + +multiclass VPatConversionVF_WF_BF16<string intrinsic, string instruction, + bit isSEWAware = 0> { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, + GetVTypePredicates<fwti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "W", + fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW, + fvti.LMul, fvti.RegClass, fwti.RegClass, isSEWAware>; + } +} + +let Predicates = [HasStdExtZvfbfa] in { +defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfadd", "PseudoVFADD_ALT", + AllBF16Vectors, isSEWAware = 1>; +defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfsub", "PseudoVFSUB_ALT", + AllBF16Vectors, isSEWAware = 1>; +defm : VPatBinaryV_VX_RM<"int_riscv_vfrsub", "PseudoVFRSUB_ALT", + 
AllBF16Vectors, isSEWAware = 1>; +defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwadd", "PseudoVFWADD_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwsub", "PseudoVFWSUB_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatBinaryW_WV_WX_RM<"int_riscv_vfwadd_w", "PseudoVFWADD_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatBinaryW_WV_WX_RM<"int_riscv_vfwsub_w", "PseudoVFWSUB_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfmul", "PseudoVFMUL_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwmul", "PseudoVFWMUL_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmacc", "PseudoVFMACC_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmacc", "PseudoVFNMACC_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmsac", "PseudoVFMSAC_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmsac", "PseudoVFNMSAC_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmadd", "PseudoVFMADD_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmadd", "PseudoVFNMADD_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmsub", "PseudoVFMSUB_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmsub", "PseudoVFNMSUB_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmacc", "PseudoVFWMACC_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwnmacc", "PseudoVFWNMACC_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmsac", "PseudoVFWMSAC_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwnmsac", "PseudoVFWNMSAC_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatUnaryV_V<"int_riscv_vfrsqrt7", "PseudoVFRSQRT7_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatUnaryV_V_RM<"int_riscv_vfrec7", "PseudoVFREC7_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfmin", "PseudoVFMIN_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfmax", "PseudoVFMAX_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnj", "PseudoVFSGNJ_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnjn", "PseudoVFSGNJN_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnjx", "PseudoVFSGNJX_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryM_VV_VX<"int_riscv_vmfeq", "PseudoVMFEQ_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VV_VX<"int_riscv_vmfle", "PseudoVMFLE_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VV_VX<"int_riscv_vmflt", "PseudoVMFLT_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VV_VX<"int_riscv_vmfne", "PseudoVMFNE_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VX<"int_riscv_vmfgt", "PseudoVMFGT_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VX<"int_riscv_vmfge", "PseudoVMFGE_ALT", AllBF16Vectors>; +defm : VPatBinarySwappedM_VV<"int_riscv_vmfgt", "PseudoVMFLT_ALT", AllBF16Vectors>; +defm : VPatBinarySwappedM_VV<"int_riscv_vmfge", "PseudoVMFLE_ALT", AllBF16Vectors>; +defm : VPatConversionVI_VF_BF16<"int_riscv_vfclass", "PseudoVFCLASS_ALT">; +foreach vti = AllBF16Vectors in { + let Predicates 
= GetVTypePredicates<vti>.Predicates in + defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE_ALT", + "V"#vti.ScalarSuffix#"M", + vti.Vector, + vti.Vector, vti.Scalar, vti.Mask, + vti.Log2SEW, vti.LMul, vti.RegClass, + vti.RegClass, vti.ScalarRegClass>; +} +defm : VPatConversionWF_VI_BF16<"int_riscv_vfwcvt_f_xu_v", "PseudoVFWCVT_F_XU_ALT", + isSEWAware=1>; +defm : VPatConversionWF_VI_BF16<"int_riscv_vfwcvt_f_x_v", "PseudoVFWCVT_F_X_ALT", + isSEWAware=1>; +defm : VPatConversionWF_VF_BF16<"int_riscv_vfwcvt_f_f_v", "PseudoVFWCVT_F_F_ALT", + isSEWAware=1>; +defm : VPatConversionVI_WF_RM_BF16<"int_riscv_vfncvt_xu_f_w", "PseudoVFNCVT_XU_F_ALT">; +defm : VPatConversionVI_WF_RM_BF16<"int_riscv_vfncvt_x_f_w", "PseudoVFNCVT_X_F_ALT">; +defm : VPatConversionVI_WF_BF16<"int_riscv_vfncvt_rtz_xu_f_w", "PseudoVFNCVT_RTZ_XU_F_ALT">; +defm : VPatConversionVI_WF_BF16<"int_riscv_vfncvt_rtz_x_f_w", "PseudoVFNCVT_RTZ_X_F_ALT">; +defm : VPatConversionVF_WF_RM<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatConversionVF_WF_BF16<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F_ALT", + isSEWAware=1>; +defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP_ALT", AllBF16Vectors>; +defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN_ALT", AllBF16Vectors>; + +foreach fvti = AllBF16Vectors in { + defvar ivti = GetIntVTypeInfo<fvti>.Vti; + let Predicates = GetVTypePredicates<ivti>.Predicates in { + // 13.16. Vector Floating-Point Move Instruction + // If we're splatting fpimm0, use vmv.v.x vd, x0. + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)), + (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX) + $passthru, 0, GPR:$vl, fvti.Log2SEW, TU_MU)>; + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), VLOpFrag)), + (!cast<Instruction>("PseudoVMV_V_X_"#fvti.LMul.MX) + $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>; + } + + let Predicates = GetVTypePredicates<fvti>.Predicates in { + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), + (!cast<Instruction>("PseudoVFMV_V_ALT_" # fvti.ScalarSuffix # "_" # + fvti.LMul.MX) + $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), + GPR:$vl, fvti.Log2SEW, TU_MU)>; + } +} + +foreach vti = NoGroupBF16Vectors in { + let Predicates = GetVTypePredicates<vti>.Predicates in { + def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), + (vti.Scalar (fpimm0)), + VLOpFrag)), + (PseudoVMV_S_X $passthru, (XLenVT X0), GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), + (vti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), + VLOpFrag)), + (PseudoVMV_S_X $passthru, GPR:$imm, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), + vti.ScalarRegClass:$rs1, + VLOpFrag)), + (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix#"_ALT") + vti.RegClass:$passthru, + (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>; + } + + defvar vfmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_", + vti.ScalarSuffix, + "_S_ALT")); + // Only pattern-match extract-element operations where the index is 0. Any + // other index will have been custom-lowered to slide the vector correctly + // into place. 
+ let Predicates = GetVTypePredicates<vti>.Predicates in + def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)), + (vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>; +} +} // Predicates = [HasStdExtZvfbfa] diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 6acf799..334db4b 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -288,9 +288,12 @@ public: bool hasVInstructionsI64() const { return HasStdExtZve64x; } bool hasVInstructionsF16Minimal() const { return HasStdExtZvfhmin; } bool hasVInstructionsF16() const { return HasStdExtZvfh; } - bool hasVInstructionsBF16Minimal() const { return HasStdExtZvfbfmin; } + bool hasVInstructionsBF16Minimal() const { + return HasStdExtZvfbfmin || HasStdExtZvfbfa; + } bool hasVInstructionsF32() const { return HasStdExtZve32f; } bool hasVInstructionsF64() const { return HasStdExtZve64d; } + bool hasVInstructionsBF16() const { return HasStdExtZvfbfa; } // F16 and F64 both require F32. bool hasVInstructionsAnyF() const { return hasVInstructionsF32(); } bool hasVInstructionsFullMultiply() const { return HasStdExtV; } diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index 100f1ec..53ec712 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -1879,28 +1879,34 @@ bool X86InstructionSelector::selectSelect(MachineInstr &I, unsigned OpCmp; LLT Ty = MRI.getType(DstReg); - switch (Ty.getSizeInBits()) { - default: - return false; - case 8: - OpCmp = X86::CMOV_GR8; - break; - case 16: - OpCmp = STI.canUseCMOV() ? X86::CMOV16rr : X86::CMOV_GR16; - break; - case 32: - OpCmp = STI.canUseCMOV() ? X86::CMOV32rr : X86::CMOV_GR32; - break; - case 64: - assert(STI.is64Bit() && STI.canUseCMOV()); - OpCmp = X86::CMOV64rr; - break; + if (Ty.getSizeInBits() == 80) { + BuildMI(*Sel.getParent(), Sel, Sel.getDebugLoc(), TII.get(X86::CMOVE_Fp80), + DstReg) + .addReg(Sel.getTrueReg()) + .addReg(Sel.getFalseReg()); + } else { + switch (Ty.getSizeInBits()) { + default: + return false; + case 8: + OpCmp = X86::CMOV_GR8; + break; + case 16: + OpCmp = STI.canUseCMOV() ? X86::CMOV16rr : X86::CMOV_GR16; + break; + case 32: + OpCmp = STI.canUseCMOV() ? 
X86::CMOV32rr : X86::CMOV_GR32; + break; + case 64: + assert(STI.is64Bit() && STI.canUseCMOV()); + OpCmp = X86::CMOV64rr; + break; + } + BuildMI(*Sel.getParent(), Sel, Sel.getDebugLoc(), TII.get(OpCmp), DstReg) + .addReg(Sel.getTrueReg()) + .addReg(Sel.getFalseReg()) + .addImm(X86::COND_E); } - BuildMI(*Sel.getParent(), Sel, Sel.getDebugLoc(), TII.get(OpCmp), DstReg) - .addReg(Sel.getTrueReg()) - .addReg(Sel.getFalseReg()) - .addImm(X86::COND_E); - const TargetRegisterClass *DstRC = getRegClass(Ty, DstReg, MRI); if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain CMOV\n"); diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 28fa2cd..e792b1b 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -575,10 +575,13 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, // todo: vectors and address spaces getActionDefinitionsBuilder(G_SELECT) - .legalFor({{s8, s32}, {s16, s32}, {s32, s32}, {s64, s32}, {p0, s32}}) + .legalFor({{s16, s32}, {s32, s32}, {p0, s32}}) + .legalFor(!HasCMOV, {{s8, s32}}) + .legalFor(Is64Bit, {{s64, s32}}) + .legalFor(UseX87, {{s80, s32}}) + .clampScalar(1, s32, s32) .widenScalarToNextPow2(0, /*Min=*/8) - .clampScalar(0, HasCMOV ? s16 : s8, sMaxScalar) - .clampScalar(1, s32, s32); + .clampScalar(0, HasCMOV ? s16 : s8, sMaxScalar); // memory intrinsics getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2feee05..b5f8ee5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41846,7 +41846,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) || X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget)) return SDValue(); - Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4); + Imm = llvm::rotl<uint8_t>(Imm, 4); return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0, DAG.getTargetConstant(Imm, DL, MVT::i8)); }; @@ -44813,10 +44813,16 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } case X86ISD::PCMPGT: // icmp sgt(0, R) == ashr(R, BitWidth-1). - // iff we only need the sign bit then we can use R directly. - if (OriginalDemandedBits.isSignMask() && - ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) - return TLO.CombineTo(Op, Op.getOperand(1)); + if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) { + // iff we only need the signbit then we can use R directly. + if (OriginalDemandedBits.isSignMask()) + return TLO.CombineTo(Op, Op.getOperand(1)); + // otherwise we just need R's signbit for the comparison. + APInt SignMask = APInt::getSignMask(BitWidth); + if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts, + Known, TLO, Depth + 1)) + return true; + } break; case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 09cb225..a8eb9b9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3757,6 +3757,10 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder, // (x < y) ? -1 : zext(x > y) // (x > y) ? 1 : sext(x != y) // (x > y) ? 1 : sext(x < y) +// (x == y) ? 0 : (x > y ? 
1 : -1) +// (x == y) ? 0 : (x < y ? -1 : 1) +// Special case: x == C ? 0 : (x > C - 1 ? 1 : -1) +// Special case: x == C ? 0 : (x < C + 1 ? -1 : 1) // Into ucmp/scmp(x, y), where signedness is determined by the signedness // of the comparison in the original sequence. Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { @@ -3849,6 +3853,44 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { } } + // Special cases with constants: x == C ? 0 : (x > C-1 ? 1 : -1) + if (Pred == ICmpInst::ICMP_EQ && match(TV, m_Zero())) { + const APInt *C; + if (match(RHS, m_APInt(C))) { + CmpPredicate InnerPred; + Value *InnerRHS; + const APInt *InnerTV, *InnerFV; + if (match(FV, + m_Select(m_ICmp(InnerPred, m_Specific(LHS), m_Value(InnerRHS)), + m_APInt(InnerTV), m_APInt(InnerFV)))) { + + // x == C ? 0 : (x > C-1 ? 1 : -1) + if (ICmpInst::isGT(InnerPred) && InnerTV->isOne() && + InnerFV->isAllOnes()) { + IsSigned = ICmpInst::isSigned(InnerPred); + bool CanSubOne = IsSigned ? !C->isMinSignedValue() : !C->isMinValue(); + if (CanSubOne) { + APInt Cminus1 = *C - 1; + if (match(InnerRHS, m_SpecificInt(Cminus1))) + Replace = true; + } + } + + // x == C ? 0 : (x < C+1 ? -1 : 1) + if (ICmpInst::isLT(InnerPred) && InnerTV->isAllOnes() && + InnerFV->isOne()) { + IsSigned = ICmpInst::isSigned(InnerPred); + bool CanAddOne = IsSigned ? !C->isMaxSignedValue() : !C->isMaxValue(); + if (CanAddOne) { + APInt Cplus1 = *C + 1; + if (match(InnerRHS, m_SpecificInt(Cplus1))) + Replace = true; + } + } + } + } + } + Intrinsic::ID IID = IsSigned ? Intrinsic::scmp : Intrinsic::ucmp; if (Replace) return replaceInstUsesWith( diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 9693ae6..4947d03 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -634,18 +635,10 @@ private: /// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV /// changes. bool mergeInValue(ValueLatticeElement &IV, Value *V, - ValueLatticeElement MergeWithV, + const ValueLatticeElement &MergeWithV, ValueLatticeElement::MergeOptions Opts = { /*MayIncludeUndef=*/false, /*CheckWiden=*/false}); - bool mergeInValue(Value *V, ValueLatticeElement MergeWithV, - ValueLatticeElement::MergeOptions Opts = { - /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) { - assert(!V->getType()->isStructTy() && - "non-structs should use markConstant"); - return mergeInValue(ValueState[V], V, MergeWithV, Opts); - } - /// getValueState - Return the ValueLatticeElement object that corresponds to /// the value. This function handles the case when the value hasn't been seen /// yet by properly seeding constants etc. 
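The recurring pattern in the SCCPSolver hunks here, replacing the removed two-argument mergeInValue(V, X) convenience overload with explicit mergeInValue(ValueState[V], V, X) and passing the lattice value by const reference, is about DenseMap reference stability as much as avoiding copies. A minimal sketch of the hazard (stand-in Lattice type; only llvm::DenseMap below is real LLVM API):

#include "llvm/ADT/DenseMap.h"
#include <cassert>

namespace {
struct Lattice {
  int Level = 0; // simplified: 0 = unknown, 1 = constant, 2 = overdefined
  bool mergeIn(const Lattice &Other) {
    if (Other.Level <= Level)
      return false;
    Level = Other.Level;
    return true;
  }
};

llvm::DenseMap<void *, Lattice> ValueState;

// Passing MergeWithV by const reference avoids one lattice copy per merge,
// but the caller must keep the reference valid across the call.
bool mergeInValue(Lattice &IV, const Lattice &MergeWithV) {
  return IV.mergeIn(MergeWithV);
}

void mergeOperandState(void *V, void *Op) {
  const Lattice &OpState = ValueState[Op]; // may grow/rehash the map
  // DenseMap::operator[] can rehash on insertion and invalidate OpState, so
  // the destination slot must already exist (mirrors the assert in the patch).
  assert(ValueState.contains(V) && "V must already have a lattice slot");
  mergeInValue(ValueState[V], OpState); // no insertion: OpState stays valid
}
} // namespace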
@@ -768,6 +761,7 @@ private: void handleCallArguments(CallBase &CB); void handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx); + bool isInstFullyOverDefined(Instruction &Inst); private: friend class InstVisitor<SCCPInstVisitor>; @@ -987,7 +981,7 @@ public: void trackValueOfArgument(Argument *A) { if (A->getType()->isStructTy()) return (void)markOverdefined(A); - mergeInValue(A, getArgAttributeVL(A)); + mergeInValue(ValueState[A], A, getArgAttributeVL(A)); } bool isStructLatticeConstant(Function *F, StructType *STy); @@ -1128,8 +1122,7 @@ bool SCCPInstVisitor::isStructLatticeConstant(Function *F, StructType *STy) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i)); assert(It != TrackedMultipleRetVals.end()); - ValueLatticeElement LV = It->second; - if (!SCCPSolver::isConstant(LV)) + if (!SCCPSolver::isConstant(It->second)) return false; } return true; @@ -1160,7 +1153,7 @@ Constant *SCCPInstVisitor::getConstantOrNull(Value *V) const { std::vector<Constant *> ConstVals; auto *ST = cast<StructType>(V->getType()); for (unsigned I = 0, E = ST->getNumElements(); I != E; ++I) { - ValueLatticeElement LV = LVs[I]; + const ValueLatticeElement &LV = LVs[I]; ConstVals.push_back(SCCPSolver::isConstant(LV) ? getConstant(LV, ST->getElementType(I)) : UndefValue::get(ST->getElementType(I))); @@ -1225,7 +1218,7 @@ void SCCPInstVisitor::visitInstruction(Instruction &I) { } bool SCCPInstVisitor::mergeInValue(ValueLatticeElement &IV, Value *V, - ValueLatticeElement MergeWithV, + const ValueLatticeElement &MergeWithV, ValueLatticeElement::MergeOptions Opts) { if (IV.mergeIn(MergeWithV, Opts)) { pushUsersToWorkList(V); @@ -1264,7 +1257,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI, return; } - ValueLatticeElement BCValue = getValueState(BI->getCondition()); + const ValueLatticeElement &BCValue = getValueState(BI->getCondition()); ConstantInt *CI = getConstantInt(BCValue, BI->getCondition()->getType()); if (!CI) { // Overdefined condition variables, and branches on unfoldable constant @@ -1326,7 +1319,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI, // the target as executable. if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) { // Casts are folded by visitCastInst. - ValueLatticeElement IBRValue = getValueState(IBR->getAddress()); + const ValueLatticeElement &IBRValue = getValueState(IBR->getAddress()); BlockAddress *Addr = dyn_cast_or_null<BlockAddress>( getConstant(IBRValue, IBR->getAddress()->getType())); if (!Addr) { // Overdefined or unknown condition? @@ -1383,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // 7. If a conditional branch has a value that is overdefined, make all // successors executable. void SCCPInstVisitor::visitPHINode(PHINode &PN) { - // If this PN returns a struct, just mark the result overdefined. - // TODO: We could do a lot better than this if code actually uses this. - if (PN.getType()->isStructTy()) - return (void)markOverdefined(&PN); - - if (getValueState(&PN).isOverdefined()) - return; // Quick exit - // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. 
if (PN.getNumIncomingValues() > 64) return (void)markOverdefined(&PN); - unsigned NumActiveIncoming = 0; + if (isInstFullyOverDefined(PN)) + return; + SmallVector<unsigned> FeasibleIncomingIndices; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) + continue; + FeasibleIncomingIndices.push_back(i); + } // Look at all of the executable operands of the PHI node. If any of them // are overdefined, the PHI becomes overdefined as well. If they are all // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is a constant // range. If there are no executable operands, the PHI remains unknown. - ValueLatticeElement PhiState = getValueState(&PN); - for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { - if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) - continue; - - ValueLatticeElement IV = getValueState(PN.getIncomingValue(i)); - PhiState.mergeIn(IV); - NumActiveIncoming++; - if (PhiState.isOverdefined()) - break; + if (StructType *STy = dyn_cast<StructType>(PN.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + ValueLatticeElement PhiState = getStructValueState(&PN, i); + if (PhiState.isOverdefined()) + continue; + for (unsigned j : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = + getStructValueState(PN.getIncomingValue(j), i); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i); + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); + } + } else { + ValueLatticeElement PhiState = getValueState(&PN); + for (unsigned i : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + // We allow up to 1 range extension per active incoming value and one + // additional extension. Note that we manually adjust the number of range + // extensions to match the number of active incoming values. This helps to + // limit multiple extensions caused by the same incoming value, if other + // incoming values are equal. + ValueLatticeElement &PhiStateRef = ValueState[&PN]; + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); } - - // We allow up to 1 range extension per active incoming value and one - // additional extension. Note that we manually adjust the number of range - // extensions to match the number of active incoming values. This helps to - // limit multiple extensions caused by the same incoming value, if other - // incoming values are equal. 
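To see what the restructured visitPHINode computes, here is a compact model of the new struct-PHI path: feasible incoming edges are collected once, each struct element keeps its own lattice cell, and widening is capped at one step per feasible edge plus one. This is a sketch with simplified, non-LLVM stand-in types; the real merge logic is richer:

#include <algorithm>
#include <vector>

struct Lattice {
  bool Overdefined = false;
  unsigned NumRangeExtensions = 0;
  // Simplified: treat every merge as a potential widening step; the real
  // lattice only counts merges that actually extend a constant range.
  bool mergeIn(const Lattice &Other, unsigned MaxWidenSteps) {
    if (Overdefined)
      return false;
    if (Other.Overdefined || ++NumRangeExtensions > MaxWidenSteps) {
      Overdefined = true;
      return true;
    }
    return false;
  }
};

// One lattice cell per struct element: an element that is already
// overdefined is skipped, but the remaining elements keep refining, so the
// PHI as a whole counts as fully overdefined only when every element is.
bool mergeStructPhi(std::vector<Lattice> &PhiElts,
                    const std::vector<std::vector<Lattice>> &Incoming,
                    const std::vector<unsigned> &FeasibleIdx) {
  const unsigned MaxWidenSteps =
      static_cast<unsigned>(FeasibleIdx.size()) + 1;
  bool Changed = false;
  for (unsigned E = 0; E != PhiElts.size(); ++E) {
    if (PhiElts[E].Overdefined)
      continue;
    for (unsigned I : FeasibleIdx)
      Changed |= PhiElts[E].mergeIn(Incoming[I][E], MaxWidenSteps);
    PhiElts[E].NumRangeExtensions =
        std::max<unsigned>(FeasibleIdx.size(), PhiElts[E].NumRangeExtensions);
  }
  return Changed;
}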
- mergeInValue(&PN, PhiState, - ValueLatticeElement::MergeOptions().setMaxWidenSteps( - NumActiveIncoming + 1)); - ValueLatticeElement &PhiStateRef = getValueState(&PN); - PhiStateRef.setNumRangeExtensions( - std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions())); } void SCCPInstVisitor::visitReturnInst(ReturnInst &I) { @@ -1481,7 +1491,7 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { } } - ValueLatticeElement OpSt = getValueState(I.getOperand(0)); + const ValueLatticeElement &OpSt = getValueState(I.getOperand(0)); if (OpSt.isUnknownOrUndef()) return; @@ -1496,9 +1506,9 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { if (I.getDestTy()->isIntOrIntVectorTy() && I.getSrcTy()->isIntOrIntVectorTy() && I.getOpcode() != Instruction::BitCast) { - auto &LV = getValueState(&I); ConstantRange OpRange = OpSt.asConstantRange(I.getSrcTy(), /*UndefAllowed=*/false); + auto &LV = getValueState(&I); Type *DestTy = I.getDestTy(); ConstantRange Res = ConstantRange::getEmpty(DestTy->getScalarSizeInBits()); @@ -1516,19 +1526,24 @@ void SCCPInstVisitor::handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx) { Value *LHS = WO->getLHS(), *RHS = WO->getRHS(); - ValueLatticeElement L = getValueState(LHS); - ValueLatticeElement R = getValueState(RHS); + Type *Ty = LHS->getType(); + addAdditionalUser(LHS, &EVI); addAdditionalUser(RHS, &EVI); - if (L.isUnknownOrUndef() || R.isUnknownOrUndef()) - return; // Wait to resolve. - Type *Ty = LHS->getType(); + const ValueLatticeElement &L = getValueState(LHS); + if (L.isUnknownOrUndef()) + return; // Wait to resolve. ConstantRange LR = L.asConstantRange(Ty, /*UndefAllowed=*/false); + + const ValueLatticeElement &R = getValueState(RHS); + if (R.isUnknownOrUndef()) + return; // Wait to resolve. + ConstantRange RR = R.asConstantRange(Ty, /*UndefAllowed=*/false); if (Idx == 0) { ConstantRange Res = LR.binaryOp(WO->getBinaryOp(), RR); - mergeInValue(&EVI, ValueLatticeElement::getRange(Res)); + mergeInValue(ValueState[&EVI], &EVI, ValueLatticeElement::getRange(Res)); } else { assert(Idx == 1 && "Index can only be 0 or 1"); ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion( @@ -1560,7 +1575,7 @@ void SCCPInstVisitor::visitExtractValueInst(ExtractValueInst &EVI) { if (auto *WO = dyn_cast<WithOverflowInst>(AggVal)) return handleExtractOfWithOverflow(EVI, WO, i); ValueLatticeElement EltVal = getStructValueState(AggVal, i); - mergeInValue(getValueState(&EVI), &EVI, EltVal); + mergeInValue(ValueState[&EVI], &EVI, EltVal); } else { // Otherwise, must be extracting from an array. return (void)markOverdefined(&EVI); @@ -1616,14 +1631,18 @@ void SCCPInstVisitor::visitSelectInst(SelectInst &I) { if (ValueState[&I].isOverdefined()) return (void)markOverdefined(&I); - ValueLatticeElement CondValue = getValueState(I.getCondition()); + const ValueLatticeElement &CondValue = getValueState(I.getCondition()); if (CondValue.isUnknownOrUndef()) return; if (ConstantInt *CondCB = getConstantInt(CondValue, I.getCondition()->getType())) { Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue(); - mergeInValue(&I, getValueState(OpVal)); + const ValueLatticeElement &OpValState = getValueState(OpVal); + // Safety: ValueState[&I] doesn't invalidate OpValState since it is already + // in the map. 
+ assert(ValueState.contains(&I) && "&I is not in ValueState map."); + mergeInValue(ValueState[&I], &I, OpValState); return; } @@ -1721,7 +1740,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { // being a special floating value. ValueLatticeElement NewV; NewV.markConstant(C, /*MayIncludeUndef=*/true); - return (void)mergeInValue(&I, NewV); + return (void)mergeInValue(ValueState[&I], &I, NewV); } } @@ -1741,7 +1760,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { R = A.overflowingBinaryOp(BO->getOpcode(), B, OBO->getNoWrapKind()); else R = A.binaryOp(BO->getOpcode(), B); - mergeInValue(&I, ValueLatticeElement::getRange(R)); + mergeInValue(ValueState[&I], &I, ValueLatticeElement::getRange(R)); // TODO: Currently we do not exploit special values that produce something // better than overdefined with an overdefined operand for vector or floating @@ -1767,7 +1786,7 @@ void SCCPInstVisitor::visitCmpInst(CmpInst &I) { if (C) { ValueLatticeElement CV; CV.markConstant(C); - mergeInValue(&I, CV); + mergeInValue(ValueState[&I], &I, CV); return; } @@ -1802,7 +1821,7 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { Operands.reserve(I.getNumOperands()); for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { - ValueLatticeElement State = getValueState(I.getOperand(i)); + const ValueLatticeElement &State = getValueState(I.getOperand(i)); if (State.isUnknownOrUndef()) return; // Operands are not resolved yet. @@ -1881,14 +1900,13 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) { if (ValueState[&I].isOverdefined()) return (void)markOverdefined(&I); - ValueLatticeElement PtrVal = getValueState(I.getOperand(0)); + const ValueLatticeElement &PtrVal = getValueState(I.getOperand(0)); if (PtrVal.isUnknownOrUndef()) return; // The pointer is not resolved yet! - ValueLatticeElement &IV = ValueState[&I]; - if (SCCPSolver::isConstant(PtrVal)) { Constant *Ptr = getConstant(PtrVal, I.getOperand(0)->getType()); + ValueLatticeElement &IV = ValueState[&I]; // load null is undefined. if (isa<ConstantPointerNull>(Ptr)) { @@ -1916,7 +1934,7 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) { } // Fall back to metadata. - mergeInValue(&I, getValueFromMetadata(&I)); + mergeInValue(ValueState[&I], &I, getValueFromMetadata(&I)); } void SCCPInstVisitor::visitCallBase(CallBase &CB) { @@ -1944,7 +1962,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) { return markOverdefined(&CB); // Can't handle struct args. if (A.get()->getType()->isMetadataTy()) continue; // Carried in CB, not allowed in Operands. - ValueLatticeElement State = getValueState(A); + const ValueLatticeElement &State = getValueState(A); if (State.isUnknownOrUndef()) return; // Operands are not resolved yet. @@ -1964,7 +1982,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) { } // Fall back to metadata. 
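// The surrounding hunks all follow one mechanical pattern: with the
// two-argument convenience overload of mergeInValue removed, callers spell
// out the map lookup themselves. Sketch of the pattern (not a new API):
//
//   mergeInValue(&X, MergeWithV);                  // old convenience form
//   mergeInValue(ValueState[&X], &X, MergeWithV);  // explicit form used now
//
// This makes the possible insertion of &X into ValueState visible at each
// call site, as in the metadata fallback rewritten just below.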
- mergeInValue(&CB, getValueFromMetadata(&CB)); + mergeInValue(ValueState[&CB], &CB, getValueFromMetadata(&CB)); } void SCCPInstVisitor::handleCallArguments(CallBase &CB) { @@ -1992,10 +2010,11 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) { mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg, getMaxWidenStepsOpts()); } - } else - mergeInValue(&*AI, - getValueState(*CAI).intersect(getArgAttributeVL(&*AI)), - getMaxWidenStepsOpts()); + } else { + ValueLatticeElement CallArg = + getValueState(*CAI).intersect(getArgAttributeVL(&*AI)); + mergeInValue(ValueState[&*AI], &*AI, CallArg, getMaxWidenStepsOpts()); + } } } } @@ -2076,7 +2095,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { if (II->getIntrinsicID() == Intrinsic::vscale) { unsigned BitWidth = CB.getType()->getScalarSizeInBits(); const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth); - return (void)mergeInValue(II, ValueLatticeElement::getRange(Result)); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); } if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { @@ -2094,7 +2114,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { ConstantRange Result = ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges); - return (void)mergeInValue(II, ValueLatticeElement::getRange(Result)); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); } } @@ -2121,10 +2142,25 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return handleCallOverdefined(CB); // Not tracking this callee. // If so, propagate the return value of the callee into this call result. - mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts()); + mergeInValue(ValueState[&CB], &CB, TFRVI->second, getMaxWidenStepsOpts()); } } +bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) { + // For struct types, we handle each member separately. + // A struct value is considered overdefined only when all of + // its members are overdefined. + if (StructType *STy = dyn_cast<StructType>(Inst.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) { + if (!getStructValueState(&Inst, i).isOverdefined()) + return false; + } + return true; + } + + return getValueState(&Inst).isOverdefined(); +} + void SCCPInstVisitor::solve() { // Process the work lists until they are empty! while (!BBWorkList.empty() || !InstWorkList.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 280eb20..febdc54 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7192,7 +7192,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. 
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF); - VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan); + VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks, + BestVPlan); VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan); VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF); bool HasBranchWeights = diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b62c8f1..048a3e6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2242,8 +2242,49 @@ public: /// may not be necessary. bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const; bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, - Align Alignment, const int64_t Diff, Value *Ptr0, - Value *PtrN, StridedPtrInfo &SPtrInfo) const; + Align Alignment, const int64_t Diff, + const size_t Sz) const; + + /// Return true if an array of scalar loads can be replaced with a strided + /// load (with constant stride). + /// + /// TODO: + /// It is possible that the load gets "widened". Suppose that originally each + /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is + /// constant): + /// %b + 0 * %s + 0 + /// %b + 0 * %s + 1 + /// %b + 0 * %s + 2 + /// ... + /// %b + 0 * %s + (w - 1) + /// + /// %b + 1 * %s + 0 + /// %b + 1 * %s + 1 + /// %b + 1 * %s + 2 + /// ... + /// %b + 1 * %s + (w - 1) + /// ... + /// + /// %b + (n - 1) * %s + 0 + /// %b + (n - 1) * %s + 1 + /// %b + (n - 1) * %s + 2 + /// ... + /// %b + (n - 1) * %s + (w - 1) + /// + /// In this case we will generate a strided load of type `<n x (k * w)>`. + /// + /// \param PointerOps list of pointer arguments of loads. + /// \param ElemTy original scalar type of loads. + /// \param Alignment alignment of the first load. + /// \param SortedIndices is the order of PointerOps as returned by + /// `sortPtrAccesses`. + /// \param Diff Pointer difference between the lowest and the highest pointer + /// in `PointerOps` as returned by `getPointersDiff`. + /// \param Ptr0 first pointer in `PointerOps`. + /// \param PtrN last pointer in `PointerOps`. + /// \param SPtrInfo If the function returns `true`, it also sets all the fields + /// of `SPtrInfo` necessary to generate the strided load later. + bool analyzeConstantStrideCandidate( + const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment, + const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, + Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const; /// Return true if an array of scalar loads can be replaced with a strided /// load (with run-time stride). @@ -5302,7 +5343,7 @@ private: unsigned &OpCnt = OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); EdgeInfo EI(TE, U.getOperandNo()); - if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) + if (!getScheduleCopyableData(EI, Op)) continue; // Found copyable operand - continue. ++OpCnt; @@ -6849,9 +6890,8 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, /// current graph (for masked gathers extra extractelement instructions /// might be required). 
bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, - Align Alignment, const int64_t Diff, Value *Ptr0, - Value *PtrN, StridedPtrInfo &SPtrInfo) const { - const size_t Sz = PointerOps.size(); + Align Alignment, const int64_t Diff, + const size_t Sz) const { if (Diff % (Sz - 1) != 0) return false; @@ -6875,27 +6915,40 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, return false; if (!TTI->isLegalStridedLoadStore(VecTy, Alignment)) return false; + return true; + } + return false; +} - // Iterate through all pointers and check if all distances are - // unique multiple of Dist. - SmallSet<int64_t, 4> Dists; - for (Value *Ptr : PointerOps) { - int64_t Dist = 0; - if (Ptr == PtrN) - Dist = Diff; - else if (Ptr != Ptr0) - Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); - // If the strides are not the same or repeated, we can't - // vectorize. - if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) - break; - } - if (Dists.size() == Sz) { - Type *StrideTy = DL->getIndexType(Ptr0->getType()); - SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); - SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); - return true; - } +bool BoUpSLP::analyzeConstantStrideCandidate( + const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment, + const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, + Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const { + const size_t Sz = PointerOps.size(); + if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz)) + return false; + + int64_t Stride = Diff / static_cast<int64_t>(Sz - 1); + + // Iterate through all pointers and check if all distances are + // unique multiple of Dist. + SmallSet<int64_t, 4> Dists; + for (Value *Ptr : PointerOps) { + int64_t Dist = 0; + if (Ptr == PtrN) + Dist = Diff; + else if (Ptr != Ptr0) + Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); + // If the strides are not the same or repeated, we can't + // vectorize. + if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) + break; + } + if (Dists.size() == Sz) { + Type *StrideTy = DL->getIndexType(Ptr0->getType()); + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); + SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); + return true; } return false; } @@ -6995,8 +7048,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( Align Alignment = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]) ->getAlign(); - if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN, - SPtrInfo)) + if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order, + *Diff, Ptr0, PtrN, SPtrInfo)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0e0b042..fed04eb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -407,6 +407,10 @@ public: VPBasicBlock *getParent() { return Parent; } const VPBasicBlock *getParent() const { return Parent; } + /// \return the VPRegionBlock which the recipe belongs to. + VPRegionBlock *getRegion(); + const VPRegionBlock *getRegion() const; + /// The method which generates the output IR instructions that correspond to /// this VPRecipe, thereby "executing" the VPlan. virtual void execute(VPTransformState &State) = 0; @@ -1003,6 +1007,11 @@ public: /// Creates a fixed-width vector containing all operands. 
The number of /// operands matches the vector element count. BuildVector, + /// Extracts all lanes from its (non-scalable) vector operand. This is an + /// abstract VPInstruction whose single defined VPValue represents VF + /// scalars extracted from a vector, to be replaced by VF ExtractElement + /// VPInstructions. + Unpack, /// Compute the final result of a AnyOf reduction with select(cmp(),x,y), /// where one of (x,y) is loop invariant, and both x and y are integer type. ComputeAnyOfResult, @@ -2711,6 +2720,15 @@ public: return R && classof(R); } + static inline bool classof(const VPValue *VPV) { + const VPRecipeBase *R = VPV->getDefiningRecipe(); + return R && classof(R); + } + + static inline bool classof(const VPSingleDefRecipe *R) { + return classof(static_cast<const VPRecipeBase *>(R)); + } + /// Generate the reduction in the loop. void execute(VPTransformState &State) override; @@ -3096,6 +3114,9 @@ public: /// Returns true if this expression contains recipes that may have side /// effects. bool mayHaveSideEffects() const; + + /// Returns true if the result of this VPExpressionRecipe is a single-scalar. + bool isSingleScalar() const; }; /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when @@ -4075,6 +4096,14 @@ public: } }; +inline VPRegionBlock *VPRecipeBase::getRegion() { + return getParent()->getParent(); +} + +inline const VPRegionBlock *VPRecipeBase::getRegion() const { + return getParent()->getParent(); +} + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index f413c63..80a2e4b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -110,6 +110,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::AnyOf: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: + case VPInstruction::Unpack: return SetResultTyFromOp(); case VPInstruction::ExtractLane: return inferScalarType(R->getOperand(1)); @@ -377,7 +378,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, #ifndef NDEBUG auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); + VPRegionBlock *Region = R->getRegion(); if (Region && Region->isReplicator()) { assert(Region->getNumSuccessors() == 1 && Region->getNumPredecessors() == 1 && "Expected SESE region!"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index d8203e2..b5b98c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -388,6 +388,12 @@ m_ExtractLastElement(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0); } +template <typename Op0_t, typename Op1_t> +inline VPInstruction_match<Instruction::ExtractElement, Op0_t, Op1_t> +m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) { + return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1); +} + template <typename Op0_t> inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t> m_ExtractLastLanePerPart(const Op0_t &Op0) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp 
b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7a98c75..a865b2d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -515,6 +515,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: case VPInstruction::Not: + case VPInstruction::Unpack: return 1; case Instruction::ICmp: case Instruction::FCmp: @@ -1246,6 +1247,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::StepVector: case VPInstruction::ReductionStartVector: case VPInstruction::VScale: + case VPInstruction::Unpack: return false; default: return true; @@ -1290,7 +1292,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::PtrAdd: return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); case VPInstruction::WidePtrAdd: - return Op == getOperand(0); + // WidePtrAdd supports scalar and vector base addresses. + return false; case VPInstruction::ComputeAnyOfResult: case VPInstruction::ComputeFindIVResult: return Op == getOperand(1); @@ -1417,6 +1420,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ResumeForEpilogue: O << "resume-for-epilogue"; break; + case VPInstruction::Unpack: + O << "unpack"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -2352,7 +2358,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { return false; auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *CanIV = getParent()->getParent()->getCanonicalIV(); + auto *CanIV = getRegion()->getCanonicalIV(); return StartC && StartC->isZero() && StepC && StepC->isOne() && getScalarType() == CanIV->getScalarType(); } @@ -2888,6 +2894,13 @@ bool VPExpressionRecipe::mayHaveSideEffects() const { return false; } +bool VPExpressionRecipe::isSingleScalar() const { + // Cannot use vputils::isSingleScalar(), because all external operands + // of the expression will be live-ins while bundled. + return isa<VPReductionRecipe>(ExpressionRecipes.back()) && + !isa<VPPartialReductionRecipe>(ExpressionRecipes.back()); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, @@ -3076,7 +3089,7 @@ static void scalarizeInstruction(const Instruction *Instr, State.AC->registerAssumption(II); assert( - (RepRecipe->getParent()->getParent() || + (RepRecipe->getRegion() || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || all_of(RepRecipe->operands(), [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && @@ -3268,7 +3281,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, to_vector(operands()), VF); // If the recipe is not predicated (i.e. not in a replicate region), return // the scalar cost. Otherwise handle predicated cost. - if (!getParent()->getParent()->isReplicator()) + if (!getRegion()->isReplicator()) return ScalarCost; // Account for the phi nodes that we will create. @@ -3284,7 +3297,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::Store: { // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. 
@@ -3284,7 +3297,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   case Instruction::Store: {
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    const VPRegionBlock *ParentRegion = getRegion();
     if (ParentRegion && ParentRegion->isReplicator())
       break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee8..fa1fdaf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1224,6 +1224,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  uint64_t Idx;
+  if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
+    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+    Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
+    return;
+  }
+
   if (auto *Phi = dyn_cast<VPPhi>(Def)) {
     if (Phi->getNumOperands() == 1)
       Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -1858,8 +1865,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
       return nullptr;
     VPRegionBlock *EnclosingLoopRegion =
         HoistCandidate->getParent()->getEnclosingLoopRegion();
-    assert((!HoistCandidate->getParent()->getParent() ||
-            HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+    assert((!HoistCandidate->getRegion() ||
+            HoistCandidate->getRegion() == EnclosingLoopRegion) &&
            "CFG in VPlan should still be flat, without replicate regions");
     // Hoist candidate was already visited, no need to hoist.
     if (!Visited.insert(HoistCandidate).second)
@@ -2898,7 +2905,7 @@ void VPlanTransforms::replaceSymbolicStrides(
   // evolution.
   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
     auto *R = cast<VPRecipeBase>(&U);
-    return R->getParent()->getParent() ||
+    return R->getRegion() ||
            R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
   };
   ValueToSCEVMapTy RewriteMap;
@@ -3780,7 +3787,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
   BTC->replaceAllUsesWith(TCMO);
 }
 
-void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
 
@@ -3803,8 +3810,7 @@
       continue;
     auto *DefR = cast<VPRecipeWithIRFlags>(&R);
     auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
-      VPRegionBlock *ParentRegion =
-          cast<VPRecipeBase>(U)->getParent()->getParent();
+      VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
       return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
     };
     if ((isa<VPReplicateRecipe>(DefR) &&
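The fold added to simplifyRecipe above cancels a pack/unpack round-trip: extracting a constant lane from a just-built vector simply forwards the scalar that went in. Sketched on a hypothetical plan fragment (names invented, VF = 4):

  // Before simplification:
  //   EMIT vp<%v> = buildvector vp<%s0>, vp<%s1>, vp<%s2>, vp<%s3>
  //   EMIT vp<%e> = extractelement vp<%v>, ir<2>
  // After: every user of vp<%e> is rewired to vp<%s2>, leaving the
  // extractelement (and possibly the buildvector) dead for later cleanup.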
@@ -3829,6 +3835,50 @@
         });
     }
   }
+
+  // Create explicit VPInstructions to convert vectors to scalars. The current
+  // implementation is conservative - it may miss some cases that may or may
+  // not be vector values. TODO: introduce Unpacks speculatively - remove them
+  // later if they are known to operate on scalar values.
+  for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe,
+              VPDerivedIVRecipe, VPCanonicalIVPHIRecipe>(&R))
+        continue;
+      for (VPValue *Def : R.definedValues()) {
+        // Skip recipes that are single-scalar or only have their first lane
+        // used.
+        // TODO: The Defs skipped here may or may not be vector values.
+        // Introduce Unpacks, and remove them later, if they are guaranteed to
+        // produce scalar values.
+        if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
+          continue;
+
+        // At the moment, we create unpacks only for scalar users outside
+        // replicate regions. Recipes inside replicate regions still extract
+        // the required lanes implicitly.
+        // TODO: Remove once replicate regions are unrolled completely.
+        auto IsCandidateUnpackUser = [Def](VPUser *U) {
+          VPRegionBlock *ParentRegion =
+              cast<VPRecipeBase>(U)->getParent()->getParent();
+          return U->usesScalars(Def) &&
+                 (!ParentRegion || !ParentRegion->isReplicator());
+        };
+        if (none_of(Def->users(), IsCandidateUnpackUser))
+          continue;
+
+        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
+        if (R.isPhi())
+          Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
+        else
+          Unpack->insertAfter(&R);
+        Def->replaceUsesWithIf(Unpack,
+                               [&IsCandidateUnpackUser](VPUser &U, unsigned) {
+                                 return IsCandidateUnpackUser(&U);
+                               });
+      }
+    }
+  }
 }
 
 void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a8a2bb..b28559b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -325,9 +325,10 @@ struct VPlanTransforms {
   static void materializeBackedgeTakenCount(VPlan &Plan,
                                             VPBasicBlock *VectorPH);
 
-  /// Add explicit Build[Struct]Vector recipes that combine multiple scalar
-  /// values into single vectors.
-  static void materializeBuildVectors(VPlan &Plan);
+  /// Add explicit Build[Struct]Vector recipes to Pack multiple scalar values
+  /// into vectors and Unpack recipes to extract scalars from vectors as
+  /// needed.
+  static void materializePacksAndUnpacks(VPlan &Plan);
 
   /// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
   static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5aeda3e..cfd1a74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -465,10 +465,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
 /// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
 /// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
 /// definitions for operands of \DefR.
-static VPRecipeWithIRFlags *
+static VPValue *
 cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
              VPRecipeWithIRFlags *DefR, VPLane Lane,
              const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
+  VPValue *Op;
+  if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
+    auto LaneDefs = Def2LaneDefs.find(Op);
+    if (LaneDefs != Def2LaneDefs.end())
+      return LaneDefs->second[Lane.getKnownLane()];
+
+    VPValue *Idx =
+        Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+    return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
+  }
+
   // Collect the operands at Lane, creating extracts as needed.
   SmallVector<VPValue *> NewOps;
   for (VPValue *Op : DefR->operands()) {
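With the fast path above, unrolling by VF turns an abstract unpack into per-lane values: if Def2LaneDefs already records scalar definitions for the source, those are reused; otherwise a constant-indexed ExtractElement is emitted for the requested lane. The net effect, sketched for VF = 2 on a hypothetical plan fragment:

  // EMIT vp<%u> = unpack vp<%vec>
  // becomes, per lane:
  //   EMIT vp<%u0> = extractelement vp<%vec>, ir<0>
  //   EMIT vp<%u1> = extractelement vp<%vec>, ir<1>
  // and each lane's users are redirected to the matching scalar.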
@@ -480,6 +491,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
       continue;
     }
     if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+      // Look through mandatory Unpack.
+      [[maybe_unused]] bool Matched =
+          match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
+      assert(Matched && "original op must have been Unpack");
       NewOps.push_back(
           Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
       continue;
@@ -547,7 +562,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
         (isa<VPReplicateRecipe>(&R) &&
          cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
         (isa<VPInstruction>(&R) &&
-         !cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
+         !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
+         cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
       continue;
 
     auto *DefR = cast<VPRecipeWithIRFlags>(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index cf95ac0..840a5b9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
     return true;
 
   if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
-    const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+    const VPRegionBlock *RegionOfR = Rep->getRegion();
     // Don't consider recipes in replicate regions as uniform yet; their first
     // lane cannot be accessed when executing the replicate region for other
     // lanes.
@@ -84,6 +84,12 @@ inline bool isSingleScalar(const VPValue *VPV) {
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
            (PreservesUniformity(VPI->getOpcode()) &&
             all_of(VPI->operands(), isSingleScalar));
+  if (isa<VPPartialReductionRecipe>(VPV))
+    return false;
+  if (isa<VPReductionRecipe>(VPV))
+    return true;
+  if (auto *Expr = dyn_cast<VPExpressionRecipe>(VPV))
+    return Expr->isSingleScalar();
   // VPExpandSCEVRecipes must be placed in the entry and are alway uniform.
   return isa<VPExpandSCEVRecipe>(VPV);
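One subtlety in the new isSingleScalar cases above: VPPartialReductionRecipe derives from VPReductionRecipe, so the more specific partial-reduction check has to come first. A partial reduction still defines a vector accumulator, while a full reduction collapses its input to a single scalar. A usage sketch under that reading (Red and PRed standing for values defined by the respective recipes):

  assert(vputils::isSingleScalar(Red));   // full reduction: one scalar result
  assert(!vputils::isSingleScalar(PRed)); // partial reduction: still a vector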