Diffstat (limited to 'llvm/lib')
169 files changed, 3629 insertions, 1665 deletions
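Note: the ScalarEvolution.cpp hunks in this diff replace hand-rolled dyn_cast/getOperand chains with the declarative matchers from llvm/Analysis/ScalarEvolutionPatternMatch.h. As a rough sketch of that style -- the helper below is hypothetical and not part of the patch, though the matchers are the ones the patch itself uses:

    // Match S == C * trunc(X) with C a power of two, binding C and the
    // truncate's operand on success.
    #include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
    using namespace llvm;
    using namespace llvm::SCEVPatternMatch;

    static bool isPow2MulOfTrunc(const SCEV *S) {
      const APInt *C;       // set on a successful match
      const SCEV *TruncOp;  // operand of the matched truncate
      return match(S, m_scev_Mul(m_scev_APInt(C),
                                 m_scev_Trunc(m_SCEV(TruncOp)))) &&
             C->isPowerOf2();
    }

On a successful match the bound pointers are valid, which is what lets the rewritten code drop the explicit getNumOperands()/getOperand() and dyn_cast checks.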
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 45c889c..a5ba197 100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2177,16 +2177,13 @@ Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
     return PoisonValue::get(VT->getElementType());
 
   // TODO: Handle undef.
-  if (!isa<ConstantVector>(Op) && !isa<ConstantDataVector>(Op))
-    return nullptr;
-
-  auto *EltC = dyn_cast<ConstantInt>(Op->getAggregateElement(0U));
+  auto *EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(0U));
   if (!EltC)
     return nullptr;
 
   APInt Acc = EltC->getValue();
   for (unsigned I = 1, E = VT->getNumElements(); I != E; I++) {
-    if (!(EltC = dyn_cast<ConstantInt>(Op->getAggregateElement(I))))
+    if (!(EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(I))))
       return nullptr;
     const APInt &X = EltC->getValue();
     switch (IID) {
@@ -3059,35 +3056,25 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       Val = Val | Val << 1;
       return ConstantInt::get(Ty, Val);
     }
-
-    default:
-      return nullptr;
     }
   }
 
-  switch (IntrinsicID) {
-  default: break;
-  case Intrinsic::vector_reduce_add:
-  case Intrinsic::vector_reduce_mul:
-  case Intrinsic::vector_reduce_and:
-  case Intrinsic::vector_reduce_or:
-  case Intrinsic::vector_reduce_xor:
-  case Intrinsic::vector_reduce_smin:
-  case Intrinsic::vector_reduce_smax:
-  case Intrinsic::vector_reduce_umin:
-  case Intrinsic::vector_reduce_umax:
-    if (Constant *C = constantFoldVectorReduce(IntrinsicID, Operands[0]))
-      return C;
-    break;
-  }
-
-  // Support ConstantVector in case we have an Undef in the top.
-  if (isa<ConstantVector>(Operands[0]) ||
-      isa<ConstantDataVector>(Operands[0]) ||
-      isa<ConstantAggregateZero>(Operands[0])) {
+  if (Operands[0]->getType()->isVectorTy()) {
     auto *Op = cast<Constant>(Operands[0]);
     switch (IntrinsicID) {
     default: break;
+    case Intrinsic::vector_reduce_add:
+    case Intrinsic::vector_reduce_mul:
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_or:
+    case Intrinsic::vector_reduce_xor:
+    case Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_smax:
+    case Intrinsic::vector_reduce_umin:
+    case Intrinsic::vector_reduce_umax:
+      if (Constant *C = constantFoldVectorReduce(IntrinsicID, Operands[0]))
+        return C;
+      break;
     case Intrinsic::x86_sse_cvtss2si:
     case Intrinsic::x86_sse_cvtss2si64:
     case Intrinsic::x86_sse2_cvtsd2si:
@@ -3116,10 +3103,15 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     case Intrinsic::wasm_alltrue:
       // Check each element individually
       unsigned E = cast<FixedVectorType>(Op->getType())->getNumElements();
-      for (unsigned I = 0; I != E; ++I)
-        if (Constant *Elt = Op->getAggregateElement(I))
-          if (Elt->isZeroValue())
-            return ConstantInt::get(Ty, 0);
+      for (unsigned I = 0; I != E; ++I) {
+        Constant *Elt = Op->getAggregateElement(I);
+        // Return false as soon as we find a non-true element.
+        if (Elt && Elt->isZeroValue())
+          return ConstantInt::get(Ty, 0);
+        // Bail as soon as we find an element we cannot prove to be true.
+        if (!Elt || !isa<ConstantInt>(Elt))
+          return nullptr;
+      }
       return ConstantInt::get(Ty, 1);
     }
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index b78cc03e..f9bf092 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -281,6 +281,38 @@ static StructType *getOrCreateElementStruct(Type *ElemType, StringRef Name) {
   return StructType::create(ElemType, Name);
 }
 
+static Type *getTypeWithoutPadding(Type *Ty) {
+  // Recursively remove padding from structures.
+  if (auto *ST = dyn_cast<StructType>(Ty)) {
+    LLVMContext &Ctx = Ty->getContext();
+    SmallVector<Type *> ElementTypes;
+    ElementTypes.reserve(ST->getNumElements());
+    for (Type *ElTy : ST->elements()) {
+      if (isa<PaddingExtType>(ElTy))
+        continue;
+      ElementTypes.push_back(getTypeWithoutPadding(ElTy));
+    }
+
+    // Handle explicitly padded cbuffer arrays like { [ n x paddedty ], ty }
+    if (ElementTypes.size() == 2)
+      if (auto *AT = dyn_cast<ArrayType>(ElementTypes[0]))
+        if (ElementTypes[1] == AT->getElementType())
+          return ArrayType::get(ElementTypes[1], AT->getNumElements() + 1);
+
+    // If we only have a single element, don't wrap it in a struct.
+    if (ElementTypes.size() == 1)
+      return ElementTypes[0];
+
+    return StructType::get(Ctx, ElementTypes, /*IsPacked=*/false);
+  }
+  // Arrays just need to have their element type adjusted.
+  if (auto *AT = dyn_cast<ArrayType>(Ty))
+    return ArrayType::get(getTypeWithoutPadding(AT->getElementType()),
+                          AT->getNumElements());
+  // Anything else should be good as is.
+  return Ty;
+}
+
 StructType *ResourceTypeInfo::createElementStruct(StringRef CBufferName) {
   SmallString<64> TypeName;
 
@@ -334,14 +366,21 @@ StructType *ResourceTypeInfo::createElementStruct(StringRef CBufferName) {
   }
   case ResourceKind::CBuffer: {
     auto *RTy = cast<CBufferExtType>(HandleTy);
-    LayoutExtType *LayoutType = cast<LayoutExtType>(RTy->getResourceType());
-    StructType *Ty = cast<StructType>(LayoutType->getWrappedType());
     SmallString<64> Name = getResourceKindName(Kind);
     if (!CBufferName.empty()) {
       Name.append(".");
       Name.append(CBufferName);
     }
-    return StructType::create(Ty->elements(), Name);
+
+    // TODO: Remove this when we update the frontend to use explicit padding.
+    if (LayoutExtType *LayoutType =
+            dyn_cast<LayoutExtType>(RTy->getResourceType())) {
+      StructType *Ty = cast<StructType>(LayoutType->getWrappedType());
+      return StructType::create(Ty->elements(), Name);
+    }
+
+    return getOrCreateElementStruct(
+        getTypeWithoutPadding(RTy->getResourceType()), Name);
   }
   case ResourceKind::Sampler: {
     auto *RTy = cast<SamplerExtType>(HandleTy);
@@ -454,10 +493,10 @@ uint32_t ResourceTypeInfo::getCBufferSize(const DataLayout &DL) const {
 
   Type *ElTy = cast<CBufferExtType>(HandleTy)->getResourceType();
 
+  // TODO: Remove this when we update the frontend to use explicit padding.
   if (auto *LayoutTy = dyn_cast<LayoutExtType>(ElTy))
     return LayoutTy->getSize();
 
-  // TODO: What should we do with unannotated arrays?
return DL.getTypeAllocSize(ElTy); } diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 0e5bc48..df75999 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -947,9 +947,8 @@ LazyValueInfoImpl::solveBlockValueSelect(SelectInst *SI, BasicBlock *BB) { /*UseBlockValue*/ false)); } - ValueLatticeElement Result = TrueVal; - Result.mergeIn(FalseVal); - return Result; + TrueVal.mergeIn(FalseVal); + return TrueVal; } std::optional<ConstantRange> @@ -1778,9 +1777,8 @@ ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB, assert(OptResult && "Value not available after solving"); } - ValueLatticeElement Result = *OptResult; - LLVM_DEBUG(dbgs() << " Result = " << Result << "\n"); - return Result; + LLVM_DEBUG(dbgs() << " Result = " << *OptResult << "\n"); + return *OptResult; } ValueLatticeElement LazyValueInfoImpl::getValueAt(Value *V, Instruction *CxtI) { diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index a8c3173..d84721b 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -986,8 +986,8 @@ PreservedAnalyses LoopPrinterPass::run(Function &F, return PreservedAnalyses::all(); } -void llvm::printLoop(Loop &L, raw_ostream &OS, const std::string &Banner) { - +void llvm::printLoop(const Loop &L, raw_ostream &OS, + const std::string &Banner) { if (forcePrintModuleIR()) { // handling -print-module-scope OS << Banner << " (loop: "; diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index f90717d..1d1a5560 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -61,6 +61,9 @@ static cl::opt<SkipMLPolicyCriteria> SkipPolicy( static cl::opt<std::string> ModelSelector("ml-inliner-model-selector", cl::Hidden, cl::init("")); +static cl::opt<bool> StopImmediatelyForTest("ml-inliner-stop-immediately", + cl::Hidden); + #if defined(LLVM_HAVE_TF_AOT_INLINERSIZEMODEL) // codegen-ed file #include "InlinerSizeModel.h" // NOLINT @@ -214,6 +217,7 @@ MLInlineAdvisor::MLInlineAdvisor( return; } ModelRunner->switchContext(""); + ForceStop = StopImmediatelyForTest; } unsigned MLInlineAdvisor::getInitialFunctionLevel(const Function &F) const { @@ -379,9 +383,17 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) { auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(Caller); if (SkipPolicy == SkipMLPolicyCriteria::IfCallerIsNotCold) { - if (!PSI.isFunctionEntryCold(&Caller)) - return std::make_unique<InlineAdvice>(this, CB, ORE, - GetDefaultAdvice(CB)); + if (!PSI.isFunctionEntryCold(&Caller)) { + // Return a MLInlineAdvice, despite delegating to the default advice, + // because we need to keep track of the internal state. This is different + // from the other instances where we return a "default" InlineAdvice, + // which happen at points we won't come back to the MLAdvisor for + // decisions requiring that state. + return ForceStop ? 
std::make_unique<InlineAdvice>(this, CB, ORE, + GetDefaultAdvice(CB)) + : std::make_unique<MLInlineAdvice>(this, CB, ORE, + GetDefaultAdvice(CB)); + } } auto MandatoryKind = InlineAdvisor::getMandatoryKind(CB, FAM, ORE); // If this is a "never inline" case, there won't be any changes to internal diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index a64b93d..425420f 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -1840,19 +1840,19 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, // = zext((2^K * (trunc X to i{N-K}))<nuw>) to iM // = (2^K * (zext(trunc X to i{N-K}) to iM))<nuw>. // - if (SM->getNumOperands() == 2) - if (auto *MulLHS = dyn_cast<SCEVConstant>(SM->getOperand(0))) - if (MulLHS->getAPInt().isPowerOf2()) - if (auto *TruncRHS = dyn_cast<SCEVTruncateExpr>(SM->getOperand(1))) { - int NewTruncBits = getTypeSizeInBits(TruncRHS->getType()) - - MulLHS->getAPInt().logBase2(); - Type *NewTruncTy = IntegerType::get(getContext(), NewTruncBits); - return getMulExpr( - getZeroExtendExpr(MulLHS, Ty), - getZeroExtendExpr( - getTruncateExpr(TruncRHS->getOperand(), NewTruncTy), Ty), - SCEV::FlagNUW, Depth + 1); - } + const APInt *C; + const SCEV *TruncRHS; + if (match(SM, + m_scev_Mul(m_scev_APInt(C), m_scev_Trunc(m_SCEV(TruncRHS)))) && + C->isPowerOf2()) { + int NewTruncBits = + getTypeSizeInBits(SM->getOperand(1)->getType()) - C->logBase2(); + Type *NewTruncTy = IntegerType::get(getContext(), NewTruncBits); + return getMulExpr( + getZeroExtendExpr(SM->getOperand(0), Ty), + getZeroExtendExpr(getTruncateExpr(TruncRHS, NewTruncTy), Ty), + SCEV::FlagNUW, Depth + 1); + } } // zext(umin(x, y)) -> umin(zext(x), zext(y)) @@ -3144,20 +3144,19 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) { if (Ops.size() == 2) { // C1*(C2+V) -> C1*C2 + C1*V - if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1])) - // If any of Add's ops are Adds or Muls with a constant, apply this - // transformation as well. - // - // TODO: There are some cases where this transformation is not - // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of - // this transformation should be narrowed down. - if (Add->getNumOperands() == 2 && containsConstantInAddMulChain(Add)) { - const SCEV *LHS = getMulExpr(LHSC, Add->getOperand(0), - SCEV::FlagAnyWrap, Depth + 1); - const SCEV *RHS = getMulExpr(LHSC, Add->getOperand(1), - SCEV::FlagAnyWrap, Depth + 1); - return getAddExpr(LHS, RHS, SCEV::FlagAnyWrap, Depth + 1); - } + // If any of Add's ops are Adds or Muls with a constant, apply this + // transformation as well. + // + // TODO: There are some cases where this transformation is not + // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of + // this transformation should be narrowed down. + const SCEV *Op0, *Op1; + if (match(Ops[1], m_scev_Add(m_SCEV(Op0), m_SCEV(Op1))) && + containsConstantInAddMulChain(Ops[1])) { + const SCEV *LHS = getMulExpr(LHSC, Op0, SCEV::FlagAnyWrap, Depth + 1); + const SCEV *RHS = getMulExpr(LHSC, Op1, SCEV::FlagAnyWrap, Depth + 1); + return getAddExpr(LHS, RHS, SCEV::FlagAnyWrap, Depth + 1); + } if (Ops[0]->isAllOnesValue()) { // If we have a mul by -1 of an add, try distributing the -1 among the @@ -3578,20 +3577,12 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, } // ((-C + (C smax %x)) /u %x) evaluates to zero, for any positive constant C. 
- if (const auto *AE = dyn_cast<SCEVAddExpr>(LHS); - AE && AE->getNumOperands() == 2) { - if (const auto *VC = dyn_cast<SCEVConstant>(AE->getOperand(0))) { - const APInt &NegC = VC->getAPInt(); - if (NegC.isNegative() && !NegC.isMinSignedValue()) { - const auto *MME = dyn_cast<SCEVSMaxExpr>(AE->getOperand(1)); - if (MME && MME->getNumOperands() == 2 && - isa<SCEVConstant>(MME->getOperand(0)) && - cast<SCEVConstant>(MME->getOperand(0))->getAPInt() == -NegC && - MME->getOperand(1) == RHS) - return getZero(LHS->getType()); - } - } - } + const APInt *NegC, *C; + if (match(LHS, + m_scev_Add(m_scev_APInt(NegC), + m_scev_SMax(m_scev_APInt(C), m_scev_Specific(RHS)))) && + NegC->isNegative() && !NegC->isMinSignedValue() && *C == -*NegC) + return getZero(LHS->getType()); // TODO: Generalize to handle any common factors. // udiv (mul nuw a, vscale), (mul nuw b, vscale) --> udiv a, b @@ -4623,17 +4614,11 @@ const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V, /// If Expr computes ~A, return A else return nullptr static const SCEV *MatchNotExpr(const SCEV *Expr) { - const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr); - if (!Add || Add->getNumOperands() != 2 || - !Add->getOperand(0)->isAllOnesValue()) - return nullptr; - - const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1)); - if (!AddRHS || AddRHS->getNumOperands() != 2 || - !AddRHS->getOperand(0)->isAllOnesValue()) - return nullptr; - - return AddRHS->getOperand(1); + const SCEV *MulOp; + if (match(Expr, m_scev_Add(m_scev_AllOnes(), + m_scev_Mul(m_scev_AllOnes(), m_SCEV(MulOp))))) + return MulOp; + return nullptr; } /// Return a SCEV corresponding to ~V = -1-V @@ -10797,19 +10782,15 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) { } static bool MatchBinarySub(const SCEV *S, const SCEV *&LHS, const SCEV *&RHS) { - const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S); - if (!Add || Add->getNumOperands() != 2) + const SCEV *Op0, *Op1; + if (!match(S, m_scev_Add(m_SCEV(Op0), m_SCEV(Op1)))) return false; - if (auto *ME = dyn_cast<SCEVMulExpr>(Add->getOperand(0)); - ME && ME->getNumOperands() == 2 && ME->getOperand(0)->isAllOnesValue()) { - LHS = Add->getOperand(1); - RHS = ME->getOperand(1); + if (match(Op0, m_scev_Mul(m_scev_AllOnes(), m_SCEV(RHS)))) { + LHS = Op1; return true; } - if (auto *ME = dyn_cast<SCEVMulExpr>(Add->getOperand(1)); - ME && ME->getNumOperands() == 2 && ME->getOperand(0)->isAllOnesValue()) { - LHS = Add->getOperand(0); - RHS = ME->getOperand(1); + if (match(Op1, m_scev_Mul(m_scev_AllOnes(), m_SCEV(RHS)))) { + LHS = Op0; return true; } return false; @@ -12172,13 +12153,10 @@ bool ScalarEvolution::isImpliedCondBalancedTypes( bool ScalarEvolution::splitBinaryAdd(const SCEV *Expr, const SCEV *&L, const SCEV *&R, SCEV::NoWrapFlags &Flags) { - const auto *AE = dyn_cast<SCEVAddExpr>(Expr); - if (!AE || AE->getNumOperands() != 2) + if (!match(Expr, m_scev_Add(m_SCEV(L), m_SCEV(R)))) return false; - L = AE->getOperand(0); - R = AE->getOperand(1); - Flags = AE->getNoWrapFlags(); + Flags = cast<SCEVAddExpr>(Expr)->getNoWrapFlags(); return true; } @@ -12220,12 +12198,11 @@ ScalarEvolution::computeConstantDifference(const SCEV *More, const SCEV *Less) { // Try to match a common constant multiply. 
auto MatchConstMul = [](const SCEV *S) -> std::optional<std::pair<const SCEV *, APInt>> { - auto *M = dyn_cast<SCEVMulExpr>(S); - if (!M || M->getNumOperands() != 2 || - !isa<SCEVConstant>(M->getOperand(0))) - return std::nullopt; - return { - {M->getOperand(1), cast<SCEVConstant>(M->getOperand(0))->getAPInt()}}; + const APInt *C; + const SCEV *Op; + if (match(S, m_scev_Mul(m_scev_APInt(C), m_SCEV(Op)))) + return {{Op, *C}}; + return std::nullopt; }; if (auto MatchedMore = MatchConstMul(More)) { if (auto MatchedLess = MatchConstMul(Less)) { @@ -15557,19 +15534,10 @@ void ScalarEvolution::LoopGuards::collectFromBlock( auto IsMinMaxSCEVWithNonNegativeConstant = [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, const SCEV *&RHS) { - if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) { - if (MinMax->getNumOperands() != 2) - return false; - if (auto *C = dyn_cast<SCEVConstant>(MinMax->getOperand(0))) { - if (C->getAPInt().isNegative()) - return false; - SCTy = MinMax->getSCEVType(); - LHS = MinMax->getOperand(0); - RHS = MinMax->getOperand(1); - return true; - } - } - return false; + const APInt *C; + SCTy = Expr->getSCEVType(); + return match(Expr, m_scev_MinMax(m_SCEV(LHS), m_SCEV(RHS))) && + match(LHS, m_scev_APInt(C)) && C->isNonNegative(); }; // Return a new SCEV that modifies \p Expr to the closest number divides by @@ -15772,19 +15740,26 @@ void ScalarEvolution::LoopGuards::collectFromBlock( GetNextSCEVDividesByDivisor(One, DividesBy); To = SE.getUMaxExpr(FromRewritten, OneAlignedUp); } else { + // LHS != RHS can be rewritten as (LHS - RHS) = UMax(1, LHS - RHS), + // but creating the subtraction eagerly is expensive. Track the + // inequalities in a separate map, and materialize the rewrite lazily + // when encountering a suitable subtraction while re-writing. if (LHS->getType()->isPointerTy()) { LHS = SE.getLosslessPtrToIntExpr(LHS); RHS = SE.getLosslessPtrToIntExpr(RHS); if (isa<SCEVCouldNotCompute>(LHS) || isa<SCEVCouldNotCompute>(RHS)) break; } - auto AddSubRewrite = [&](const SCEV *A, const SCEV *B) { - const SCEV *Sub = SE.getMinusSCEV(A, B); - AddRewrite(Sub, Sub, - SE.getUMaxExpr(Sub, SE.getOne(From->getType()))); - }; - AddSubRewrite(LHS, RHS); - AddSubRewrite(RHS, LHS); + const SCEVConstant *C; + const SCEV *A, *B; + if (match(RHS, m_scev_Add(m_SCEVConstant(C), m_SCEV(A))) && + match(LHS, m_scev_Add(m_scev_Specific(C), m_SCEV(B)))) { + RHS = A; + LHS = B; + } + if (LHS > RHS) + std::swap(LHS, RHS); + Guards.NotEqual.insert({LHS, RHS}); continue; } break; @@ -15918,13 +15893,15 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { class SCEVLoopGuardRewriter : public SCEVRewriteVisitor<SCEVLoopGuardRewriter> { const DenseMap<const SCEV *, const SCEV *> ⤅ + const SmallDenseSet<std::pair<const SCEV *, const SCEV *>> ≠ SCEV::NoWrapFlags FlagMask = SCEV::FlagAnyWrap; public: SCEVLoopGuardRewriter(ScalarEvolution &SE, const ScalarEvolution::LoopGuards &Guards) - : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap) { + : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap), + NotEqual(Guards.NotEqual) { if (Guards.PreserveNUW) FlagMask = ScalarEvolution::setFlags(FlagMask, SCEV::FlagNUW); if (Guards.PreserveNSW) @@ -15979,14 +15956,36 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { } const SCEV *visitAddExpr(const SCEVAddExpr *Expr) { + // Helper to check if S is a subtraction (A - B) where A != B, and if so, + // return UMax(S, 1). 
+ auto RewriteSubtraction = [&](const SCEV *S) -> const SCEV * { + const SCEV *LHS, *RHS; + if (MatchBinarySub(S, LHS, RHS)) { + if (LHS > RHS) + std::swap(LHS, RHS); + if (NotEqual.contains({LHS, RHS})) + return SE.getUMaxExpr(S, SE.getOne(S->getType())); + } + return nullptr; + }; + + // Check if Expr itself is a subtraction pattern with guard info. + if (const SCEV *Rewritten = RewriteSubtraction(Expr)) + return Rewritten; + // Trip count expressions sometimes consist of adding 3 operands, i.e. // (Const + A + B). There may be guard info for A + B, and if so, apply // it. // TODO: Could more generally apply guards to Add sub-expressions. if (isa<SCEVConstant>(Expr->getOperand(0)) && Expr->getNumOperands() == 3) { - if (const SCEV *S = Map.lookup( - SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2)))) + const SCEV *Add = + SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2)); + if (const SCEV *Rewritten = RewriteSubtraction(Add)) + return SE.getAddExpr( + Expr->getOperand(0), Rewritten, + ScalarEvolution::maskFlags(Expr->getNoWrapFlags(), FlagMask)); + if (const SCEV *S = Map.lookup(Add)) return SE.getAddExpr(Expr->getOperand(0), S); } SmallVector<const SCEV *, 2> Operands; @@ -16021,7 +16020,7 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { } }; - if (RewriteMap.empty()) + if (RewriteMap.empty() && NotEqual.empty()) return Expr; SCEVLoopGuardRewriter Rewriter(SE, *this); diff --git a/llvm/lib/Analysis/StaticDataProfileInfo.cpp b/llvm/lib/Analysis/StaticDataProfileInfo.cpp index e7f0b2c..61d4935 100644 --- a/llvm/lib/Analysis/StaticDataProfileInfo.cpp +++ b/llvm/lib/Analysis/StaticDataProfileInfo.cpp @@ -1,10 +1,14 @@ #include "llvm/Analysis/StaticDataProfileInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" +#define DEBUG_TYPE "static-data-profile-info" + using namespace llvm; namespace llvm { @@ -79,6 +83,17 @@ StaticDataProfileInfo::getConstantHotnessUsingProfileCount( return StaticDataHotness::LukewarmOrUnknown; } +StaticDataProfileInfo::StaticDataHotness +StaticDataProfileInfo::getSectionHotnessUsingDataAccessProfile( + std::optional<StringRef> MaybeSectionPrefix) const { + if (!MaybeSectionPrefix) + return StaticDataHotness::LukewarmOrUnknown; + StringRef Prefix = *MaybeSectionPrefix; + assert((Prefix == "hot" || Prefix == "unlikely") && + "Expect section_prefix to be one of hot or unlikely"); + return Prefix == "hot" ? StaticDataHotness::Hot : StaticDataHotness::Cold; +} + StringRef StaticDataProfileInfo::hotnessToStr(StaticDataHotness Hotness) const { switch (Hotness) { case StaticDataHotness::Cold: @@ -101,13 +116,66 @@ StaticDataProfileInfo::getConstantProfileCount(const Constant *C) const { StringRef StaticDataProfileInfo::getConstantSectionPrefix( const Constant *C, const ProfileSummaryInfo *PSI) const { std::optional<uint64_t> Count = getConstantProfileCount(C); + +#ifndef NDEBUG + auto DbgPrintPrefix = [](StringRef Prefix) { + return Prefix.empty() ? "<empty>" : Prefix; + }; +#endif + + if (EnableDataAccessProf) { + // Module flag `HasDataAccessProf` is 1 -> empty section prefix means + // unknown hotness except for string literals. 
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C); + GV && llvm::memprof::IsAnnotationOK(*GV) && + !GV->getName().starts_with(".str")) { + auto HotnessFromDataAccessProf = + getSectionHotnessUsingDataAccessProfile(GV->getSectionPrefix()); + + if (!Count) { + StringRef Prefix = hotnessToStr(HotnessFromDataAccessProf); + LLVM_DEBUG(dbgs() << GV->getName() << " has section prefix " + << DbgPrintPrefix(Prefix) + << ", solely from data access profiles\n"); + return Prefix; + } + + // Both data access profiles and PGO counters are available. Use the + // hotter one. + auto HotnessFromPGO = getConstantHotnessUsingProfileCount(C, PSI, *Count); + StaticDataHotness GlobalVarHotness = StaticDataHotness::LukewarmOrUnknown; + if (HotnessFromDataAccessProf == StaticDataHotness::Hot || + HotnessFromPGO == StaticDataHotness::Hot) { + GlobalVarHotness = StaticDataHotness::Hot; + } else if (HotnessFromDataAccessProf == + StaticDataHotness::LukewarmOrUnknown || + HotnessFromPGO == StaticDataHotness::LukewarmOrUnknown) { + GlobalVarHotness = StaticDataHotness::LukewarmOrUnknown; + } else { + GlobalVarHotness = StaticDataHotness::Cold; + } + StringRef Prefix = hotnessToStr(GlobalVarHotness); + LLVM_DEBUG( + dbgs() << GV->getName() << " has section prefix " + << DbgPrintPrefix(Prefix) + << ", the max from data access profiles as " + << DbgPrintPrefix(hotnessToStr(HotnessFromDataAccessProf)) + << " and PGO counters as " + << DbgPrintPrefix(hotnessToStr(HotnessFromPGO)) << "\n"); + return Prefix; + } + } if (!Count) return ""; return hotnessToStr(getConstantHotnessUsingProfileCount(C, PSI, *Count)); } bool StaticDataProfileInfoWrapperPass::doInitialization(Module &M) { - Info.reset(new StaticDataProfileInfo()); + bool EnableDataAccessProf = false; + if (auto *MD = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("EnableDataAccessProf"))) + EnableDataAccessProf = MD->getZExtValue(); + Info.reset(new StaticDataProfileInfo(EnableDataAccessProf)); return false; } diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 1dd470b..cf63285 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -1270,7 +1270,7 @@ bool LLParser::parseAliasOrIFunc(const std::string &Name, unsigned NameID, if (parseToken(lltok::StringConstant, "expected partition string")) return true; } else if (!IsAlias && Lex.getKind() == lltok::MetadataVar) { - if (parseGlobalObjectMetadataAttachment(*GI.get())) + if (parseGlobalObjectMetadataAttachment(*GI)) return true; } else { return tokError("unknown alias or ifunc property!"); @@ -5876,6 +5876,7 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) { REQUIRED(file, MDField, (/* AllowNull */ false)); \ OPTIONAL(language, DwarfLangField, ); \ OPTIONAL(sourceLanguageName, DwarfSourceLangNameField, ); \ + OPTIONAL(sourceLanguageVersion, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(producer, MDStringField, ); \ OPTIONAL(isOptimized, MDBoolField, ); \ OPTIONAL(flags, MDStringField, ); \ @@ -5905,10 +5906,15 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) { return error(Loc, "can only specify one of 'language' and " "'sourceLanguageName' on !DICompileUnit"); + if (sourceLanguageVersion.Seen && !sourceLanguageName.Seen) + return error(Loc, "'sourceLanguageVersion' requires an associated " + "'sourceLanguageName' on !DICompileUnit"); + Result = DICompileUnit::getDistinct( Context, language.Seen ? 
DISourceLanguageName(language.Val) - : DISourceLanguageName(sourceLanguageName.Val, 0), + : DISourceLanguageName(sourceLanguageName.Val, + sourceLanguageVersion.Val), file.Val, producer.Val, isOptimized.Val, flags.Val, runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val, retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val, splitDebugInlining.Val, diff --git a/llvm/lib/BinaryFormat/XCOFF.cpp b/llvm/lib/BinaryFormat/XCOFF.cpp index e0a4471..19d5b98 100644 --- a/llvm/lib/BinaryFormat/XCOFF.cpp +++ b/llvm/lib/BinaryFormat/XCOFF.cpp @@ -112,26 +112,26 @@ StringRef XCOFF::getNameForTracebackTableLanguageId( XCOFF::CFileCpuId XCOFF::getCpuID(StringRef CPUName) { StringRef CPU = PPC::normalizeCPUName(CPUName); return StringSwitch<XCOFF::CFileCpuId>(CPU) - .Cases("generic", "COM", XCOFF::TCPU_COM) + .Cases({"generic", "COM"}, XCOFF::TCPU_COM) .Case("601", XCOFF::TCPU_601) - .Cases("602", "603", "603e", "603ev", XCOFF::TCPU_603) - .Cases("604", "604e", XCOFF::TCPU_604) + .Cases({"602", "603", "603e", "603ev"}, XCOFF::TCPU_603) + .Cases({"604", "604e"}, XCOFF::TCPU_604) .Case("620", XCOFF::TCPU_620) .Case("970", XCOFF::TCPU_970) - .Cases("a2", "g3", "g4", "g5", "e500", XCOFF::TCPU_COM) - .Cases("pwr3", "pwr4", XCOFF::TCPU_COM) - .Cases("pwr5", "PWR5", XCOFF::TCPU_PWR5) - .Cases("pwr5x", "PWR5X", XCOFF::TCPU_PWR5X) - .Cases("pwr6", "PWR6", XCOFF::TCPU_PWR6) - .Cases("pwr6x", "PWR6E", XCOFF::TCPU_PWR6E) - .Cases("pwr7", "PWR7", XCOFF::TCPU_PWR7) - .Cases("pwr8", "PWR8", XCOFF::TCPU_PWR8) - .Cases("pwr9", "PWR9", XCOFF::TCPU_PWR9) - .Cases("pwr10", "PWR10", XCOFF::TCPU_PWR10) - .Cases("ppc", "PPC", "ppc32", "ppc64", XCOFF::TCPU_COM) + .Cases({"a2", "g3", "g4", "g5", "e500"}, XCOFF::TCPU_COM) + .Cases({"pwr3", "pwr4"}, XCOFF::TCPU_COM) + .Cases({"pwr5", "PWR5"}, XCOFF::TCPU_PWR5) + .Cases({"pwr5x", "PWR5X"}, XCOFF::TCPU_PWR5X) + .Cases({"pwr6", "PWR6"}, XCOFF::TCPU_PWR6) + .Cases({"pwr6x", "PWR6E"}, XCOFF::TCPU_PWR6E) + .Cases({"pwr7", "PWR7"}, XCOFF::TCPU_PWR7) + .Cases({"pwr8", "PWR8"}, XCOFF::TCPU_PWR8) + .Cases({"pwr9", "PWR9"}, XCOFF::TCPU_PWR9) + .Cases({"pwr10", "PWR10"}, XCOFF::TCPU_PWR10) + .Cases({"ppc", "PPC", "ppc32", "ppc64"}, XCOFF::TCPU_COM) .Case("ppc64le", XCOFF::TCPU_PWR8) .Case("future", XCOFF::TCPU_PWR10) - .Cases("any", "ANY", XCOFF::TCPU_ANY) + .Cases({"any", "ANY"}, XCOFF::TCPU_ANY) .Default(XCOFF::TCPU_INVALID); } diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index cdcf7a8..ed0443f 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1860,7 +1860,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } case bitc::METADATA_COMPILE_UNIT: { - if (Record.size() < 14 || Record.size() > 22) + if (Record.size() < 14 || Record.size() > 23) return error("Invalid record"); // Ignore Record[0], which indicates whether this compile unit is @@ -1869,11 +1869,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( const auto LangVersionMask = (uint64_t(1) << 63); const bool HasVersionedLanguage = Record[1] & LangVersionMask; + const uint32_t LanguageVersion = Record.size() > 22 ? Record[22] : 0; auto *CU = DICompileUnit::getDistinct( Context, HasVersionedLanguage - ? DISourceLanguageName(Record[1] & ~LangVersionMask, 0) + ? 
DISourceLanguageName(Record[1] & ~LangVersionMask, + LanguageVersion) : DISourceLanguageName(Record[1]), getMDOrNull(Record[2]), getMDString(Record[3]), Record[4], getMDString(Record[5]), Record[6], getMDString(Record[7]), Record[8], diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 54e916e..8ff3aa9 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2142,6 +2142,7 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N, Record.push_back(N->getRangesBaseAddress()); Record.push_back(VE.getMetadataOrNullID(N->getRawSysRoot())); Record.push_back(VE.getMetadataOrNullID(N->getRawSDK())); + Record.push_back(Lang.hasVersionedName() ? Lang.getVersion() : 0); Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev); Record.clear(); diff --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp index 323b21e..4e6f93e 100644 --- a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp +++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp @@ -1102,8 +1102,6 @@ void TrieRawHashMapHandle::print( if (auto Err = Printer.printRecords()) OS << "error: " << toString(std::move(Err)) << "\n"; - - return; } Error TrieRawHashMapHandle::validate( diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 05fffe9..e2af0c5 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -119,6 +119,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include "llvm/Support/VCSRevision.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" @@ -476,6 +477,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { } bool AsmPrinter::doInitialization(Module &M) { + VFS = vfs::getRealFileSystem(); auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>(); MMI = MMIWP ? &MMIWP->getMMI() : nullptr; HasSplitStack = false; @@ -1683,7 +1685,7 @@ static ConstantInt *extractNumericCGTypeId(const Function &F) { return nullptr; } -/// Emits .callgraph section. +/// Emits .llvm.callgraph section. 
void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, FunctionCallGraphInfo &FuncCGInfo) { if (!MF.getTarget().Options.EmitCallGraphSection) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index c364ffc..8dd8b9da 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -36,6 +36,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -98,6 +99,7 @@ void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, unsigned BufNum = addInlineAsmDiagBuffer(Str, LocMDNode); SourceMgr &SrcMgr = *MMI->getContext().getInlineSourceManager(); SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); + SrcMgr.setVirtualFileSystem(VFS); std::unique_ptr<MCAsmParser> Parser( createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum)); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 433877f..567acf7 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1039,12 +1039,17 @@ void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit, } else NewCU.addString(Die, dwarf::DW_AT_producer, Producer); - if (auto Lang = DIUnit->getSourceLanguage(); Lang.hasVersionedName()) + if (auto Lang = DIUnit->getSourceLanguage(); Lang.hasVersionedName()) { NewCU.addUInt(Die, dwarf::DW_AT_language_name, dwarf::DW_FORM_data2, Lang.getName()); - else + + if (uint32_t LangVersion = Lang.getVersion(); LangVersion != 0) + NewCU.addUInt(Die, dwarf::DW_AT_language_version, /*Form=*/std::nullopt, + LangVersion); + } else { NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, Lang.getName()); + } NewCU.addString(Die, dwarf::DW_AT_name, FN); StringRef SysRoot = DIUnit->getSysRoot(); @@ -2066,11 +2071,36 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { if (NoDebug) return; + auto RecordLineZero = [&]() { + // Preserve the file and column numbers, if we can, to save space in + // the encoded line table. + // Do not update PrevInstLoc, it remembers the last non-0 line. + const MDNode *Scope = nullptr; + unsigned Column = 0; + if (PrevInstLoc) { + Scope = PrevInstLoc.getScope(); + Column = PrevInstLoc.getCol(); + } + recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0); + }; + + // When we emit a line-0 record, we don't update PrevInstLoc; so look at + // the last line number actually emitted, to see if it was line 0. + unsigned LastAsmLine = + Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine(); + // Check if source location changes, but ignore DBG_VALUE and CFI locations. // If the instruction is part of the function frame setup code, do not emit // any line record, as there is no correspondence with any user code. - if (MI->isMetaInstruction() || MI->getFlag(MachineInstr::FrameSetup)) + if (MI->isMetaInstruction()) + return; + if (MI->getFlag(MachineInstr::FrameSetup)) { + // Prevent a loc from the previous block leaking into frame setup instrs. 
+ if (LastAsmLine && PrevInstBB && PrevInstBB != MI->getParent()) + RecordLineZero(); return; + } + const DebugLoc &DL = MI->getDebugLoc(); unsigned Flags = 0; @@ -2093,11 +2123,6 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { LocationString); }; - // When we emit a line-0 record, we don't update PrevInstLoc; so look at - // the last line number actually emitted, to see if it was line 0. - unsigned LastAsmLine = - Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine(); - // There may be a mixture of scopes using and not using Key Instructions. // Not-Key-Instructions functions inlined into Key Instructions functions // should use not-key is_stmt handling. Key Instructions functions inlined @@ -2163,18 +2188,8 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { // - Instruction is at the top of a block; we don't want to inherit the // location from the physically previous (maybe unrelated) block. if (UnknownLocations == Enable || PrevLabel || - (PrevInstBB && PrevInstBB != MI->getParent())) { - // Preserve the file and column numbers, if we can, to save space in - // the encoded line table. - // Do not update PrevInstLoc, it remembers the last non-0 line. - const MDNode *Scope = nullptr; - unsigned Column = 0; - if (PrevInstLoc) { - Scope = PrevInstLoc.getScope(); - Column = PrevInstLoc.getCol(); - } - recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0); - } + (PrevInstBB && PrevInstBB != MI->getParent())) + RecordLineZero(); return; } diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp index 2d50167..fae952e 100644 --- a/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -491,6 +491,20 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) { return true; } if (FBB) { + // If we get here with a MBB which ends like this: + // + // bb.1: + // successors: %bb.2; + // ... + // BNE $x1, $x0, %bb.2 + // PseudoBR %bb.2 + // + // Just remove conditional branch. + if (TBB == FBB) { + removeBranch(MBB); + insertUncondBranch(MBB, TBB); + return true; + } // We need to split the basic block here to obtain two long-range // unconditional branches. NewBB = createNewBlockAfter(*MBB); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 4320b1d..9e78ec9 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -819,7 +819,7 @@ void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) { } // Verify BFI has been updated correctly by recomputing BFI and comparing them. -void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) { +[[maybe_unused]] void CodeGenPrepare::verifyBFIUpdates(Function &F) { DominatorTree NewDT(F); LoopInfo NewLI(NewDT); BranchProbabilityInfo NewBPI(F, NewLI, TLInfo); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp index 25c1db9..ded4df4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp @@ -55,12 +55,10 @@ LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx, } LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx, - LLT NewEltTy) { + ElementCount EC) { return [=](const LegalityQuery &Query) { const LLT OldTy = Query.Types[TypeIdx]; - ElementCount NewEltCount = NewEltTy.isVector() ? 
NewEltTy.getElementCount() - : ElementCount::getFixed(1); - return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount)); + return std::make_pair(TypeIdx, OldTy.changeElementCount(EC)); }; } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index cffaf7c..38ec83f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3292,8 +3292,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { if (TypeIdx != 2) return UnableToLegalize; Observer.changingInstr(MI); - // TODO: Probably should be zext - widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); + widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); Observer.changedInstr(MI); return Legalized; } @@ -3325,8 +3324,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { if (TypeIdx == 2) { Observer.changingInstr(MI); - // TODO: Probably should be zext - widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT); + widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT); Observer.changedInstr(MI); return Legalized; } diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 055fdc6..ca82857 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -818,8 +818,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, if (!DefMI) return false; - const TargetMachine& TM = DefMI->getMF()->getTarget(); - if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) + if (DefMI->getFlag(MachineInstr::FmNoNans)) return true; // If the value is a constant, we can obviously see if it is a NaN or not. diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index e359831..ea08365 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -1257,7 +1257,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Tracker.clear(); } -static void LLVM_ATTRIBUTE_UNUSED printSpillReloadChain( +[[maybe_unused]] static void printSpillReloadChain( DenseMap<MachineInstr *, SmallVector<MachineInstr *>> &SpillChain, DenseMap<MachineInstr *, SmallVector<MachineInstr *>> &ReloadChain, MachineInstr *Leader) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 358e060..c97300d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17759,7 +17759,6 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); - const TargetOptions &Options = DAG.getTarget().Options; SDNodeFlags Flags = N->getFlags(); SelectionDAG::FlagInserter FlagsInserter(DAG, N); @@ -17825,7 +17824,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { bool AllowNewConst = (Level < AfterLegalizeDAG); // If nnan is enabled, fold lots of things. 
- if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) { + if (Flags.hasNoNaNs() && AllowNewConst) { // If allowed, fold (fadd (fneg x), x) -> 0.0 if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) return DAG.getConstantFP(0.0, DL, VT); @@ -17974,7 +17973,6 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true); EVT VT = N->getValueType(0); SDLoc DL(N); - const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); SelectionDAG::FlagInserter FlagsInserter(DAG, N); @@ -18002,7 +18000,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (N0 == N1) { // (fsub x, x) -> 0.0 - if (Options.NoNaNsFPMath || Flags.hasNoNaNs()) + if (Flags.hasNoNaNs()) return DAG.getConstantFP(0.0f, DL, VT); } @@ -18313,7 +18311,6 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) { ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); EVT VT = N->getValueType(0); SDLoc DL(N); - const TargetOptions &Options = DAG.getTarget().Options; // FMA nodes have flags that propagate to the created nodes. SelectionDAG::FlagInserter FlagsInserter(DAG, N); MatchContextClass matcher(DAG, TLI, N); @@ -18339,8 +18336,7 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) { return matcher.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2); } - if ((Options.NoNaNsFPMath && N->getFlags().hasNoInfs()) || - (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs())) { + if (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs()) { if (N->getFlags().hasNoSignedZeros() || (N2CFP && !N2CFP->isExactlyValue(-0.0))) { if (N0CFP && N0CFP->isZero()) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 0f2b518..cb0038c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3971,8 +3971,14 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) { } void SelectionDAGBuilder::visitPtrToAddr(const User &I) { - // FIXME: this is not correct for pointers with addr width != pointer width - visitPtrToInt(I); + SDValue N = getValue(I.getOperand(0)); + // By definition the type of the ptrtoaddr must be equal to the address type. + const auto &TLI = DAG.getTargetLoweringInfo(); + EVT AddrVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + // The address width must be smaller or equal to the pointer representation + // width, so we lower ptrtoaddr as a truncate (possibly folded to a no-op). 
+ N = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), AddrVT, N); + setValue(&I, N); } void SelectionDAGBuilder::visitPtrToInt(const User &I) { diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp index 6610eef..c61f757 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp @@ -181,8 +181,8 @@ DWARFDebugFrame::DWARFDebugFrame(Triple::ArchType Arch, DWARFDebugFrame::~DWARFDebugFrame() = default; -static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data, - uint64_t Offset, int Length) { +[[maybe_unused]] static void dumpDataAux(DataExtractor Data, uint64_t Offset, + int Length) { errs() << "DUMP: "; for (int i = 0; i < Length; ++i) { uint8_t c = Data.getU8(&Offset); diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index 0ffe3ae..f343925 100644 --- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -56,6 +56,7 @@ add_llvm_component_library(LLVMOrcJIT SectCreate.cpp SelfExecutorProcessControl.cpp SimpleRemoteEPC.cpp + SimpleRemoteMemoryMapper.cpp Speculation.cpp SpeculateAnalyses.cpp ExecutorProcessControl.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp index 50e6b25..0833af7 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp @@ -57,16 +57,17 @@ public: std::swap(FR.Actions, G.allocActions()); Parent.EPC.callSPSWrapperAsync< - rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>( - Parent.SAs.Finalize, + rt::SPSSimpleExecutorMemoryManagerInitializeSignature>( + Parent.SAs.Initialize, [OnFinalize = std::move(OnFinalize), AllocAddr = this->AllocAddr]( - Error SerializationErr, Error FinalizeErr) mutable { + Error SerializationErr, + Expected<ExecutorAddr> InitializeKey) mutable { // FIXME: Release abandoned alloc. if (SerializationErr) { - cantFail(std::move(FinalizeErr)); + cantFail(InitializeKey.takeError()); OnFinalize(std::move(SerializationErr)); - } else if (FinalizeErr) - OnFinalize(std::move(FinalizeErr)); + } else if (!InitializeKey) + OnFinalize(InitializeKey.takeError()); else OnFinalize(FinalizedAlloc(AllocAddr)); }, @@ -76,8 +77,8 @@ public: void abandon(OnAbandonedFunction OnAbandoned) override { // FIXME: Return memory to pool instead. 
Parent.EPC.callSPSWrapperAsync< - rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>( - Parent.SAs.Deallocate, + rt::SPSSimpleExecutorMemoryManagerReleaseSignature>( + Parent.SAs.Release, [OnAbandoned = std::move(OnAbandoned)](Error SerializationErr, Error DeallocateErr) mutable { if (SerializationErr) { @@ -123,9 +124,8 @@ void EPCGenericJITLinkMemoryManager::allocate(const JITLinkDylib *JD, void EPCGenericJITLinkMemoryManager::deallocate( std::vector<FinalizedAlloc> Allocs, OnDeallocatedFunction OnDeallocated) { - EPC.callSPSWrapperAsync< - rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>( - SAs.Deallocate, + EPC.callSPSWrapperAsync<rt::SPSSimpleExecutorMemoryManagerReleaseSignature>( + SAs.Release, [OnDeallocated = std::move(OnDeallocated)](Error SerErr, Error DeallocErr) mutable { if (SerErr) { diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp index fec7062..cc72488 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp @@ -25,9 +25,9 @@ EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols( if (auto Err = EPC.getBootstrapSymbols( {{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName}, {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName}, - {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName}, - {SAs.Deallocate, - rt::SimpleExecutorMemoryManagerDeallocateWrapperName}, + {SAs.Initialize, + rt::SimpleExecutorMemoryManagerInitializeWrapperName}, + {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}, {SAs.RegisterEHFrame, rt::RegisterEHFrameSectionAllocActionName}, {SAs.DeregisterEHFrame, rt::DeregisterEHFrameSectionAllocActionName}})) @@ -48,7 +48,7 @@ EPCGenericRTDyldMemoryManager::~EPCGenericRTDyldMemoryManager() { Error Err = Error::success(); if (auto Err2 = EPC.callSPSWrapper< - rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>( + rt::SPSSimpleExecutorMemoryManagerReleaseSignature>( SAs.Reserve, Err, SAs.Instance, FinalizedAllocs)) { // FIXME: Report errors through EPC once that functionality is available. logAllUnhandledErrors(std::move(Err2), errs(), ""); @@ -267,10 +267,10 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) { // We'll also need to make an extra allocation for the eh-frame wrapper call // arguments. 
- Error FinalizeErr = Error::success(); + Expected<ExecutorAddr> InitializeKey((ExecutorAddr())); if (auto Err = EPC.callSPSWrapper< - rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>( - SAs.Finalize, FinalizeErr, SAs.Instance, std::move(FR))) { + rt::SPSSimpleExecutorMemoryManagerInitializeSignature>( + SAs.Initialize, InitializeKey, SAs.Instance, std::move(FR))) { std::lock_guard<std::mutex> Lock(M); this->ErrMsg = toString(std::move(Err)); dbgs() << "Serialization error: " << this->ErrMsg << "\n"; @@ -278,9 +278,9 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) { *ErrMsg = this->ErrMsg; return true; } - if (FinalizeErr) { + if (!InitializeKey) { std::lock_guard<std::mutex> Lock(M); - this->ErrMsg = toString(std::move(FinalizeErr)); + this->ErrMsg = toString(InitializeKey.takeError()); dbgs() << "Finalization error: " << this->ErrMsg << "\n"; if (ErrMsg) *ErrMsg = this->ErrMsg; diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp index 26e8f53..cc99d3c 100644 --- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp @@ -23,10 +23,12 @@ const char *SimpleExecutorMemoryManagerInstanceName = "__llvm_orc_SimpleExecutorMemoryManager_Instance"; const char *SimpleExecutorMemoryManagerReserveWrapperName = "__llvm_orc_SimpleExecutorMemoryManager_reserve_wrapper"; -const char *SimpleExecutorMemoryManagerFinalizeWrapperName = - "__llvm_orc_SimpleExecutorMemoryManager_finalize_wrapper"; -const char *SimpleExecutorMemoryManagerDeallocateWrapperName = - "__llvm_orc_SimpleExecutorMemoryManager_deallocate_wrapper"; +const char *SimpleExecutorMemoryManagerInitializeWrapperName = + "__llvm_orc_SimpleExecutorMemoryManager_initialize_wrapper"; +const char *SimpleExecutorMemoryManagerDeinitializeWrapperName = + "__llvm_orc_SimpleExecutorMemoryManager_deinitialize_wrapper"; +const char *SimpleExecutorMemoryManagerReleaseWrapperName = + "__llvm_orc_SimpleExecutorMemoryManager_release_wrapper"; const char *ExecutorSharedMemoryMapperServiceInstanceName = "__llvm_orc_ExecutorSharedMemoryMapperService_Instance"; diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp index 87d7578..dec1df7 100644 --- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp +++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp @@ -216,9 +216,9 @@ SimpleRemoteEPC::createDefaultMemoryManager(SimpleRemoteEPC &SREPC) { if (auto Err = SREPC.getBootstrapSymbols( {{SAs.Allocator, rt::SimpleExecutorMemoryManagerInstanceName}, {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName}, - {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName}, - {SAs.Deallocate, - rt::SimpleExecutorMemoryManagerDeallocateWrapperName}})) + {SAs.Initialize, + rt::SimpleExecutorMemoryManagerInitializeWrapperName}, + {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}})) return std::move(Err); return std::make_unique<EPCGenericJITLinkMemoryManager>(SREPC, SAs); diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp new file mode 100644 index 0000000..b82de3f --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp @@ -0,0 +1,104 @@ +//===---- SimpleRemoteMemoryMapper.cpp - Remote memory mapper ----*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h" + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" + +namespace llvm::orc { + +SimpleRemoteMemoryMapper::SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC, + SymbolAddrs SAs) + : EPC(EPC), SAs(SAs) {} + +void SimpleRemoteMemoryMapper::reserve(size_t NumBytes, + OnReservedFunction OnReserved) { + EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReserveSignature>( + SAs.Reserve, + [NumBytes, OnReserved = std::move(OnReserved)]( + Error SerializationErr, Expected<ExecutorAddr> Result) mutable { + if (SerializationErr) { + cantFail(Result.takeError()); + return OnReserved(std::move(SerializationErr)); + } + + if (Result) + OnReserved(ExecutorAddrRange(*Result, NumBytes)); + else + OnReserved(Result.takeError()); + }, + SAs.Instance, static_cast<uint64_t>(NumBytes)); +} + +char *SimpleRemoteMemoryMapper::prepare(jitlink::LinkGraph &G, + ExecutorAddr Addr, size_t ContentSize) { + return G.allocateBuffer(ContentSize).data(); +} + +void SimpleRemoteMemoryMapper::initialize(MemoryMapper::AllocInfo &AI, + OnInitializedFunction OnInitialized) { + + tpctypes::FinalizeRequest FR; + + std::swap(FR.Actions, AI.Actions); + FR.Segments.reserve(AI.Segments.size()); + + for (auto Seg : AI.Segments) + FR.Segments.push_back({Seg.AG, AI.MappingBase + Seg.Offset, + Seg.ContentSize + Seg.ZeroFillSize, + ArrayRef<char>(Seg.WorkingMem, Seg.ContentSize)}); + + EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapInitializeSignature>( + SAs.Initialize, + [OnInitialized = std::move(OnInitialized)]( + Error SerializationErr, Expected<ExecutorAddr> Result) mutable { + if (SerializationErr) { + cantFail(Result.takeError()); + return OnInitialized(std::move(SerializationErr)); + } + + OnInitialized(std::move(Result)); + }, + SAs.Instance, std::move(FR)); +} + +void SimpleRemoteMemoryMapper::deinitialize( + ArrayRef<ExecutorAddr> Allocations, + MemoryMapper::OnDeinitializedFunction OnDeinitialized) { + EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>( + SAs.Deinitialize, + [OnDeinitialized = std::move(OnDeinitialized)](Error SerializationErr, + Error Result) mutable { + if (SerializationErr) { + cantFail(std::move(Result)); + return OnDeinitialized(std::move(SerializationErr)); + } + + OnDeinitialized(std::move(Result)); + }, + SAs.Instance, Allocations); +} + +void SimpleRemoteMemoryMapper::release(ArrayRef<ExecutorAddr> Bases, + OnReleasedFunction OnReleased) { + EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReleaseSignature>( + SAs.Release, + [OnReleased = std::move(OnReleased)](Error SerializationErr, + Error Result) mutable { + if (SerializationErr) { + cantFail(std::move(Result)); + return OnReleased(std::move(SerializationErr)); + } + + return OnReleased(std::move(Result)); + }, + SAs.Instance, Bases); +} + +} // namespace llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp index 3cdffb8..fe881a1 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp @@ -8,6 +8,7 @@ #include 
"llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" #include "llvm/Support/FormatVariadic.h" @@ -18,166 +19,167 @@ namespace orc { namespace rt_bootstrap { SimpleExecutorMemoryManager::~SimpleExecutorMemoryManager() { - assert(Allocations.empty() && "shutdown not called?"); + assert(Slabs.empty() && "shutdown not called?"); } -Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) { +Expected<ExecutorAddr> SimpleExecutorMemoryManager::reserve(uint64_t Size) { std::error_code EC; auto MB = sys::Memory::allocateMappedMemory( Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC); if (EC) return errorCodeToError(EC); std::lock_guard<std::mutex> Lock(M); - assert(!Allocations.count(MB.base()) && "Duplicate allocation addr"); - Allocations[MB.base()].Size = Size; + assert(!Slabs.count(MB.base()) && "Duplicate allocation addr"); + Slabs[MB.base()].Size = Size; return ExecutorAddr::fromPtr(MB.base()); } -Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) { - ExecutorAddr Base(~0ULL); +Expected<ExecutorAddr> +SimpleExecutorMemoryManager::initialize(tpctypes::FinalizeRequest &FR) { std::vector<shared::WrapperFunctionCall> DeallocationActions; - size_t SuccessfulFinalizationActions = 0; if (FR.Segments.empty()) { - // NOTE: Finalizing nothing is currently a no-op. Should it be an error? if (FR.Actions.empty()) - return Error::success(); + return make_error<StringError>("Finalization request is empty", + inconvertibleErrorCode()); else return make_error<StringError>("Finalization actions attached to empty " "finalization request", inconvertibleErrorCode()); } - for (auto &Seg : FR.Segments) - Base = std::min(Base, Seg.Addr); - - for (auto &ActPair : FR.Actions) - if (ActPair.Dealloc) - DeallocationActions.push_back(ActPair.Dealloc); - - // Get the Allocation for this finalization. - size_t AllocSize = 0; - { - std::lock_guard<std::mutex> Lock(M); - auto I = Allocations.find(Base.toPtr<void *>()); - if (I == Allocations.end()) - return make_error<StringError>("Attempt to finalize unrecognized " - "allocation " + - formatv("{0:x}", Base.getValue()), - inconvertibleErrorCode()); - AllocSize = I->second.Size; - I->second.DeallocationActions = std::move(DeallocationActions); - } - ExecutorAddr AllocEnd = Base + ExecutorAddrDiff(AllocSize); - - // Bail-out function: this will run deallocation actions corresponding to any - // completed finalization actions, then deallocate memory. - auto BailOut = [&](Error Err) { - std::pair<void *, Allocation> AllocToDestroy; - - // Get allocation to destroy. - { - std::lock_guard<std::mutex> Lock(M); - auto I = Allocations.find(Base.toPtr<void *>()); - - // Check for missing allocation (effective a double free). - if (I == Allocations.end()) - return joinErrors( - std::move(Err), - make_error<StringError>("No allocation entry found " - "for " + - formatv("{0:x}", Base.getValue()), - inconvertibleErrorCode())); - AllocToDestroy = std::move(*I); - Allocations.erase(I); - } + ExecutorAddrRange RR(FR.Segments.front().Addr, FR.Segments.front().Addr); - // Run deallocation actions for all completed finalization actions. - while (SuccessfulFinalizationActions) - Err = - joinErrors(std::move(Err), FR.Actions[--SuccessfulFinalizationActions] - .Dealloc.runWithSPSRetErrorMerged()); - - // Deallocate memory. 
- sys::MemoryBlock MB(AllocToDestroy.first, AllocToDestroy.second.Size); - if (auto EC = sys::Memory::releaseMappedMemory(MB)) - Err = joinErrors(std::move(Err), errorCodeToError(EC)); - - return Err; - }; + std::vector<sys::MemoryBlock> MBsToReset; + auto ResetMBs = make_scope_exit([&]() { + for (auto &MB : MBsToReset) + sys::Memory::protectMappedMemory(MB, sys::Memory::MF_READ | + sys::Memory::MF_WRITE); + sys::Memory::InvalidateInstructionCache(RR.Start.toPtr<void *>(), + RR.size()); + }); // Copy content and apply permissions. for (auto &Seg : FR.Segments) { + RR.Start = std::min(RR.Start, Seg.Addr); + RR.End = std::max(RR.End, Seg.Addr + Seg.Size); // Check segment ranges. if (LLVM_UNLIKELY(Seg.Size < Seg.Content.size())) - return BailOut(make_error<StringError>( + return make_error<StringError>( formatv("Segment {0:x} content size ({1:x} bytes) " "exceeds segment size ({2:x} bytes)", Seg.Addr.getValue(), Seg.Content.size(), Seg.Size), - inconvertibleErrorCode())); + inconvertibleErrorCode()); ExecutorAddr SegEnd = Seg.Addr + ExecutorAddrDiff(Seg.Size); - if (LLVM_UNLIKELY(Seg.Addr < Base || SegEnd > AllocEnd)) - return BailOut(make_error<StringError>( + if (LLVM_UNLIKELY(Seg.Addr < RR.Start || SegEnd > RR.End)) + return make_error<StringError>( formatv("Segment {0:x} -- {1:x} crosses boundary of " "allocation {2:x} -- {3:x}", - Seg.Addr.getValue(), SegEnd.getValue(), Base.getValue(), - AllocEnd.getValue()), - inconvertibleErrorCode())); + Seg.Addr, SegEnd, RR.Start, RR.End), + inconvertibleErrorCode()); char *Mem = Seg.Addr.toPtr<char *>(); if (!Seg.Content.empty()) memcpy(Mem, Seg.Content.data(), Seg.Content.size()); memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size()); assert(Seg.Size <= std::numeric_limits<size_t>::max()); + + sys::MemoryBlock MB(Mem, Seg.Size); if (auto EC = sys::Memory::protectMappedMemory( - {Mem, static_cast<size_t>(Seg.Size)}, - toSysMemoryProtectionFlags(Seg.RAG.Prot))) - return BailOut(errorCodeToError(EC)); + MB, toSysMemoryProtectionFlags(Seg.RAG.Prot))) + return errorCodeToError(EC); + + MBsToReset.push_back(MB); + if ((Seg.RAG.Prot & MemProt::Exec) == MemProt::Exec) sys::Memory::InvalidateInstructionCache(Mem, Seg.Size); } - // Run finalization actions. - for (auto &ActPair : FR.Actions) { - if (auto Err = ActPair.Finalize.runWithSPSRetErrorMerged()) - return BailOut(std::move(Err)); - ++SuccessfulFinalizationActions; + auto DeallocActions = runFinalizeActions(FR.Actions); + if (!DeallocActions) + return DeallocActions.takeError(); + + { + std::lock_guard<std::mutex> Lock(M); + auto Region = createRegionInfo(RR, "In initialize"); + if (!Region) + return Region.takeError(); + Region->DeallocActions = std::move(*DeallocActions); } - return Error::success(); + // Successful initialization. + ResetMBs.release(); + + return RR.Start; } -Error SimpleExecutorMemoryManager::deallocate( - const std::vector<ExecutorAddr> &Bases) { - std::vector<std::pair<void *, Allocation>> AllocPairs; - AllocPairs.reserve(Bases.size()); +Error SimpleExecutorMemoryManager::deinitialize( + const std::vector<ExecutorAddr> &InitKeys) { + Error Err = Error::success(); - // Get allocation to destroy. 
+ for (auto &KeyAddr : llvm::reverse(InitKeys)) { + std::vector<shared::WrapperFunctionCall> DeallocActions; + { + std::scoped_lock<std::mutex> Lock(M); + auto Slab = getSlabInfo(KeyAddr, "In deinitialize"); + if (!Slab) { + Err = joinErrors(std::move(Err), Slab.takeError()); + continue; + } + + auto RI = getRegionInfo(*Slab, KeyAddr, "In deinitialize"); + if (!RI) { + Err = joinErrors(std::move(Err), RI.takeError()); + continue; + } + + DeallocActions = std::move(RI->DeallocActions); + } + + Err = joinErrors(std::move(Err), + runDeallocActions(std::move(DeallocActions))); + } + + return Err; +} + +Error SimpleExecutorMemoryManager::release( + const std::vector<ExecutorAddr> &Bases) { Error Err = Error::success(); - { - std::lock_guard<std::mutex> Lock(M); - for (auto &Base : Bases) { - auto I = Allocations.find(Base.toPtr<void *>()); - - // Check for missing allocation (effective a double free). - if (I != Allocations.end()) { - AllocPairs.push_back(std::move(*I)); - Allocations.erase(I); - } else + + // TODO: Prohibit new initializations within the slabs being removed? + for (auto &Base : llvm::reverse(Bases)) { + std::vector<shared::WrapperFunctionCall> DeallocActions; + sys::MemoryBlock MB; + + { + std::scoped_lock<std::mutex> Lock(M); + + auto SlabI = Slabs.find(Base.toPtr<void *>()); + if (SlabI == Slabs.end()) { Err = joinErrors( std::move(Err), - make_error<StringError>("No allocation entry found " - "for " + - formatv("{0:x}", Base.getValue()), + make_error<StringError>("In release, " + formatv("{0:x}", Base) + + " is not part of any reserved " + "address range", inconvertibleErrorCode())); + continue; + } + + auto &Slab = SlabI->second; + + for (auto &[Addr, Region] : Slab.Regions) + llvm::copy(Region.DeallocActions, back_inserter(DeallocActions)); + + MB = {Base.toPtr<void *>(), Slab.Size}; + + Slabs.erase(SlabI); } - } - while (!AllocPairs.empty()) { - auto &P = AllocPairs.back(); - Err = joinErrors(std::move(Err), deallocateImpl(P.first, P.second)); - AllocPairs.pop_back(); + Err = joinErrors(std::move(Err), runDeallocActions(DeallocActions)); + if (auto EC = sys::Memory::releaseMappedMemory(MB)) + Err = joinErrors(std::move(Err), errorCodeToError(EC)); } return Err; @@ -185,16 +187,15 @@ Error SimpleExecutorMemoryManager::deallocate( Error SimpleExecutorMemoryManager::shutdown() { - AllocationsMap AM; + // TODO: Prevent new allocations during shutdown. 
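Both deinitialize and release above resolve an executor address to its owning slab, as do the getSlabInfo helpers further below, via a std::map::upper_bound probe over slab base addresses. A minimal standalone sketch of that lookup pattern, with illustrative names (Slab, findSlab) that are not LLVM API:

#include <cstdint>
#include <map>

struct Slab { uint64_t Size; };

// Find the slab containing Addr: take the first slab whose base is greater
// than Addr, step back one entry, then verify Addr falls inside it.
const Slab *findSlab(const std::map<uint64_t, Slab> &Slabs, uint64_t Addr) {
  auto I = Slabs.upper_bound(Addr);
  if (I == Slabs.begin())
    return nullptr; // Addr lies below the lowest slab base.
  --I;
  if (Addr - I->first >= I->second.Size)
    return nullptr; // Addr is past the end of the candidate slab.
  return &I->second;
}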
+ std::vector<ExecutorAddr> Bases; { - std::lock_guard<std::mutex> Lock(M); - AM = std::move(Allocations); + std::scoped_lock<std::mutex> Lock(M); + for (auto &[Base, Slab] : Slabs) + Bases.push_back(ExecutorAddr::fromPtr(Base)); } - Error Err = Error::success(); - for (auto &KV : AM) - Err = joinErrors(std::move(Err), deallocateImpl(KV.first, KV.second)); - return Err; + return release(Bases); } void SimpleExecutorMemoryManager::addBootstrapSymbols( @@ -202,58 +203,150 @@ void SimpleExecutorMemoryManager::addBootstrapSymbols( M[rt::SimpleExecutorMemoryManagerInstanceName] = ExecutorAddr::fromPtr(this); M[rt::SimpleExecutorMemoryManagerReserveWrapperName] = ExecutorAddr::fromPtr(&reserveWrapper); - M[rt::SimpleExecutorMemoryManagerFinalizeWrapperName] = - ExecutorAddr::fromPtr(&finalizeWrapper); - M[rt::SimpleExecutorMemoryManagerDeallocateWrapperName] = - ExecutorAddr::fromPtr(&deallocateWrapper); + M[rt::SimpleExecutorMemoryManagerInitializeWrapperName] = + ExecutorAddr::fromPtr(&initializeWrapper); + M[rt::SimpleExecutorMemoryManagerDeinitializeWrapperName] = + ExecutorAddr::fromPtr(&deinitializeWrapper); + M[rt::SimpleExecutorMemoryManagerReleaseWrapperName] = + ExecutorAddr::fromPtr(&releaseWrapper); } -Error SimpleExecutorMemoryManager::deallocateImpl(void *Base, Allocation &A) { - Error Err = Error::success(); +Expected<SimpleExecutorMemoryManager::SlabInfo &> +SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddr A, StringRef Context) { + auto MakeBadSlabError = [&]() { + return make_error<StringError>( + Context + ", address " + formatv("{0:x}", A) + + " is not part of any reserved address range", + inconvertibleErrorCode()); + }; - while (!A.DeallocationActions.empty()) { - Err = joinErrors(std::move(Err), - A.DeallocationActions.back().runWithSPSRetErrorMerged()); - A.DeallocationActions.pop_back(); + auto I = Slabs.upper_bound(A.toPtr<void *>()); + if (I == Slabs.begin()) + return MakeBadSlabError(); + --I; + if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size) + .contains(A)) + return MakeBadSlabError(); + + return I->second; +} + +Expected<SimpleExecutorMemoryManager::SlabInfo &> +SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddrRange R, + StringRef Context) { + auto MakeBadSlabError = [&]() { + return make_error<StringError>( + Context + ", range " + formatv("{0:x}", R) + + " is not part of any reserved address range", + inconvertibleErrorCode()); + }; + + auto I = Slabs.upper_bound(R.Start.toPtr<void *>()); + if (I == Slabs.begin()) + return MakeBadSlabError(); + --I; + if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size) + .contains(R)) + return MakeBadSlabError(); + + return I->second; +} + +Expected<SimpleExecutorMemoryManager::RegionInfo &> +SimpleExecutorMemoryManager::createRegionInfo(ExecutorAddrRange R, + StringRef Context) { + + auto Slab = getSlabInfo(R, Context); + if (!Slab) + return Slab.takeError(); + + auto MakeBadRegionError = [&](ExecutorAddrRange Other, bool Prev) { + return make_error<StringError>(Context + ", region " + formatv("{0:x}", R) + + " overlaps " + + (Prev ? 
"previous" : "following") + + " region " + formatv("{0:x}", Other), + inconvertibleErrorCode()); + }; + + auto I = Slab->Regions.upper_bound(R.Start); + if (I != Slab->Regions.begin()) { + auto J = std::prev(I); + ExecutorAddrRange PrevRange(J->first, J->second.Size); + if (PrevRange.overlaps(R)) + return MakeBadRegionError(PrevRange, true); + } + if (I != Slab->Regions.end()) { + ExecutorAddrRange NextRange(I->first, I->second.Size); + if (NextRange.overlaps(R)) + return MakeBadRegionError(NextRange, false); } - sys::MemoryBlock MB(Base, A.Size); - if (auto EC = sys::Memory::releaseMappedMemory(MB)) - Err = joinErrors(std::move(Err), errorCodeToError(EC)); + auto &RInfo = Slab->Regions[R.Start]; + RInfo.Size = R.size(); + return RInfo; +} - return Err; +Expected<SimpleExecutorMemoryManager::RegionInfo &> +SimpleExecutorMemoryManager::getRegionInfo(SlabInfo &Slab, ExecutorAddr A, + StringRef Context) { + auto I = Slab.Regions.find(A); + if (I == Slab.Regions.end()) + return make_error<StringError>( + Context + ", address " + formatv("{0:x}", A) + + " does not correspond to the start of any initialized region", + inconvertibleErrorCode()); + + return I->second; +} + +Expected<SimpleExecutorMemoryManager::RegionInfo &> +SimpleExecutorMemoryManager::getRegionInfo(ExecutorAddr A, StringRef Context) { + auto Slab = getSlabInfo(A, Context); + if (!Slab) + return Slab.takeError(); + + return getRegionInfo(*Slab, A, Context); } llvm::orc::shared::CWrapperFunctionResult SimpleExecutorMemoryManager::reserveWrapper(const char *ArgData, size_t ArgSize) { - return shared::WrapperFunction< - rt::SPSSimpleExecutorMemoryManagerReserveSignature>:: + return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReserveSignature>:: handle(ArgData, ArgSize, shared::makeMethodWrapperHandler( - &SimpleExecutorMemoryManager::allocate)) + &SimpleExecutorMemoryManager::reserve)) + .release(); +} + +llvm::orc::shared::CWrapperFunctionResult +SimpleExecutorMemoryManager::initializeWrapper(const char *ArgData, + size_t ArgSize) { + return shared:: + WrapperFunction<rt::SPSSimpleRemoteMemoryMapInitializeSignature>::handle( + ArgData, ArgSize, + shared::makeMethodWrapperHandler( + &SimpleExecutorMemoryManager::initialize)) .release(); } llvm::orc::shared::CWrapperFunctionResult -SimpleExecutorMemoryManager::finalizeWrapper(const char *ArgData, - size_t ArgSize) { +SimpleExecutorMemoryManager::deinitializeWrapper(const char *ArgData, + size_t ArgSize) { return shared::WrapperFunction< - rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>:: + rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>:: handle(ArgData, ArgSize, shared::makeMethodWrapperHandler( - &SimpleExecutorMemoryManager::finalize)) + &SimpleExecutorMemoryManager::deinitialize)) .release(); } llvm::orc::shared::CWrapperFunctionResult -SimpleExecutorMemoryManager::deallocateWrapper(const char *ArgData, - size_t ArgSize) { - return shared::WrapperFunction< - rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>:: +SimpleExecutorMemoryManager::releaseWrapper(const char *ArgData, + size_t ArgSize) { + return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReleaseSignature>:: handle(ArgData, ArgSize, shared::makeMethodWrapperHandler( - &SimpleExecutorMemoryManager::deallocate)) + &SimpleExecutorMemoryManager::release)) .release(); } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 2430d98..3908a78 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -2374,16 +2374,21 @@ static void writeDICompileUnit(raw_ostream 
&Out, const DICompileUnit *N,
 Out << "!DICompileUnit(";
 MDFieldPrinter Printer(Out, WriterCtx);
- auto Lang = N->getSourceLanguage();
- if (Lang.hasVersionedName())
+ DISourceLanguageName Lang = N->getSourceLanguage();
+
+ if (Lang.hasVersionedName()) {
 Printer.printDwarfEnum(
 "sourceLanguageName",
 static_cast<llvm::dwarf::SourceLanguageName>(Lang.getName()),
 dwarf::SourceLanguageNameString, /* ShouldSkipZero */ false);
- else
+
+ Printer.printInt("sourceLanguageVersion", Lang.getVersion(),
+ /*ShouldSkipZero=*/true);
+ } else {
 Printer.printDwarfEnum("language", Lang.getName(), dwarf::LanguageString,
 /* ShouldSkipZero */ false);
+ }
 Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false);
 Printer.printString("producer", N->getProducer());
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index f28b989..d8374b6 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6041,8 +6041,7 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
 Triple T(TT);
 // The only data layout upgrades needed for pre-GCN, SPIR or SPIRV are setting
 // the address space of globals to 1. This does not apply to SPIRV Logical.
- if (((T.isAMDGPU() && !T.isAMDGCN()) ||
- (T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical()))) &&
+ if ((T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical())) &&
 !DL.contains("-G") && !DL.starts_with("G")) {
 return DL.empty() ? std::string("G1") : (DL + "-G1").str();
 }
@@ -6055,35 +6054,43 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
 return DL.str();
 }
+ // AMDGPU data layout upgrades.
 std::string Res = DL.str();
- // AMDGCN data layout upgrades.
- if (T.isAMDGCN()) {
+ if (T.isAMDGPU()) {
 // Define address spaces for constants.
 if (!DL.contains("-G") && !DL.starts_with("G"))
 Res.append(Res.empty() ? "G1" : "-G1");
- // Add missing non-integral declarations.
- // This goes before adding new address spaces to prevent incoherent string
- // values.
- if (!DL.contains("-ni") && !DL.starts_with("ni"))
- Res.append("-ni:7:8:9");
- // Update ni:7 to ni:7:8:9.
- if (DL.ends_with("ni:7"))
- Res.append(":8:9");
- if (DL.ends_with("ni:7:8"))
- Res.append(":9");
-
- // Add sizing for address spaces 7 and 8 (fat raw buffers and buffer
- // resources) An empty data layout has already been upgraded to G1 by now.
- if (!DL.contains("-p7") && !DL.starts_with("p7"))
- Res.append("-p7:160:256:256:32");
- if (!DL.contains("-p8") && !DL.starts_with("p8"))
- Res.append("-p8:128:128:128:48");
- constexpr StringRef OldP8("-p8:128:128-");
- if (DL.contains(OldP8))
- Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
- if (!DL.contains("-p9") && !DL.starts_with("p9"))
- Res.append("-p9:192:256:256:32");
+ // AMDGCN data layout upgrades.
+ if (T.isAMDGCN()) {
+
+ // Add missing non-integral declarations.
+ // This goes before adding new address spaces to prevent incoherent string
+ // values.
+ if (!DL.contains("-ni") && !DL.starts_with("ni"))
+ Res.append("-ni:7:8:9");
+ // Update ni:7 to ni:7:8:9.
+ if (DL.ends_with("ni:7"))
+ Res.append(":8:9");
+ if (DL.ends_with("ni:7:8"))
+ Res.append(":9");
+
+ // Add sizing for address spaces 7 and 8 (fat raw buffers and buffer
+ // resources). An empty data layout has already been upgraded to G1 by now.
+ if (!DL.contains("-p7") && !DL.starts_with("p7")) + Res.append("-p7:160:256:256:32"); + if (!DL.contains("-p8") && !DL.starts_with("p8")) + Res.append("-p8:128:128:128:48"); + constexpr StringRef OldP8("-p8:128:128-"); + if (DL.contains(OldP8)) + Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-"); + if (!DL.contains("-p9") && !DL.starts_with("p9")) + Res.append("-p9:192:256:256:32"); + } + + // Upgrade the ELF mangling mode. + if (!DL.contains("m:e")) + Res = Res.empty() ? "m:e" : "m:e-" + Res; return Res; } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 9601a8a..5883606 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -294,9 +294,9 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) { // just DISubprogram's, referenced from anywhere within the Function being // cloned prior to calling MapMetadata / RemapInstruction to avoid their // duplication later as DICompileUnit's are also directly referenced by - // llvm.dbg.cu list. Thefore we need to collect DICompileUnit's here as well. - // Also, DICompileUnit's may reference DISubprogram's too and therefore need - // to be at least looked through. + // llvm.dbg.cu list. Therefore we need to collect DICompileUnit's here as + // well. Also, DICompileUnit's may reference DISubprogram's too and therefore + // need to be at least looked through. processCompileUnit(SP->getUnit()); processType(SP->getType()); for (auto *Element : SP->getTemplateParams()) { @@ -377,7 +377,7 @@ bool DebugInfoFinder::addScope(DIScope *Scope) { /// Recursively handle DILocations in followup metadata etc. /// -/// TODO: If for example a followup loop metadata would refence itself this +/// TODO: If for example a followup loop metadata would reference itself this /// function would go into infinite recursion. We do not expect such cycles in /// the loop metadata (except for the self-referencing first element /// "LoopID"). However, we could at least handle such situations more gracefully @@ -679,7 +679,7 @@ private: auto Variables = nullptr; auto TemplateParams = nullptr; - // Make a distinct DISubprogram, for situations that warrent it. + // Make a distinct DISubprogram, for situations that warrant it. 
auto distinctMDSubprogram = [&]() { return DISubprogram::getDistinct( MDS->getContext(), FileAndScope, MDS->getName(), LinkageName, @@ -1095,6 +1095,35 @@ LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder, const char *Filename, StringRef(Directory, DirectoryLen))); } +static llvm::DIFile::ChecksumKind +map_from_llvmChecksumKind(LLVMChecksumKind CSKind) { + switch (CSKind) { + case LLVMChecksumKind::CSK_MD5: + return llvm::DIFile::CSK_MD5; + case LLVMChecksumKind::CSK_SHA1: + return llvm::DIFile::CSK_SHA1; + case LLVMChecksumKind::CSK_SHA256: + return llvm::DIFile::CSK_SHA256; + } + llvm_unreachable("Unhandled Checksum Kind"); +} + +LLVMMetadataRef LLVMDIBuilderCreateFileWithChecksum( + LLVMDIBuilderRef Builder, const char *Filename, size_t FilenameLen, + const char *Directory, size_t DirectoryLen, LLVMChecksumKind ChecksumKind, + const char *Checksum, size_t ChecksumLen, const char *Source, + size_t SourceLen) { + StringRef ChkSum = StringRef(Checksum, ChecksumLen); + auto CSK = map_from_llvmChecksumKind(ChecksumKind); + llvm::DIFile::ChecksumInfo<StringRef> CSInfo(CSK, ChkSum); + std::optional<StringRef> Src; + if (SourceLen > 0) + Src = StringRef(Source, SourceLen); + return wrap(unwrap(Builder)->createFile(StringRef(Filename, FilenameLen), + StringRef(Directory, DirectoryLen), + CSInfo, Src)); +} + LLVMMetadataRef LLVMDIBuilderCreateModule(LLVMDIBuilderRef Builder, LLVMMetadataRef ParentScope, const char *Name, size_t NameLen, @@ -2014,7 +2043,7 @@ void at::remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map, I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID)); } -/// Collect constant properies (base, size, offset) of \p StoreDest. +/// Collect constant properties (base, size, offset) of \p StoreDest. /// Return std::nullopt if any properties are not constants or the /// offset from the base pointer is negative. static std::optional<AssignmentInfo> @@ -2300,7 +2329,7 @@ PreservedAnalyses AssignmentTrackingPass::run(Function &F, return PreservedAnalyses::all(); // Record that this module uses assignment tracking. It doesn't matter that - // some functons in the module may not use it - the debug info in those + // some functions in the module may not use it - the debug info in those // functions will still be handled properly. 
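A hypothetical call to the LLVMDIBuilderCreateFileWithChecksum entry point added above, assuming the matching declarations (and the CSK_MD5 enumerator spelling used in this hunk) are exposed from the C API headers; the builder, paths, and digest are placeholders:

#include "llvm-c/DebugInfo.h"
#include <cstring>

LLVMMetadataRef emitFileWithMD5(LLVMDIBuilderRef Builder) {
  const char *File = "main.c";
  const char *Dir = "/src";
  const char *Sum = "595f44fec1e92a71d3e9e77456ba80d1"; // placeholder digest
  // A zero SourceLen leaves the optional embedded source unset.
  return LLVMDIBuilderCreateFileWithChecksum(
      Builder, File, strlen(File), Dir, strlen(Dir), LLVMChecksumKind::CSK_MD5,
      Sum, strlen(Sum), /*Source=*/nullptr, /*SourceLen=*/0);
}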
setAssignmentTrackingModuleFlag(*F.getParent()); diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 9db48e8..0e9535d 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -1034,6 +1034,10 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) { } // DirectX resources + if (Name == "dx.Padding") + return TargetTypeInfo( + ArrayType::get(Type::getInt8Ty(C), Ty->getIntParameter(0)), + TargetExtType::CanBeGlobal); if (Name.starts_with("dx.")) return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::CanBeGlobal, TargetExtType::CanBeLocal, diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index c79a950..3572852 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6479,9 +6479,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { NumRows->getZExtValue() * NumColumns->getZExtValue(), "Result of a matrix operation does not fit in the returned vector!"); - if (Stride) + if (Stride) { + Check(Stride->getBitWidth() <= 64, "Stride bitwidth cannot exceed 64!", + IF); Check(Stride->getZExtValue() >= NumRows->getZExtValue(), "Stride must be greater or equal than the number of rows!", IF); + } break; } diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index e6544f3..aec8891 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1257,38 +1257,6 @@ Error LTO::run(AddStreamFn AddStream, FileCache Cache) { return Result; } -void lto::updateMemProfAttributes(Module &Mod, - const ModuleSummaryIndex &Index) { - llvm::TimeTraceScope timeScope("LTO update memprof attributes"); - if (Index.withSupportsHotColdNew()) - return; - - // The profile matcher applies hotness attributes directly for allocations, - // and those will cause us to generate calls to the hot/cold interfaces - // unconditionally. If supports-hot-cold-new was not enabled in the LTO - // link then assume we don't want these calls (e.g. not linking with - // the appropriate library, or otherwise trying to disable this behavior). - for (auto &F : Mod) { - for (auto &BB : F) { - for (auto &I : BB) { - auto *CI = dyn_cast<CallBase>(&I); - if (!CI) - continue; - if (CI->hasFnAttr("memprof")) - CI->removeFnAttr("memprof"); - // Strip off all memprof metadata as it is no longer needed. - // Importantly, this avoids the addition of new memprof attributes - // after inlining propagation. - // TODO: If we support additional types of MemProf metadata beyond hot - // and cold, we will need to update the metadata based on the allocator - // APIs supported instead of completely stripping all. - CI->setMetadata(LLVMContext::MD_memprof, nullptr); - CI->setMetadata(LLVMContext::MD_callsite, nullptr); - } - } - } -} - Error LTO::runRegularLTO(AddStreamFn AddStream) { llvm::TimeTraceScope timeScope("Run regular LTO"); LLVMContext &CombinedCtx = RegularLTO.CombinedModule->getContext(); @@ -1346,8 +1314,6 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { } } - updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex); - bool WholeProgramVisibilityEnabledInLTO = Conf.HasWholeProgramVisibility && // If validation is enabled, upgrade visibility only when all vtables diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 11a7b32..280c3d1 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -726,7 +726,6 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, } // Do this after any importing so that imported code is updated. 
- updateMemProfAttributes(Mod, CombinedIndex); updatePublicTypeTestCalls(Mod, CombinedIndex.withWholeProgramVisibility()); if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod)) diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index a755c22..aee3c3b 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -553,7 +553,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { SFrameSection = Ctx->getELFSection(".sframe", ELF::SHT_GNU_SFRAME, ELF::SHF_ALLOC); - CallGraphSection = Ctx->getELFSection(".callgraph", ELF::SHT_PROGBITS, 0); + CallGraphSection = + Ctx->getELFSection(".llvm.callgraph", ELF::SHT_PROGBITS, 0); StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0); @@ -1171,8 +1172,8 @@ MCObjectFileInfo::getCallGraphSection(const MCSection &TextSec) const { } return Ctx->getELFSection( - ".callgraph", ELF::SHT_PROGBITS, Flags, 0, GroupName, true, - ElfSec.getUniqueID(), + ".llvm.callgraph", ELF::SHT_PROGBITS, Flags, 0, GroupName, + /*IsComdat=*/true, ElfSec.getUniqueID(), static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol())); } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 421d6603..c3a27c9 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -488,6 +488,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_HEXAGON_MACH_V5, EF_HEXAGON_MACH); BCaseMask(EF_HEXAGON_MACH_V55, EF_HEXAGON_MACH); BCaseMask(EF_HEXAGON_MACH_V60, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V61, EF_HEXAGON_MACH); BCaseMask(EF_HEXAGON_MACH_V62, EF_HEXAGON_MACH); BCaseMask(EF_HEXAGON_MACH_V65, EF_HEXAGON_MACH); BCaseMask(EF_HEXAGON_MACH_V66, EF_HEXAGON_MACH); @@ -499,12 +500,21 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_HEXAGON_MACH_V71T, EF_HEXAGON_MACH); BCaseMask(EF_HEXAGON_MACH_V73, EF_HEXAGON_MACH); BCaseMask(EF_HEXAGON_MACH_V75, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V77, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V79, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V81, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V83, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V85, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V87, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V89, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V91, EF_HEXAGON_MACH); BCaseMask(EF_HEXAGON_ISA_V2, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V3, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V4, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V5, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V55, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V60, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V61, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V62, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V65, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V66, EF_HEXAGON_ISA); @@ -514,6 +524,14 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_HEXAGON_ISA_V71, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V73, EF_HEXAGON_ISA); BCaseMask(EF_HEXAGON_ISA_V75, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V77, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V79, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V81, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V83, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V85, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V87, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V89, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V91, EF_HEXAGON_ISA); break; case ELF::EM_AVR: BCaseMask(EF_AVR_ARCH_AVR1, 
EF_AVR_ARCH_MASK); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 53cf004..e45cac8 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -2027,13 +2027,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, #define LOOPNEST_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ MPM.addPass(createModuleToFunctionPassAdaptor( \ - createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \ + createFunctionToLoopPassAdaptor(CREATE_PASS, false))); \ return Error::success(); \ } #define LOOP_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ MPM.addPass(createModuleToFunctionPassAdaptor( \ - createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \ + createFunctionToLoopPassAdaptor(CREATE_PASS, false))); \ return Error::success(); \ } #define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \ @@ -2041,9 +2041,8 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, auto Params = parsePassParameters(PARSER, Name, NAME); \ if (!Params) \ return Params.takeError(); \ - MPM.addPass( \ - createModuleToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( \ - CREATE_PASS(Params.get()), false, false))); \ + MPM.addPass(createModuleToFunctionPassAdaptor( \ + createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), false))); \ return Error::success(); \ } #include "PassRegistry.def" @@ -2142,13 +2141,13 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, #define LOOPNEST_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ CGPM.addPass(createCGSCCToFunctionPassAdaptor( \ - createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \ + createFunctionToLoopPassAdaptor(CREATE_PASS, false))); \ return Error::success(); \ } #define LOOP_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ CGPM.addPass(createCGSCCToFunctionPassAdaptor( \ - createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \ + createFunctionToLoopPassAdaptor(CREATE_PASS, false))); \ return Error::success(); \ } #define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \ @@ -2156,9 +2155,8 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, auto Params = parsePassParameters(PARSER, Name, NAME); \ if (!Params) \ return Params.takeError(); \ - CGPM.addPass( \ - createCGSCCToFunctionPassAdaptor(createFunctionToLoopPassAdaptor( \ - CREATE_PASS(Params.get()), false, false))); \ + CGPM.addPass(createCGSCCToFunctionPassAdaptor( \ + createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), false))); \ return Error::success(); \ } #include "PassRegistry.def" @@ -2191,11 +2189,8 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, return Err; // Add the nested pass manager with the appropriate adaptor. bool UseMemorySSA = (Name == "loop-mssa"); - bool UseBFI = llvm::any_of(InnerPipeline, [](auto Pipeline) { - return Pipeline.Name.contains("simple-loop-unswitch"); - }); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA, - UseBFI)); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA)); return Error::success(); } if (Name == "machine-function") { @@ -2248,12 +2243,12 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, // The risk is that it may become obsolete if we're not careful. 
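Every loop-pass call site in this patch converges on the two-argument adaptor form now that the UseBlockFrequencyInfo parameter is gone. A minimal sketch of the updated usage, assuming a tree that carries this change:

#include "llvm/Transforms/Scalar/LoopPassManager.h"

using namespace llvm;

void addLoopPasses(FunctionPassManager &FPM, LoopPassManager &&LPM,
                   bool IsMSSALoopPM) {
  // MemorySSA use remains an explicit opt-in; the BFI flag is no longer
  // plumbed through by callers.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM),
                                              /*UseMemorySSA=*/IsMSSALoopPM));
}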
#define LOOPNEST_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ - FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false, false)); \ + FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false)); \ return Error::success(); \ } #define LOOP_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ - FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false, false)); \ + FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false)); \ return Error::success(); \ } #define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \ @@ -2261,8 +2256,8 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, auto Params = parsePassParameters(PARSER, Name, NAME); \ if (!Params) \ return Params.takeError(); \ - FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), \ - false, false)); \ + FPM.addPass( \ + createFunctionToLoopPassAdaptor(CREATE_PASS(Params.get()), false)); \ return Error::success(); \ } #include "PassRegistry.def" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index fea0d25..bd03ac0 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -519,16 +519,14 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, invokeLoopOptimizerEndEPCallbacks(LPM2, Level); FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), - /*UseMemorySSA=*/true, - /*UseBlockFrequencyInfo=*/true)); + /*UseMemorySSA=*/true)); FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), - /*UseMemorySSA=*/false, - /*UseBlockFrequencyInfo=*/false)); + /*UseMemorySSA=*/false)); // Delete small array after loop unroll. FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); @@ -710,8 +708,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, invokeLoopOptimizerEndEPCallbacks(LPM2, Level); FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), - /*UseMemorySSA=*/true, - /*UseBlockFrequencyInfo=*/true)); + /*UseMemorySSA=*/true)); FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); @@ -719,8 +716,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), - /*UseMemorySSA=*/false, - /*UseBlockFrequencyInfo=*/false)); + /*UseMemorySSA=*/false)); // Delete small array after loop unroll. 
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
@@ -773,7 +769,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
 FPM.addPass(createFunctionToLoopPassAdaptor(
 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
 /*AllowSpeculation=*/true),
- /*UseMemorySSA=*/true,
- /*UseBlockFrequencyInfo=*/true));
+ /*UseMemorySSA=*/true));
 FPM.addPass(CoroElidePass());
@@ -842,8 +838,7 @@ void PassBuilder::addPostPGOLoopRotation(ModulePassManager &MPM,
 createFunctionToLoopPassAdaptor(
 LoopRotatePass(EnableLoopHeaderDuplication ||
 Level != OptimizationLevel::Oz),
- /*UseMemorySSA=*/false,
- /*UseBlockFrequencyInfo=*/false),
+ /*UseMemorySSA=*/false),
 PTO.EagerlyInvalidateAnalyses));
 }
 }
@@ -1358,8 +1353,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
 OptimizationLevel::O3));
 ExtraPasses.addPass(
- createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true,
- /*UseBlockFrequencyInfo=*/true));
+ createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true));
 ExtraPasses.addPass(
 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
 ExtraPasses.addPass(InstCombinePass());
@@ -1438,7 +1432,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
 FPM.addPass(createFunctionToLoopPassAdaptor(
 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
 /*AllowSpeculation=*/true),
- /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/true));
 // Now that we've vectorized and unrolled loops, we may have more refined
 // alignment information, try to re-derive it here.
@@ -1520,7 +1514,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
 OptimizePM.addPass(createFunctionToLoopPassAdaptor(
 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
 /*AllowSpeculation=*/true),
- /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/true));
 }
 OptimizePM.addPass(Float2IntPass());
@@ -1560,8 +1554,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
 if (PTO.LoopInterchange)
 LPM.addPass(LoopInterchangePass());
- OptimizePM.addPass(createFunctionToLoopPassAdaptor(
- std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
+ OptimizePM.addPass(
+ createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false));
 // FIXME: This may not be the right place in the pipeline.
 // We need to have the data to support the right place.
@@ -1658,6 +1652,16 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
 ModulePassManager MPM;
+ // Currently this pipeline is only invoked in an LTO pre link pass or when we
+ // are not running LTO. If that changes the below checks may need updating.
+ assert(isLTOPreLink(Phase) || Phase == ThinOrFullLTOPhase::None);
+
+ // If we are invoking this in non-LTO mode, remove any MemProf related
+ // attributes and metadata, as we don't know whether we are linking with
+ // a library containing the necessary interfaces.
+ if (Phase == ThinOrFullLTOPhase::None)
+ MPM.addPass(MemProfRemoveInfo());
+
 // Convert @llvm.global.annotations to !annotation metadata.
MPM.addPass(Annotation2MetadataPass());
@@ -1803,6 +1807,12 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
 OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
 ModulePassManager MPM;
+ // If we are invoking this without a summary index noting that we are linking
+ // with a library containing the necessary APIs, remove any MemProf related
+ // attributes and metadata.
+ if (!ImportSummary || !ImportSummary->withSupportsHotColdNew())
+ MPM.addPass(MemProfRemoveInfo());
+
 if (ImportSummary) {
 // For ThinLTO we must apply the context disambiguation decisions early, to
 // ensure we can correctly match the callsites to summary data.
@@ -1874,6 +1884,12 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level);
+ // If we are invoking this without a summary index noting that we are linking
+ // with a library containing the necessary APIs, remove any MemProf related
+ // attributes and metadata.
+ if (!ExportSummary || !ExportSummary->withSupportsHotColdNew())
+ MPM.addPass(MemProfRemoveInfo());
+
 // Create a function that performs CFI checks for cross-DSO calls with targets
 // in the current module.
 MPM.addPass(CrossDSOCFIPass());
@@ -2111,7 +2127,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 MainFPM.addPass(createFunctionToLoopPassAdaptor(
 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
 /*AllowSpeculation=*/true),
- /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
+ /*UseMemorySSA=*/true));
 if (RunNewGVN)
 MainFPM.addPass(NewGVNPass());
@@ -2141,8 +2157,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 PTO.ForgetAllSCEVInLoopUnroll));
 // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
 // *All* loop passes must preserve it, in order to be able to use it.
- MainFPM.addPass(createFunctionToLoopPassAdaptor(
- std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
+ MainFPM.addPass(
+ createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/false));
 MainFPM.addPass(LoopDistributePass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 1b16525..884d8da 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -113,6 +113,7 @@ MODULE_PASS("pgo-force-function-attrs",
 ? PGOOpt->ColdOptType
 : PGOOptions::ColdFuncOpt::Default))
 MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation())
+MODULE_PASS("memprof-remove-attributes", MemProfRemoveInfo())
 MODULE_PASS("memprof-module", ModuleMemProfilerPass())
 MODULE_PASS("mergefunc", MergeFunctionsPass())
 MODULE_PASS("metarenamer", MetaRenamerPass())
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 3c8e44a..0208735 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -302,7 +302,7 @@ void ProfOStream::patch(ArrayRef<PatchItem> P) {
 std::string getPGOFuncName(StringRef Name, GlobalValue::LinkageTypes Linkage,
 StringRef FileName,
- uint64_t Version LLVM_ATTRIBUTE_UNUSED) {
+ [[maybe_unused]] uint64_t Version) {
 // Value names may be prefixed with a binary '1' to indicate
 // that the backend should not modify the symbols due to any platform
 // naming convention. Do not include that '1' in the PGO profile name.
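The InstrProf hunk above swaps LLVM_ATTRIBUTE_UNUSED for the standard C++17 attribute. A minimal standalone illustration of the same pattern (not LLVM code):

#include <cstdint>
#include <string>

// Version stays in the signature for interface stability but is deliberately
// ignored; [[maybe_unused]] suppresses the unused-parameter warning that the
// LLVM macro used to silence.
std::string getVersionedName(std::string Name,
                             [[maybe_unused]] uint64_t Version) {
  return Name;
}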
diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.h b/llvm/lib/Remarks/BitstreamRemarkParser.h index 4f66c47..914edd8 100644 --- a/llvm/lib/Remarks/BitstreamRemarkParser.h +++ b/llvm/lib/Remarks/BitstreamRemarkParser.h @@ -112,7 +112,7 @@ public: /// Helper to parse a META_BLOCK for a bitstream remark container. class BitstreamMetaParserHelper : public BitstreamBlockParserHelper<BitstreamMetaParserHelper> { - friend class BitstreamBlockParserHelper; + friend class BitstreamBlockParserHelper<BitstreamMetaParserHelper>; public: struct ContainerInfo { @@ -137,7 +137,7 @@ protected: /// Helper to parse a REMARK_BLOCK for a bitstream remark container. class BitstreamRemarkParserHelper : public BitstreamBlockParserHelper<BitstreamRemarkParserHelper> { - friend class BitstreamBlockParserHelper; + friend class BitstreamBlockParserHelper<BitstreamRemarkParserHelper>; protected: SmallVector<uint64_t, 5> Record; diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 8623c06..b4de79a 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -130,44 +130,46 @@ struct fltSemantics { bool hasSignBitInMSB = true; }; -static constexpr fltSemantics semIEEEhalf = {15, -14, 11, 16}; -static constexpr fltSemantics semBFloat = {127, -126, 8, 16}; -static constexpr fltSemantics semIEEEsingle = {127, -126, 24, 32}; -static constexpr fltSemantics semIEEEdouble = {1023, -1022, 53, 64}; -static constexpr fltSemantics semIEEEquad = {16383, -16382, 113, 128}; -static constexpr fltSemantics semFloat8E5M2 = {15, -14, 3, 8}; -static constexpr fltSemantics semFloat8E5M2FNUZ = { +constexpr fltSemantics APFloatBase::semIEEEhalf = {15, -14, 11, 16}; +constexpr fltSemantics APFloatBase::semBFloat = {127, -126, 8, 16}; +constexpr fltSemantics APFloatBase::semIEEEsingle = {127, -126, 24, 32}; +constexpr fltSemantics APFloatBase::semIEEEdouble = {1023, -1022, 53, 64}; +constexpr fltSemantics APFloatBase::semIEEEquad = {16383, -16382, 113, 128}; +constexpr fltSemantics APFloatBase::semFloat8E5M2 = {15, -14, 3, 8}; +constexpr fltSemantics APFloatBase::semFloat8E5M2FNUZ = { 15, -15, 3, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; -static constexpr fltSemantics semFloat8E4M3 = {7, -6, 4, 8}; -static constexpr fltSemantics semFloat8E4M3FN = { +constexpr fltSemantics APFloatBase::semFloat8E4M3 = {7, -6, 4, 8}; +constexpr fltSemantics APFloatBase::semFloat8E4M3FN = { 8, -6, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::AllOnes}; -static constexpr fltSemantics semFloat8E4M3FNUZ = { +constexpr fltSemantics APFloatBase::semFloat8E4M3FNUZ = { 7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; -static constexpr fltSemantics semFloat8E4M3B11FNUZ = { +constexpr fltSemantics APFloatBase::semFloat8E4M3B11FNUZ = { 4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; -static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8}; -static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19}; -static constexpr fltSemantics semFloat8E8M0FNU = {127, - -127, - 1, - 8, - fltNonfiniteBehavior::NanOnly, - fltNanEncoding::AllOnes, - false, - false, - false}; - -static constexpr fltSemantics semFloat6E3M2FN = { +constexpr fltSemantics APFloatBase::semFloat8E3M4 = {3, -2, 5, 8}; +constexpr fltSemantics APFloatBase::semFloatTF32 = {127, -126, 11, 19}; +constexpr fltSemantics APFloatBase::semFloat8E8M0FNU = { + 127, + -127, + 1, + 8, + fltNonfiniteBehavior::NanOnly, + fltNanEncoding::AllOnes, + false, + false, + false}; + +constexpr 
fltSemantics APFloatBase::semFloat6E3M2FN = { 4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly}; -static constexpr fltSemantics semFloat6E2M3FN = { +constexpr fltSemantics APFloatBase::semFloat6E2M3FN = { 2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly}; -static constexpr fltSemantics semFloat4E2M1FN = { +constexpr fltSemantics APFloatBase::semFloat4E2M1FN = { 2, 0, 2, 4, fltNonfiniteBehavior::FiniteOnly}; -static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80}; -static constexpr fltSemantics semBogus = {0, 0, 0, 0}; -static constexpr fltSemantics semPPCDoubleDouble = {-1, 0, 0, 128}; -static constexpr fltSemantics semPPCDoubleDoubleLegacy = {1023, -1022 + 53, - 53 + 53, 128}; +constexpr fltSemantics APFloatBase::semX87DoubleExtended = {16383, -16382, 64, + 80}; +constexpr fltSemantics APFloatBase::semBogus = {0, 0, 0, 0}; +constexpr fltSemantics APFloatBase::semPPCDoubleDouble = {-1, 0, 0, 128}; +constexpr fltSemantics APFloatBase::semPPCDoubleDoubleLegacy = { + 1023, -1022 + 53, 53 + 53, 128}; const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) { switch (S) { @@ -261,36 +263,6 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) { llvm_unreachable("Unknown floating semantics"); } -const fltSemantics &APFloatBase::IEEEhalf() { return semIEEEhalf; } -const fltSemantics &APFloatBase::BFloat() { return semBFloat; } -const fltSemantics &APFloatBase::IEEEsingle() { return semIEEEsingle; } -const fltSemantics &APFloatBase::IEEEdouble() { return semIEEEdouble; } -const fltSemantics &APFloatBase::IEEEquad() { return semIEEEquad; } -const fltSemantics &APFloatBase::PPCDoubleDouble() { - return semPPCDoubleDouble; -} -const fltSemantics &APFloatBase::PPCDoubleDoubleLegacy() { - return semPPCDoubleDoubleLegacy; -} -const fltSemantics &APFloatBase::Float8E5M2() { return semFloat8E5M2; } -const fltSemantics &APFloatBase::Float8E5M2FNUZ() { return semFloat8E5M2FNUZ; } -const fltSemantics &APFloatBase::Float8E4M3() { return semFloat8E4M3; } -const fltSemantics &APFloatBase::Float8E4M3FN() { return semFloat8E4M3FN; } -const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; } -const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() { - return semFloat8E4M3B11FNUZ; -} -const fltSemantics &APFloatBase::Float8E3M4() { return semFloat8E3M4; } -const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; } -const fltSemantics &APFloatBase::Float8E8M0FNU() { return semFloat8E8M0FNU; } -const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; } -const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; } -const fltSemantics &APFloatBase::Float4E2M1FN() { return semFloat4E2M1FN; } -const fltSemantics &APFloatBase::x87DoubleExtended() { - return semX87DoubleExtended; -} -const fltSemantics &APFloatBase::Bogus() { return semBogus; } - bool APFloatBase::isRepresentableBy(const fltSemantics &A, const fltSemantics &B) { return A.maxExponent <= B.maxExponent && A.minExponent >= B.minExponent && @@ -1029,7 +1001,7 @@ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) { // For x87 extended precision, we want to make a NaN, not a // pseudo-NaN. Maybe we should expose the ability to make // pseudo-NaNs? 
- if (semantics == &semX87DoubleExtended) + if (semantics == &APFloatBase::semX87DoubleExtended) APInt::tcSetBit(significand, QNaNBit + 1); } @@ -1054,7 +1026,7 @@ IEEEFloat &IEEEFloat::operator=(IEEEFloat &&rhs) { category = rhs.category; sign = rhs.sign; - rhs.semantics = &semBogus; + rhs.semantics = &APFloatBase::semBogus; return *this; } @@ -1247,7 +1219,7 @@ IEEEFloat::IEEEFloat(const IEEEFloat &rhs) { assign(rhs); } -IEEEFloat::IEEEFloat(IEEEFloat &&rhs) : semantics(&semBogus) { +IEEEFloat::IEEEFloat(IEEEFloat &&rhs) : semantics(&APFloatBase::semBogus) { *this = std::move(rhs); } @@ -2607,8 +2579,8 @@ APFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, shift = toSemantics.precision - fromSemantics.precision; bool X86SpecialNan = false; - if (&fromSemantics == &semX87DoubleExtended && - &toSemantics != &semX87DoubleExtended && category == fcNaN && + if (&fromSemantics == &APFloatBase::semX87DoubleExtended && + &toSemantics != &APFloatBase::semX87DoubleExtended && category == fcNaN && (!(*significandParts() & 0x8000000000000000ULL) || !(*significandParts() & 0x4000000000000000ULL))) { // x86 has some unusual NaNs which cannot be represented in any other @@ -2694,7 +2666,7 @@ APFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, // For x87 extended precision, we want to make a NaN, not a special NaN if // the input wasn't special either. - if (!X86SpecialNan && semantics == &semX87DoubleExtended) + if (!X86SpecialNan && semantics == &APFloatBase::semX87DoubleExtended) APInt::tcSetBit(significandParts(), semantics->precision - 1); // Convert of sNaN creates qNaN and raises an exception (invalid op). @@ -3530,7 +3502,8 @@ hash_code hash_value(const IEEEFloat &Arg) { // the actual IEEE respresentations. We compensate for that here. APInt IEEEFloat::convertF80LongDoubleAPFloatToAPInt() const { - assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended); + assert(semantics == + (const llvm::fltSemantics *)&APFloatBase::semX87DoubleExtended); assert(partCount()==2); uint64_t myexponent, mysignificand; @@ -3560,7 +3533,8 @@ APInt IEEEFloat::convertF80LongDoubleAPFloatToAPInt() const { } APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const { - assert(semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy); + assert(semantics == + (const llvm::fltSemantics *)&APFloatBase::semPPCDoubleDoubleLegacy); assert(partCount()==2); uint64_t words[2]; @@ -3574,14 +3548,14 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const { // Declare fltSemantics before APFloat that uses it (and // saves pointer to it) to ensure correct destruction order. 
fltSemantics extendedSemantics = *semantics;
- extendedSemantics.minExponent = semIEEEdouble.minExponent;
+ extendedSemantics.minExponent = APFloatBase::semIEEEdouble.minExponent;
 IEEEFloat extended(*this);
 fs = extended.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
 assert(fs == opOK && !losesInfo);
 (void)fs;
 IEEEFloat u(extended);
- fs = u.convert(semIEEEdouble, rmNearestTiesToEven, &losesInfo);
+ fs = u.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &losesInfo);
 assert(fs == opOK || fs == opInexact);
 (void)fs;
 words[0] = *u.convertDoubleAPFloatToAPInt().getRawData();
@@ -3597,7 +3571,7 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const {
 IEEEFloat v(extended);
 v.subtract(u, rmNearestTiesToEven);
- fs = v.convert(semIEEEdouble, rmNearestTiesToEven, &losesInfo);
+ fs = v.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &losesInfo);
 assert(fs == opOK && !losesInfo);
 (void)fs;
 words[1] = *v.convertDoubleAPFloatToAPInt().getRawData();
@@ -3611,8 +3585,9 @@ APInt IEEEFloat::convertPPCDoubleDoubleLegacyAPFloatToAPInt() const {
 template <const fltSemantics &S>
 APInt IEEEFloat::convertIEEEFloatToAPInt() const {
 assert(semantics == &S);
- const int bias =
- (semantics == &semFloat8E8M0FNU) ? -S.minExponent : -(S.minExponent - 1);
+ const int bias = (semantics == &APFloatBase::semFloat8E8M0FNU)
+ ? -S.minExponent
+ : -(S.minExponent - 1);
 constexpr unsigned int trailing_significand_bits = S.precision - 1;
 constexpr int integer_bit_part = trailing_significand_bits / integerPartWidth;
 constexpr integerPart integer_bit =
@@ -3677,87 +3652,87 @@ APInt IEEEFloat::convertIEEEFloatToAPInt() const {
 APInt IEEEFloat::convertQuadrupleAPFloatToAPInt() const {
 assert(partCount() == 2);
- return convertIEEEFloatToAPInt<semIEEEquad>();
+ return convertIEEEFloatToAPInt<APFloatBase::semIEEEquad>();
 }
 APInt IEEEFloat::convertDoubleAPFloatToAPInt() const {
 assert(partCount()==1);
- return convertIEEEFloatToAPInt<semIEEEdouble>();
+ return convertIEEEFloatToAPInt<APFloatBase::semIEEEdouble>();
 }
 APInt IEEEFloat::convertFloatAPFloatToAPInt() const {
 assert(partCount()==1);
- return convertIEEEFloatToAPInt<semIEEEsingle>();
+ return convertIEEEFloatToAPInt<APFloatBase::semIEEEsingle>();
 }
 APInt IEEEFloat::convertBFloatAPFloatToAPInt() const {
 assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semBFloat>();
+ return convertIEEEFloatToAPInt<APFloatBase::semBFloat>();
 }
 APInt IEEEFloat::convertHalfAPFloatToAPInt() const {
 assert(partCount()==1);
- return convertIEEEFloatToAPInt<semIEEEhalf>();
+ return convertIEEEFloatToAPInt<APFloatBase::semIEEEhalf>();
 }
 APInt IEEEFloat::convertFloat8E5M2APFloatToAPInt() const {
 assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E5M2>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E5M2>();
 }
 APInt IEEEFloat::convertFloat8E5M2FNUZAPFloatToAPInt() const {
 assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E5M2FNUZ>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E5M2FNUZ>();
 }
 APInt IEEEFloat::convertFloat8E4M3APFloatToAPInt() const {
 assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E4M3>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3>();
 }
 APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const {
 assert(partCount() == 1);
- return convertIEEEFloatToAPInt<semFloat8E4M3FN>();
+ return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3FN>();
 }
 APInt IEEEFloat::convertFloat8E4M3FNUZAPFloatToAPInt() const {
 assert(partCount() ==
1); - return convertIEEEFloatToAPInt<semFloat8E4M3FNUZ>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3FNUZ>(); } APInt IEEEFloat::convertFloat8E4M3B11FNUZAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E4M3B11FNUZ>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E4M3B11FNUZ>(); } APInt IEEEFloat::convertFloat8E3M4APFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E3M4>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E3M4>(); } APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloatTF32>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloatTF32>(); } APInt IEEEFloat::convertFloat8E8M0FNUAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat8E8M0FNU>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat8E8M0FNU>(); } APInt IEEEFloat::convertFloat6E3M2FNAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat6E3M2FN>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat6E3M2FN>(); } APInt IEEEFloat::convertFloat6E2M3FNAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat6E2M3FN>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat6E2M3FN>(); } APInt IEEEFloat::convertFloat4E2M1FNAPFloatToAPInt() const { assert(partCount() == 1); - return convertIEEEFloatToAPInt<semFloat4E2M1FN>(); + return convertIEEEFloatToAPInt<APFloatBase::semFloat4E2M1FN>(); } // This function creates an APInt that is just a bit map of the floating @@ -3765,74 +3740,77 @@ APInt IEEEFloat::convertFloat4E2M1FNAPFloatToAPInt() const { // and treating the result as a normal integer is unlikely to be useful. 
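The convertIEEEFloatToAPInt instantiations above (and the initFromIEEEAPInt ones below) pass the semantics object as a reference non-type template parameter, so each instantiation is pinned to an addressable named constant such as the APFloatBase members. A standalone sketch of those mechanics, with illustrative types (not LLVM code):

struct FltSem {
  int Precision;
};

struct SemHolder {
  // Static constexpr members are addressable named constants, so they are
  // valid arguments for a reference non-type template parameter.
  static constexpr FltSem IEEEsingle{24};
  static constexpr FltSem IEEEdouble{53};
};

template <const FltSem &S> constexpr int precisionOf() { return S.Precision; }

static_assert(precisionOf<SemHolder::IEEEdouble>() == 53,
              "the referenced constant is resolved at compile time");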
APInt IEEEFloat::bitcastToAPInt() const { - if (semantics == (const llvm::fltSemantics*)&semIEEEhalf) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEhalf) return convertHalfAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semBFloat) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semBFloat) return convertBFloatAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics*)&semIEEEsingle) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle) return convertFloatAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics*)&semIEEEdouble) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble) return convertDoubleAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics*)&semIEEEquad) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad) return convertQuadrupleAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy) + if (semantics == + (const llvm::fltSemantics *)&APFloatBase::semPPCDoubleDoubleLegacy) return convertPPCDoubleDoubleLegacyAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E5M2) return convertFloat8E5M2APFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E5M2FNUZ) return convertFloat8E5M2FNUZAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3) return convertFloat8E4M3APFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3FN) return convertFloat8E4M3FNAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3FNUZ) return convertFloat8E4M3FNUZAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3B11FNUZ) + if (semantics == + (const llvm::fltSemantics *)&APFloatBase::semFloat8E4M3B11FNUZ) return convertFloat8E4M3B11FNUZAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E3M4) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E3M4) return convertFloat8E3M4APFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloatTF32) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloatTF32) return convertFloatTF32APFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat8E8M0FNU) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat8E8M0FNU) return convertFloat8E8M0FNUAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat6E3M2FN) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat6E3M2FN) return convertFloat6E3M2FNAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat6E2M3FN) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat6E2M3FN) return convertFloat6E2M3FNAPFloatToAPInt(); - if (semantics == (const llvm::fltSemantics *)&semFloat4E2M1FN) + if (semantics == (const llvm::fltSemantics *)&APFloatBase::semFloat4E2M1FN) return convertFloat4E2M1FNAPFloatToAPInt(); - assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended && + assert(semantics == + (const llvm::fltSemantics *)&APFloatBase::semX87DoubleExtended && "unknown 
format!"); return convertF80LongDoubleAPFloatToAPInt(); } float IEEEFloat::convertToFloat() const { - assert(semantics == (const llvm::fltSemantics*)&semIEEEsingle && + assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle && "Float semantics are not IEEEsingle"); APInt api = bitcastToAPInt(); return api.bitsToFloat(); } double IEEEFloat::convertToDouble() const { - assert(semantics == (const llvm::fltSemantics*)&semIEEEdouble && + assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble && "Float semantics are not IEEEdouble"); APInt api = bitcastToAPInt(); return api.bitsToDouble(); @@ -3840,7 +3818,7 @@ double IEEEFloat::convertToDouble() const { #ifdef HAS_IEE754_FLOAT128 float128 IEEEFloat::convertToQuad() const { - assert(semantics == (const llvm::fltSemantics *)&semIEEEquad && + assert(semantics == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad && "Float semantics are not IEEEquads"); APInt api = bitcastToAPInt(); return api.bitsToQuad(); @@ -3861,7 +3839,7 @@ void IEEEFloat::initFromF80LongDoubleAPInt(const APInt &api) { uint64_t mysignificand = i1; uint8_t myintegerbit = mysignificand >> 63; - initialize(&semX87DoubleExtended); + initialize(&APFloatBase::semX87DoubleExtended); assert(partCount()==2); sign = static_cast<unsigned int>(i2>>15); @@ -3893,14 +3871,16 @@ void IEEEFloat::initFromPPCDoubleDoubleLegacyAPInt(const APInt &api) { // Get the first double and convert to our format. initFromDoubleAPInt(APInt(64, i1)); - fs = convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo); + fs = convert(APFloatBase::semPPCDoubleDoubleLegacy, rmNearestTiesToEven, + &losesInfo); assert(fs == opOK && !losesInfo); (void)fs; // Unless we have a special case, add in second double. if (isFiniteNonZero()) { - IEEEFloat v(semIEEEdouble, APInt(64, i2)); - fs = v.convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo); + IEEEFloat v(APFloatBase::semIEEEdouble, APInt(64, i2)); + fs = v.convert(APFloatBase::semPPCDoubleDoubleLegacy, rmNearestTiesToEven, + &losesInfo); assert(fs == opOK && !losesInfo); (void)fs; @@ -3918,7 +3898,7 @@ void IEEEFloat::initFromFloat8E8M0FNUAPInt(const APInt &api) { uint64_t val = api.getRawData()[0]; uint64_t myexponent = (val & exponent_mask); - initialize(&semFloat8E8M0FNU); + initialize(&APFloatBase::semFloat8E8M0FNU); assert(partCount() == 1); // This format has unsigned representation only @@ -4025,109 +4005,109 @@ void IEEEFloat::initFromIEEEAPInt(const APInt &api) { } void IEEEFloat::initFromQuadrupleAPInt(const APInt &api) { - initFromIEEEAPInt<semIEEEquad>(api); + initFromIEEEAPInt<APFloatBase::semIEEEquad>(api); } void IEEEFloat::initFromDoubleAPInt(const APInt &api) { - initFromIEEEAPInt<semIEEEdouble>(api); + initFromIEEEAPInt<APFloatBase::semIEEEdouble>(api); } void IEEEFloat::initFromFloatAPInt(const APInt &api) { - initFromIEEEAPInt<semIEEEsingle>(api); + initFromIEEEAPInt<APFloatBase::semIEEEsingle>(api); } void IEEEFloat::initFromBFloatAPInt(const APInt &api) { - initFromIEEEAPInt<semBFloat>(api); + initFromIEEEAPInt<APFloatBase::semBFloat>(api); } void IEEEFloat::initFromHalfAPInt(const APInt &api) { - initFromIEEEAPInt<semIEEEhalf>(api); + initFromIEEEAPInt<APFloatBase::semIEEEhalf>(api); } void IEEEFloat::initFromFloat8E5M2APInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E5M2>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E5M2>(api); } void IEEEFloat::initFromFloat8E5M2FNUZAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E5M2FNUZ>(api); + 
initFromIEEEAPInt<APFloatBase::semFloat8E5M2FNUZ>(api); } void IEEEFloat::initFromFloat8E4M3APInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E4M3>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E4M3>(api); } void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E4M3FN>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E4M3FN>(api); } void IEEEFloat::initFromFloat8E4M3FNUZAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E4M3FNUZ>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E4M3FNUZ>(api); } void IEEEFloat::initFromFloat8E4M3B11FNUZAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E4M3B11FNUZ>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E4M3B11FNUZ>(api); } void IEEEFloat::initFromFloat8E3M4APInt(const APInt &api) { - initFromIEEEAPInt<semFloat8E3M4>(api); + initFromIEEEAPInt<APFloatBase::semFloat8E3M4>(api); } void IEEEFloat::initFromFloatTF32APInt(const APInt &api) { - initFromIEEEAPInt<semFloatTF32>(api); + initFromIEEEAPInt<APFloatBase::semFloatTF32>(api); } void IEEEFloat::initFromFloat6E3M2FNAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat6E3M2FN>(api); + initFromIEEEAPInt<APFloatBase::semFloat6E3M2FN>(api); } void IEEEFloat::initFromFloat6E2M3FNAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat6E2M3FN>(api); + initFromIEEEAPInt<APFloatBase::semFloat6E2M3FN>(api); } void IEEEFloat::initFromFloat4E2M1FNAPInt(const APInt &api) { - initFromIEEEAPInt<semFloat4E2M1FN>(api); + initFromIEEEAPInt<APFloatBase::semFloat4E2M1FN>(api); } /// Treat api as containing the bits of a floating point number. void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { assert(api.getBitWidth() == Sem->sizeInBits); - if (Sem == &semIEEEhalf) + if (Sem == &APFloatBase::semIEEEhalf) return initFromHalfAPInt(api); - if (Sem == &semBFloat) + if (Sem == &APFloatBase::semBFloat) return initFromBFloatAPInt(api); - if (Sem == &semIEEEsingle) + if (Sem == &APFloatBase::semIEEEsingle) return initFromFloatAPInt(api); - if (Sem == &semIEEEdouble) + if (Sem == &APFloatBase::semIEEEdouble) return initFromDoubleAPInt(api); - if (Sem == &semX87DoubleExtended) + if (Sem == &APFloatBase::semX87DoubleExtended) return initFromF80LongDoubleAPInt(api); - if (Sem == &semIEEEquad) + if (Sem == &APFloatBase::semIEEEquad) return initFromQuadrupleAPInt(api); - if (Sem == &semPPCDoubleDoubleLegacy) + if (Sem == &APFloatBase::semPPCDoubleDoubleLegacy) return initFromPPCDoubleDoubleLegacyAPInt(api); - if (Sem == &semFloat8E5M2) + if (Sem == &APFloatBase::semFloat8E5M2) return initFromFloat8E5M2APInt(api); - if (Sem == &semFloat8E5M2FNUZ) + if (Sem == &APFloatBase::semFloat8E5M2FNUZ) return initFromFloat8E5M2FNUZAPInt(api); - if (Sem == &semFloat8E4M3) + if (Sem == &APFloatBase::semFloat8E4M3) return initFromFloat8E4M3APInt(api); - if (Sem == &semFloat8E4M3FN) + if (Sem == &APFloatBase::semFloat8E4M3FN) return initFromFloat8E4M3FNAPInt(api); - if (Sem == &semFloat8E4M3FNUZ) + if (Sem == &APFloatBase::semFloat8E4M3FNUZ) return initFromFloat8E4M3FNUZAPInt(api); - if (Sem == &semFloat8E4M3B11FNUZ) + if (Sem == &APFloatBase::semFloat8E4M3B11FNUZ) return initFromFloat8E4M3B11FNUZAPInt(api); - if (Sem == &semFloat8E3M4) + if (Sem == &APFloatBase::semFloat8E3M4) return initFromFloat8E3M4APInt(api); - if (Sem == &semFloatTF32) + if (Sem == &APFloatBase::semFloatTF32) return initFromFloatTF32APInt(api); - if (Sem == &semFloat8E8M0FNU) + if (Sem == &APFloatBase::semFloat8E8M0FNU) return initFromFloat8E8M0FNUAPInt(api); - if (Sem == &semFloat6E3M2FN) + if (Sem == 
&APFloatBase::semFloat6E3M2FN) return initFromFloat6E3M2FNAPInt(api); - if (Sem == &semFloat6E2M3FN) + if (Sem == &APFloatBase::semFloat6E2M3FN) return initFromFloat6E2M3FNAPInt(api); - if (Sem == &semFloat4E2M1FN) + if (Sem == &APFloatBase::semFloat4E2M1FN) return initFromFloat4E2M1FNAPInt(api); llvm_unreachable("unsupported semantics"); @@ -4202,11 +4182,11 @@ IEEEFloat::IEEEFloat(const fltSemantics &Sem, const APInt &API) { } IEEEFloat::IEEEFloat(float f) { - initFromAPInt(&semIEEEsingle, APInt::floatToBits(f)); + initFromAPInt(&APFloatBase::semIEEEsingle, APInt::floatToBits(f)); } IEEEFloat::IEEEFloat(double d) { - initFromAPInt(&semIEEEdouble, APInt::doubleToBits(d)); + initFromAPInt(&APFloatBase::semIEEEdouble, APInt::doubleToBits(d)); } namespace { @@ -4815,38 +4795,40 @@ IEEEFloat frexp(const IEEEFloat &Val, int &Exp, roundingMode RM) { DoubleAPFloat::DoubleAPFloat(const fltSemantics &S) : Semantics(&S), - Floats(new APFloat[2]{APFloat(semIEEEdouble), APFloat(semIEEEdouble)}) { - assert(Semantics == &semPPCDoubleDouble); + Floats(new APFloat[2]{APFloat(APFloatBase::semIEEEdouble), + APFloat(APFloatBase::semIEEEdouble)}) { + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, uninitializedTag) - : Semantics(&S), - Floats(new APFloat[2]{APFloat(semIEEEdouble, uninitialized), - APFloat(semIEEEdouble, uninitialized)}) { - assert(Semantics == &semPPCDoubleDouble); + : Semantics(&S), Floats(new APFloat[2]{ + APFloat(APFloatBase::semIEEEdouble, uninitialized), + APFloat(APFloatBase::semIEEEdouble, uninitialized)}) { + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, integerPart I) - : Semantics(&S), Floats(new APFloat[2]{APFloat(semIEEEdouble, I), - APFloat(semIEEEdouble)}) { - assert(Semantics == &semPPCDoubleDouble); + : Semantics(&S), + Floats(new APFloat[2]{APFloat(APFloatBase::semIEEEdouble, I), + APFloat(APFloatBase::semIEEEdouble)}) { + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, const APInt &I) : Semantics(&S), Floats(new APFloat[2]{ - APFloat(semIEEEdouble, APInt(64, I.getRawData()[0])), - APFloat(semIEEEdouble, APInt(64, I.getRawData()[1]))}) { - assert(Semantics == &semPPCDoubleDouble); + APFloat(APFloatBase::semIEEEdouble, APInt(64, I.getRawData()[0])), + APFloat(APFloatBase::semIEEEdouble, APInt(64, I.getRawData()[1]))}) { + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, APFloat &&First, APFloat &&Second) : Semantics(&S), Floats(new APFloat[2]{std::move(First), std::move(Second)}) { - assert(Semantics == &semPPCDoubleDouble); - assert(&Floats[0].getSemantics() == &semIEEEdouble); - assert(&Floats[1].getSemantics() == &semIEEEdouble); + assert(Semantics == &APFloatBase::semPPCDoubleDouble); + assert(&Floats[0].getSemantics() == &APFloatBase::semIEEEdouble); + assert(&Floats[1].getSemantics() == &APFloatBase::semIEEEdouble); } DoubleAPFloat::DoubleAPFloat(const DoubleAPFloat &RHS) @@ -4854,14 +4836,14 @@ DoubleAPFloat::DoubleAPFloat(const DoubleAPFloat &RHS) Floats(RHS.Floats ? 
new APFloat[2]{APFloat(RHS.Floats[0]), APFloat(RHS.Floats[1])} : nullptr) { - assert(Semantics == &semPPCDoubleDouble); + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat::DoubleAPFloat(DoubleAPFloat &&RHS) : Semantics(RHS.Semantics), Floats(RHS.Floats) { - RHS.Semantics = &semBogus; + RHS.Semantics = &APFloatBase::semBogus; RHS.Floats = nullptr; - assert(Semantics == &semPPCDoubleDouble); + assert(Semantics == &APFloatBase::semPPCDoubleDouble); } DoubleAPFloat &DoubleAPFloat::operator=(const DoubleAPFloat &RHS) { @@ -5009,12 +4991,12 @@ APFloat::opStatus DoubleAPFloat::addWithSpecial(const DoubleAPFloat &LHS, APFloat A(LHS.Floats[0]), AA(LHS.Floats[1]), C(RHS.Floats[0]), CC(RHS.Floats[1]); - assert(&A.getSemantics() == &semIEEEdouble); - assert(&AA.getSemantics() == &semIEEEdouble); - assert(&C.getSemantics() == &semIEEEdouble); - assert(&CC.getSemantics() == &semIEEEdouble); - assert(&Out.Floats[0].getSemantics() == &semIEEEdouble); - assert(&Out.Floats[1].getSemantics() == &semIEEEdouble); + assert(&A.getSemantics() == &APFloatBase::semIEEEdouble); + assert(&AA.getSemantics() == &APFloatBase::semIEEEdouble); + assert(&C.getSemantics() == &APFloatBase::semIEEEdouble); + assert(&CC.getSemantics() == &APFloatBase::semIEEEdouble); + assert(&Out.Floats[0].getSemantics() == &APFloatBase::semIEEEdouble); + assert(&Out.Floats[1].getSemantics() == &APFloatBase::semIEEEdouble); return Out.addImpl(A, AA, C, CC, RM); } @@ -5119,28 +5101,32 @@ APFloat::opStatus DoubleAPFloat::multiply(const DoubleAPFloat &RHS, APFloat::opStatus DoubleAPFloat::divide(const DoubleAPFloat &RHS, APFloat::roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); - auto Ret = - Tmp.divide(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()), RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()); + auto Ret = Tmp.divide( + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()), RM); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } APFloat::opStatus DoubleAPFloat::remainder(const DoubleAPFloat &RHS) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); - auto Ret = - Tmp.remainder(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt())); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()); + auto Ret = Tmp.remainder( + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt())); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } APFloat::opStatus DoubleAPFloat::mod(const DoubleAPFloat &RHS) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); - auto Ret = Tmp.mod(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt())); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()); + auto Ret = Tmp.mod( + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, 
RHS.bitcastToAPInt())); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } @@ -5148,17 +5134,21 @@ APFloat::opStatus DoubleAPFloat::fusedMultiplyAdd(const DoubleAPFloat &Multiplicand, const DoubleAPFloat &Addend, APFloat::roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()); auto Ret = Tmp.fusedMultiplyAdd( - APFloat(semPPCDoubleDoubleLegacy, Multiplicand.bitcastToAPInt()), - APFloat(semPPCDoubleDoubleLegacy, Addend.bitcastToAPInt()), RM); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, + Multiplicand.bitcastToAPInt()), + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, Addend.bitcastToAPInt()), + RM); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } APFloat::opStatus DoubleAPFloat::roundToIntegral(APFloat::roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); const APFloat &Hi = getFirst(); const APFloat &Lo = getSecond(); @@ -5309,22 +5299,28 @@ void DoubleAPFloat::makeZero(bool Neg) { } void DoubleAPFloat::makeLargest(bool Neg) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x7fefffffffffffffull)); - Floats[1] = APFloat(semIEEEdouble, APInt(64, 0x7c8ffffffffffffeull)); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + Floats[0] = + APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x7fefffffffffffffull)); + Floats[1] = + APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x7c8ffffffffffffeull)); if (Neg) changeSign(); } void DoubleAPFloat::makeSmallest(bool Neg) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); Floats[0].makeSmallest(Neg); Floats[1].makeZero(/* Neg = */ false); } void DoubleAPFloat::makeSmallestNormalized(bool Neg) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x0360000000000000ull)); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + Floats[0] = + APFloat(APFloatBase::semIEEEdouble, APInt(64, 0x0360000000000000ull)); if (Neg) Floats[0].changeSign(); Floats[1].makeZero(/* Neg = */ false); @@ -5355,7 +5351,8 @@ hash_code hash_value(const DoubleAPFloat &Arg) { } APInt DoubleAPFloat::bitcastToAPInt() const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); uint64_t Data[] = { Floats[0].bitcastToAPInt().getRawData()[0], Floats[1].bitcastToAPInt().getRawData()[0], @@ -5365,10 +5362,11 @@ APInt DoubleAPFloat::bitcastToAPInt() const { Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S, roundingMode RM) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat Tmp(APFloatBase::semPPCDoubleDoubleLegacy); auto Ret = Tmp.convertFromString(S, RM); - *this = 
DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); + *this = DoubleAPFloat(APFloatBase::semPPCDoubleDouble, Tmp.bitcastToAPInt()); return Ret; } @@ -5379,7 +5377,8 @@ Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S, // nextUp must choose the smallest output > input that follows these rules. // nextDown must choose the largest output < input that follows these rules. APFloat::opStatus DoubleAPFloat::next(bool nextDown) { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); // nextDown(x) = -nextUp(-x) if (nextDown) { changeSign(); @@ -5481,7 +5480,8 @@ APFloat::opStatus DoubleAPFloat::next(bool nextDown) { APFloat::opStatus DoubleAPFloat::convertToSignExtendedInteger( MutableArrayRef<integerPart> Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); // If Hi is not finite, or Lo is zero, the value is entirely represented // by Hi. Delegate to the simpler single-APFloat conversion. @@ -5761,8 +5761,9 @@ unsigned int DoubleAPFloat::convertToHexString(char *DST, unsigned int HexDigits, bool UpperCase, roundingMode RM) const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt()) + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + return APFloat(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()) .convertToHexString(DST, HexDigits, UpperCase, RM); } @@ -5799,7 +5800,8 @@ bool DoubleAPFloat::isLargest() const { } bool DoubleAPFloat::isInteger() const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); return Floats[0].isInteger() && Floats[1].isInteger(); } @@ -5807,8 +5809,9 @@ void DoubleAPFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision, unsigned FormatMaxPadding, bool TruncateZero) const { - assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt()) + assert(Semantics == &APFloatBase::semPPCDoubleDouble && + "Unexpected Semantics"); + APFloat(APFloatBase::semPPCDoubleDoubleLegacy, bitcastToAPInt()) .toString(Str, FormatPrecision, FormatMaxPadding, TruncateZero); } @@ -5840,14 +5843,17 @@ int ilogb(const DoubleAPFloat &Arg) { DoubleAPFloat scalbn(const DoubleAPFloat &Arg, int Exp, APFloat::roundingMode RM) { - assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - return DoubleAPFloat(semPPCDoubleDouble, scalbn(Arg.Floats[0], Exp, RM), + assert(Arg.Semantics == &APFloatBase::PPCDoubleDouble() && + "Unexpected Semantics"); + return DoubleAPFloat(APFloatBase::PPCDoubleDouble(), + scalbn(Arg.Floats[0], Exp, RM), scalbn(Arg.Floats[1], Exp, RM)); } DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp, APFloat::roundingMode RM) { - assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); + assert(Arg.Semantics == &APFloatBase::PPCDoubleDouble() && + "Unexpected Semantics"); // Get the unbiased exponent e of the number, where |Arg| = m * 2^e for m in // [1.0, 2.0).
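// Note: divide(), remainder(), mod() and fusedMultiplyAdd() above all share
// one round-trip shape: repack the two IEEE doubles into the 128-bit
// semPPCDoubleDoubleLegacy software format, run the scalar APFloat operation
// there, and unpack the result back into semPPCDoubleDouble. A minimal sketch
// of that shape for a binary operation; the helper name is invented for
// illustration and is not part of the APFloat API:

#include "llvm/ADT/APFloat.h"

template <typename BinaryOp>
llvm::APFloat::opStatus
applyViaLegacyFormat(llvm::detail::DoubleAPFloat &Self,
                     const llvm::detail::DoubleAPFloat &RHS, BinaryOp Op) {
  using namespace llvm;
  // Bit-copy both operands into the legacy one-piece format.
  APFloat Tmp(APFloatBase::PPCDoubleDoubleLegacy(), Self.bitcastToAPInt());
  APFloat Other(APFloatBase::PPCDoubleDoubleLegacy(), RHS.bitcastToAPInt());
  APFloat::opStatus Ret = Op(Tmp, Other);
  // Bit-copy the result back into the two-double representation.
  Self = detail::DoubleAPFloat(APFloatBase::PPCDoubleDouble(),
                               Tmp.bitcastToAPInt());
  return Ret;
}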
@@ -5943,7 +5949,8 @@ DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp, } APFloat First = scalbn(Hi, -Exp, RM); - return DoubleAPFloat(semPPCDoubleDouble, std::move(First), std::move(Second)); + return DoubleAPFloat(APFloatBase::PPCDoubleDouble(), std::move(First), + std::move(Second)); } } // namespace detail @@ -5955,9 +5962,8 @@ APFloat::Storage::Storage(IEEEFloat F, const fltSemantics &Semantics) { } if (usesLayout<DoubleAPFloat>(Semantics)) { const fltSemantics& S = F.getSemantics(); - new (&Double) - DoubleAPFloat(Semantics, APFloat(std::move(F), S), - APFloat(semIEEEdouble)); + new (&Double) DoubleAPFloat(Semantics, APFloat(std::move(F), S), + APFloat(APFloatBase::IEEEdouble())); return; } llvm_unreachable("Unexpected semantics"); @@ -6065,8 +6071,9 @@ APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics, return U.IEEE.convert(ToSemantics, RM, losesInfo); if (usesLayout<IEEEFloat>(getSemantics()) && usesLayout<DoubleAPFloat>(ToSemantics)) { - assert(&ToSemantics == &semPPCDoubleDouble); - auto Ret = U.IEEE.convert(semPPCDoubleDoubleLegacy, RM, losesInfo); + assert(&ToSemantics == &APFloatBase::semPPCDoubleDouble); + auto Ret = + U.IEEE.convert(APFloatBase::semPPCDoubleDoubleLegacy, RM, losesInfo); *this = APFloat(ToSemantics, U.IEEE.bitcastToAPInt()); return Ret; } @@ -6113,13 +6120,15 @@ APFloat::opStatus APFloat::convertToInteger(APSInt &result, } double APFloat::convertToDouble() const { - if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEdouble) + if (&getSemantics() == + (const llvm::fltSemantics *)&APFloatBase::semIEEEdouble) return getIEEE().convertToDouble(); assert(isRepresentableBy(getSemantics(), semIEEEdouble) && "Float semantics is not representable by IEEEdouble"); APFloat Temp = *this; bool LosesInfo; - opStatus St = Temp.convert(semIEEEdouble, rmNearestTiesToEven, &LosesInfo); + opStatus St = + Temp.convert(APFloatBase::semIEEEdouble, rmNearestTiesToEven, &LosesInfo); assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); (void)St; return Temp.getIEEE().convertToDouble(); @@ -6127,13 +6136,14 @@ double APFloat::convertToDouble() const { #ifdef HAS_IEE754_FLOAT128 float128 APFloat::convertToQuad() const { - if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad) + if (&getSemantics() == (const llvm::fltSemantics *)&APFloatBase::semIEEEquad) return getIEEE().convertToQuad(); assert(isRepresentableBy(getSemantics(), semIEEEquad) && "Float semantics is not representable by IEEEquad"); APFloat Temp = *this; bool LosesInfo; - opStatus St = Temp.convert(semIEEEquad, rmNearestTiesToEven, &LosesInfo); + opStatus St = + Temp.convert(APFloatBase::semIEEEquad, rmNearestTiesToEven, &LosesInfo); assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); (void)St; return Temp.getIEEE().convertToQuad(); @@ -6141,18 +6151,84 @@ float128 APFloat::convertToQuad() const { #endif float APFloat::convertToFloat() const { - if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle) + if (&getSemantics() == + (const llvm::fltSemantics *)&APFloatBase::semIEEEsingle) return getIEEE().convertToFloat(); assert(isRepresentableBy(getSemantics(), semIEEEsingle) && "Float semantics is not representable by IEEEsingle"); APFloat Temp = *this; bool LosesInfo; - opStatus St = Temp.convert(semIEEEsingle, rmNearestTiesToEven, &LosesInfo); + opStatus St = + Temp.convert(APFloatBase::semIEEEsingle, rmNearestTiesToEven, &LosesInfo); assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); (void)St; return 
Temp.getIEEE().convertToFloat(); } +APFloat::Storage::~Storage() { + if (usesLayout<IEEEFloat>(*semantics)) { + IEEE.~IEEEFloat(); + return; + } + if (usesLayout<DoubleAPFloat>(*semantics)) { + Double.~DoubleAPFloat(); + return; + } + llvm_unreachable("Unexpected semantics"); +} + +APFloat::Storage::Storage(const APFloat::Storage &RHS) { + if (usesLayout<IEEEFloat>(*RHS.semantics)) { + new (this) IEEEFloat(RHS.IEEE); + return; + } + if (usesLayout<DoubleAPFloat>(*RHS.semantics)) { + new (this) DoubleAPFloat(RHS.Double); + return; + } + llvm_unreachable("Unexpected semantics"); +} + +APFloat::Storage::Storage(APFloat::Storage &&RHS) { + if (usesLayout<IEEEFloat>(*RHS.semantics)) { + new (this) IEEEFloat(std::move(RHS.IEEE)); + return; + } + if (usesLayout<DoubleAPFloat>(*RHS.semantics)) { + new (this) DoubleAPFloat(std::move(RHS.Double)); + return; + } + llvm_unreachable("Unexpected semantics"); +} + +APFloat::Storage &APFloat::Storage::operator=(const APFloat::Storage &RHS) { + if (usesLayout<IEEEFloat>(*semantics) && + usesLayout<IEEEFloat>(*RHS.semantics)) { + IEEE = RHS.IEEE; + } else if (usesLayout<DoubleAPFloat>(*semantics) && + usesLayout<DoubleAPFloat>(*RHS.semantics)) { + Double = RHS.Double; + } else if (this != &RHS) { + this->~Storage(); + new (this) Storage(RHS); + } + return *this; +} + +APFloat::Storage &APFloat::Storage::operator=(APFloat::Storage &&RHS) { + if (usesLayout<IEEEFloat>(*semantics) && + usesLayout<IEEEFloat>(*RHS.semantics)) { + IEEE = std::move(RHS.IEEE); + } else if (usesLayout<DoubleAPFloat>(*semantics) && + usesLayout<DoubleAPFloat>(*RHS.semantics)) { + Double = std::move(RHS.Double); + } else if (this != &RHS) { + this->~Storage(); + new (this) Storage(std::move(RHS)); + } + return *this; +} + } // namespace llvm #undef APFLOAT_DISPATCH_ON_SEMANTICS diff --git a/llvm/lib/Support/PrettyStackTrace.cpp b/llvm/lib/Support/PrettyStackTrace.cpp index 82b0e6a..eff9947 100644 --- a/llvm/lib/Support/PrettyStackTrace.cpp +++ b/llvm/lib/Support/PrettyStackTrace.cpp @@ -141,7 +141,7 @@ extern "C" const char *__crashreporter_info__ asm(".desc ___crashreporter_info__, 0x10"); #endif -static void setCrashLogMessage(const char *msg) LLVM_ATTRIBUTE_UNUSED; +[[maybe_unused]] static void setCrashLogMessage(const char *msg); static void setCrashLogMessage(const char *msg) { #ifdef HAVE_CRASHREPORTERCLIENT_H (void)CRSetCrashLogMessage(msg); diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp index a43cf37a..299615a 100644 --- a/llvm/lib/Support/SourceMgr.cpp +++ b/llvm/lib/Support/SourceMgr.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/SMLoc.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -38,6 +39,22 @@ using namespace llvm; static const size_t TabStop = 8; +// Out of line to avoid needing definition of vfs::FileSystem in header. 
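// Note: the "out of line" comment above is the standard dance for keeping
// vfs::FileSystem forward-declared in SourceMgr.h: an IntrusiveRefCntPtr
// member only needs the complete pointee type where the holder's special
// members are instantiated, so defaulting them in the .cpp (after including
// VirtualFileSystem.h) keeps the header light. The idiom in miniature, with
// invented names:

#include "llvm/ADT/IntrusiveRefCntPtr.h"

class Expensive; // forward declaration only; the header stays light

class Holder {
  llvm::IntrusiveRefCntPtr<Expensive> Ptr;

public:
  Holder();  // defined as "= default" in the .cpp, where Expensive is
  ~Holder(); // complete, so the ref-count upkeep can be instantiated there
};

// In the .cpp: #include "Expensive.h", then
//   Holder::Holder() = default;
//   Holder::~Holder() = default;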
+SourceMgr::SourceMgr() = default; +SourceMgr::SourceMgr(IntrusiveRefCntPtr<vfs::FileSystem> FS) + : FS(std::move(FS)) {} +SourceMgr::SourceMgr(SourceMgr &&) = default; +SourceMgr &SourceMgr::operator=(SourceMgr &&) = default; +SourceMgr::~SourceMgr() = default; + +IntrusiveRefCntPtr<vfs::FileSystem> SourceMgr::getVirtualFileSystem() const { + return FS; +} + +void SourceMgr::setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS) { + this->FS = std::move(FS); +} + unsigned SourceMgr::AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc, std::string &IncludedFile) { @@ -52,8 +69,11 @@ unsigned SourceMgr::AddIncludeFile(const std::string &Filename, ErrorOr<std::unique_ptr<MemoryBuffer>> SourceMgr::OpenIncludeFile(const std::string &Filename, std::string &IncludedFile) { - ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr = - MemoryBuffer::getFile(Filename); + auto GetFile = [this](StringRef Path) { + return FS ? FS->getBufferForFile(Path) : MemoryBuffer::getFile(Path); + }; + + ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr = GetFile(Filename); SmallString<64> Buffer(Filename); // If the file didn't exist directly, see if it's in an include path. @@ -61,7 +81,7 @@ SourceMgr::OpenIncludeFile(const std::string &Filename, ++i) { Buffer = IncludeDirectories[i]; sys::path::append(Buffer, Filename); - NewBufOrErr = MemoryBuffer::getFile(Buffer); + NewBufOrErr = GetFile(Buffer); } if (NewBufOrErr) diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index dad0fa3..648d6a5 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -354,8 +354,8 @@ namespace llvm { /// Emulates hitting "retry" from an "abort, retry, ignore" CRT debug report /// dialog. "retry" raises an exception which ultimately triggers our stack /// dumper. -static LLVM_ATTRIBUTE_UNUSED int -AvoidMessageBoxHook(int ReportType, char *Message, int *Return) { +[[maybe_unused]] static int AvoidMessageBoxHook(int ReportType, char *Message, + int *Return) { // Set *Return to the retry code for the return value of _CrtDbgReport: // http://msdn.microsoft.com/en-us/library/8hyw4sy7(v=vs.71).aspx // This may also trigger just-in-time debugging via DebugBreak(). diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index 42043f7..b1024a8 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -128,6 +129,7 @@ int llvm::TableGenMain(const char *argv0, // Record the location of the include directory so that the lexer can find // it later. 
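// Note: the TableGen driver below wires in the real filesystem via
// vfs::getRealFileSystem(); when no VFS is set, OpenIncludeFile() above falls
// back to MemoryBuffer::getFile() as before. A hedged usage sketch with an
// in-memory filesystem (the file names here are invented):

#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/VirtualFileSystem.h"

void parseWithInMemoryIncludes() {
  using namespace llvm;
  auto InMemFS = makeIntrusiveRefCnt<vfs::InMemoryFileSystem>();
  InMemFS->addFile("/inc/common.td", /*ModificationTime=*/0,
                   MemoryBuffer::getMemBuffer("class Common;"));
  SourceMgr SM;
  SM.setIncludeDirs({"/inc"});
  SM.setVirtualFileSystem(std::move(InMemFS));
  std::string Resolved;
  // Resolves "common.td" against the include path through the VFS.
  SM.AddIncludeFile("common.td", SMLoc(), Resolved);
}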
SrcMgr.setIncludeDirs(IncludeDirs); + SrcMgr.setVirtualFileSystem(vfs::getRealFileSystem()); TGParser Parser(SrcMgr, MacroNames, Records, NoWarnOnUnusedTemplateArgs); diff --git a/llvm/lib/TableGen/Parser.cpp b/llvm/lib/TableGen/Parser.cpp index 2c3726a..db45054 100644 --- a/llvm/lib/TableGen/Parser.cpp +++ b/llvm/lib/TableGen/Parser.cpp @@ -9,6 +9,7 @@ #include "llvm/TableGen/Parser.h" #include "TGParser.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/TableGen/Record.h" using namespace llvm; @@ -21,6 +22,7 @@ bool llvm::TableGenParseFile(SourceMgr &InputSrcMgr, RecordKeeper &Records) { SrcMgr = SourceMgr(); SrcMgr.takeSourceBuffersFrom(InputSrcMgr); SrcMgr.setIncludeDirs(InputSrcMgr.getIncludeDirs()); + SrcMgr.setVirtualFileSystem(InputSrcMgr.getVirtualFileSystem()); SrcMgr.setDiagHandler(InputSrcMgr.getDiagHandler(), InputSrcMgr.getDiagContext()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index be2f2e4..662d84b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1561,6 +1561,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); @@ -1717,6 +1718,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom); setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); @@ -7775,6 +7777,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::VECREDUCE_FMAXIMUM: case ISD::VECREDUCE_FMINIMUM: return LowerVECREDUCE(Op, DAG); + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_FMUL: + return LowerVECREDUCE_MUL(Op, DAG); case ISD::ATOMIC_LOAD_AND: return LowerATOMIC_LOAD_AND(Op, DAG); case ISD::DYNAMIC_STACKALLOC: @@ -16794,6 +16799,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, } } +SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + assert(SrcVT.isScalableVector() && "Unexpected operand type!"); + + SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT); + unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode()); + SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags()); + + // Whilst we don't know the size of the vector, we do know the maximum size, + // so we can perform a tree reduction with an identity vector, which means + // once we arrive at the result the remaining stages (when the vector is + // smaller than the maximum) have no effect.
+ + unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; + unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements()); + + for (unsigned I = 0; I < Stages; ++I) { + Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity); + Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1)); + } + + return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0); +} + SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); @@ -18144,8 +18176,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Instruction *Load, Value *Mask, IntrinsicInst *DI) const { const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); - if (Factor != 2 && Factor != 4) { - LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n"); + if (Factor != 2 && Factor != 3 && Factor != 4) { + LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n"); return false; } auto *LI = dyn_cast<LoadInst>(Load); @@ -18223,8 +18255,8 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( Instruction *Store, Value *Mask, ArrayRef<Value *> InterleavedValues) const { unsigned Factor = InterleavedValues.size(); - if (Factor != 2 && Factor != 4) { - LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n"); + if (Factor != 2 && Factor != 3 && Factor != 4) { + LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n"); return false; } StoreInst *SI = dyn_cast<StoreInst>(Store); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 00956fd..9495c9f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -752,6 +752,7 @@ private: SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index 7322212..fe84193 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction { let hasSideEffects = 0; } +def G_USDOT : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3); + let hasSideEffects = 0; +} + // Generic instruction for the BSP pseudo. It is expanded into BSP, which // expands into BSL/BIT/BIF after register allocation. 
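// Note: a scalar model of the LowerVECREDUCE_MUL() loop above. Each stage
// deinterleaves the identity-padded data into even and odd lanes and combines
// them pairwise, so log2(maximum element count) stages always reach the
// answer; once the live elements run out, the remaining stages only multiply
// by the identity. This models the dataflow only, not the actual SVE node
// sequence:

#include <cstddef>
#include <utility>
#include <vector>

int reduceMulTree(std::vector<int> Lanes, size_t MaxElts) {
  // Assumes MaxElts is a power of two and >= Lanes.size().
  Lanes.resize(MaxElts, /*identity=*/1);
  while (Lanes.size() > 1) {
    std::vector<int> Even, Odd;
    for (size_t I = 0; I < Lanes.size(); ++I)
      (I % 2 ? Odd : Even).push_back(Lanes[I]); // VECTOR_DEINTERLEAVE
    for (size_t I = 0; I < Even.size(); ++I)
      Even[I] *= Odd[I]; // pairwise apply of BaseOpc
    Lanes = std::move(Even);
  }
  return Lanes[0]; // extract element 0
}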
def G_BSP : AArch64GenericInstruction { @@ -278,6 +284,7 @@ def : GINodeEquiv<G_UADDLV, AArch64uaddlv>; def : GINodeEquiv<G_UDOT, AArch64udot>; def : GINodeEquiv<G_SDOT, AArch64sdot>; +def : GINodeEquiv<G_USDOT, AArch64usdot>; def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 30dfcf2b..12c600f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -10600,6 +10600,9 @@ describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, Register DestReg = DestSrc->Destination->getReg(); Register SrcReg = DestSrc->Source->getReg(); + if (!DestReg.isValid() || !SrcReg.isValid()) + return std::nullopt; + auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); // If the described register is the destination, just return the source. diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 9e2d698..05a4313 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerTriOp(AArch64::G_UDOT); case Intrinsic::aarch64_neon_sdot: return LowerTriOp(AArch64::G_SDOT); + case Intrinsic::aarch64_neon_usdot: + return LowerTriOp(AArch64::G_USDOT); case Intrinsic::aarch64_neon_sqxtn: return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S); case Intrinsic::aarch64_neon_sqxtun: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a6..8ed4062 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, - Legal); + setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, + MVT::i32, Legal); setOperationAction( {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index bfe2c80..a67b12a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -901,6 +901,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index d0ad120..b841171 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1488,6 +1488,12 @@ let AssemblerPredicate = isGFX12Plus in { def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>; def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>; +// Additional aliases for ds load transpose instructions. 
+def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>; + //===----------------------------------------------------------------------===// // GFX11. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index be62395..e3f3aba 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -513,8 +513,7 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI, } if (Imm == AMDGPU::EncValues::LITERAL_CONST) { - Op = decodeLiteralConstant( - Desc, OpDesc, OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64); + Op = decodeLiteralConstant(Desc, OpDesc); continue; } @@ -893,6 +892,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // have EXEC as implicit destination. Issue a warning if encoding for // vdst is not EXEC. if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) && + MCII->get(MI.getOpcode()).getNumDefs() == 0 && MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) { auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO); if (Bytes_[0] != ExecEncoding) @@ -1545,21 +1545,21 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const { if (HasLiteral) { - if (Literal64 != Val) + if (Literal != Val) return errOperand(Val, "More than one unique literal is illegal"); } HasLiteral = true; - Literal = Literal64 = Val; + Literal = Val; - bool UseLit64 = Hi_32(Literal64) == 0; + bool UseLit64 = Hi_32(Literal) == 0; return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( - LitModifier::Lit64, Literal64, getContext())) - : MCOperand::createImm(Literal64); + LitModifier::Lit64, Literal, getContext())) + : MCOperand::createImm(Literal); } -MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, - const MCOperandInfo &OpDesc, - bool ExtendFP64) const { +MCOperand +AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, + const MCOperandInfo &OpDesc) const { // For now all literal constants are supposed to be unsigned integer // ToDo: deal with signed/unsigned 64-bit integer constants // ToDo: deal with float/double constants @@ -1569,35 +1569,79 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, Twine(Bytes.size())); } HasLiteral = true; - Literal = Literal64 = eatBytes<uint32_t>(Bytes); - if (ExtendFP64) - Literal64 <<= 32; + Literal = eatBytes<uint32_t>(Bytes); } - int64_t Val = ExtendFP64 ? Literal64 : Literal; + // For disassembling always assume all inline constants are available. 
+ bool HasInv2Pi = true; - bool CanUse64BitLiterals = - STI.hasFeature(AMDGPU::Feature64BitLiterals) && - !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); - - bool UseLit64 = false; - if (CanUse64BitLiterals) { - if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) - UseLit64 = false; - else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64) - UseLit64 = Hi_32(Literal64) == 0; + // Invalid instruction codes may contain literals for inline-only + // operands, so we support them here as well. + int64_t Val = Literal; + bool UseLit = false; + switch (OpDesc.OperandType) { + default: + llvm_unreachable("Unexpected operand type!"); + case AMDGPU::OPERAND_REG_IMM_BF16: + case AMDGPU::OPERAND_REG_INLINE_C_BF16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: + UseLit = AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi); + break; + case AMDGPU::OPERAND_REG_IMM_V2BF16: + UseLit = AMDGPU::isInlinableLiteralV2BF16(Val); + break; + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + UseLit = AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi); + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: + UseLit = AMDGPU::isInlinableLiteralV2F16(Val); + break; + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + break; + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + UseLit = AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi); + break; + case AMDGPU::OPERAND_REG_IMM_V2INT16: + UseLit = AMDGPU::isInlinableLiteralV2I16(Val); + break; + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_KIMM32: + UseLit = AMDGPU::isInlinableLiteral32(Val, HasInv2Pi); + break; + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + Val <<= 32; + break; + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + UseLit = AMDGPU::isInlinableLiteral64(Val, HasInv2Pi); + break; + case MCOI::OPERAND_REGISTER: + // TODO: Disassembling V_DUAL_FMAMK_F32_X_FMAMK_F32_gfx11 hits + // decoding a literal in a position of a register operand. Give + // it special handling in the caller, decodeImmOperands(), instead + // of quietly allowing it here. + break; } - return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( - LitModifier::Lit64, Val, getContext())) - : MCOperand::createImm(Val); + return UseLit ? 
MCOperand::createExpr(AMDGPUMCExpr::createLit( + LitModifier::Lit, Val, getContext())) + : MCOperand::createImm(Val); } -MCOperand -AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const { +MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const { assert(STI.hasFeature(AMDGPU::Feature64BitLiterals)); if (!HasLiteral) { @@ -1606,25 +1650,13 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const { Twine(Bytes.size())); } HasLiteral = true; - Literal64 = eatBytes<uint64_t>(Bytes); - } - - bool UseLit64 = false; - const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); - const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()]; - if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) { - UseLit64 = false; - } else { - assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64); - UseLit64 = Hi_32(Literal64) == 0; + Literal = eatBytes<uint64_t>(Bytes); } + bool UseLit64 = Hi_32(Literal) == 0; return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( - LitModifier::Lit64, Literal64, getContext())) - : MCOperand::createImm(Literal64); + LitModifier::Lit64, Literal, getContext())) + : MCOperand::createImm(Literal); } MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { @@ -1913,7 +1945,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const MCInst &Inst, return MCOperand::createImm(Val); if (Val == LITERAL64_CONST && STI.hasFeature(AMDGPU::Feature64BitLiterals)) { - return decodeLiteral64Constant(Inst); + return decodeLiteral64Constant(); } switch (Width) { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 2751857..d103d79 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -44,8 +44,7 @@ private: const unsigned HwModeRegClass; const unsigned TargetMaxInstBytes; mutable ArrayRef<uint8_t> Bytes; - mutable uint32_t Literal; - mutable uint64_t Literal64; + mutable uint64_t Literal; mutable bool HasLiteral; mutable std::optional<bool> EnableWavefrontSize32; unsigned CodeObjectVersion; @@ -144,9 +143,8 @@ public: MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const; MCOperand decodeMandatoryLiteral64Constant(uint64_t Imm) const; MCOperand decodeLiteralConstant(const MCInstrDesc &Desc, - const MCOperandInfo &OpDesc, - bool ExtendFP64) const; - MCOperand decodeLiteral64Constant(const MCInst &Inst) const; + const MCOperandInfo &OpDesc) const; + MCOperand decodeLiteral64Constant() const; MCOperand decodeSrcOp(const MCInst &Inst, unsigned Width, unsigned Val) const; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 6de59be..8ea64d1 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -3711,6 +3711,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +// Additional aliases for global load transpose instructions. 
+def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>; + defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 979a8b0..4b22c68 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/RegisterPressure.h" #include <algorithm> +#include <array> namespace llvm { @@ -45,7 +46,7 @@ struct GCNRegPressure { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; } - void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + void clear() { Value.fill(0); } unsigned getNumRegs(RegKind Kind) const { assert(Kind < TOTAL_KINDS); @@ -127,9 +128,7 @@ struct GCNRegPressure { bool less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; - bool operator==(const GCNRegPressure &O) const { - return std::equal(&Value[0], &Value[ValueArraySize], O.Value); - } + bool operator==(const GCNRegPressure &O) const { return Value == O.Value; } bool operator!=(const GCNRegPressure &O) const { return !(*this == O); @@ -160,7 +159,7 @@ private: /// Pressure for all register kinds (first all regular registers kinds, then /// all tuple register kinds). - unsigned Value[ValueArraySize]; + std::array<unsigned, ValueArraySize> Value; static unsigned getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 58482ea..9fbf9e5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -69,6 +69,12 @@ static cl::opt<bool> GCNTrackers( cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false)); +static cl::opt<unsigned> PendingQueueLimit( + "amdgpu-scheduler-pending-queue-limit", cl::Hidden, + cl::desc( + "Max (Available+Pending) size to inspect pending queue (0 disables)"), + cl::init(256)); + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) #define DUMP_MAX_REG_PRESSURE static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler( @@ -335,17 +341,52 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, } } +static bool shouldCheckPending(SchedBoundary &Zone, + const TargetSchedModel *SchedModel) { + bool HasBufferedModel = + SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize(); + unsigned Combined = Zone.Available.size() + Zone.Pending.size(); + return Combined <= PendingQueueLimit && HasBufferedModel; +} + +static SUnit *pickOnlyChoice(SchedBoundary &Zone, + const TargetSchedModel *SchedModel) { + // pickOnlyChoice() releases pending instructions and checks for new hazards. 
+ SUnit *OnlyChoice = Zone.pickOnlyChoice(); + if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty()) + return OnlyChoice; + + return nullptr; +} + +void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current, + const SchedCandidate &Preferred) { + LLVM_DEBUG({ + dbgs() << "Prefer:\t\t"; + DAG->dumpNode(*Preferred.SU); + + if (Current.SU) { + dbgs() << "Not:\t"; + DAG->dumpNode(*Current.SU); + } + + dbgs() << "Reason:\t\t"; + traceCandidate(Preferred); + }); +} + // This function is mostly cut and pasted from // GenericScheduler::pickNodeFromQueue() void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand, + SchedCandidate &Cand, bool &IsPending, bool IsBottomUp) { const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; + IsPending = false; if (DAG->isTrackingPressure()) { if (!GCNTrackers) { SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; @@ -358,8 +399,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, VGPRPressure = T->getPressure().getArchVGPRNum(); } } - ReadyQueue &Q = Zone.Available; - for (SUnit *SU : Q) { + LLVM_DEBUG(dbgs() << "Available Q:\n"); + ReadyQueue &AQ = Zone.Available; + for (SUnit *SU : AQ) { SchedCandidate TryCand(ZonePolicy); initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, @@ -371,27 +413,55 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, // Initialize resource delta if needed in case future heuristics query it. if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); + LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); Cand.setBest(TryCand); - LLVM_DEBUG(traceCandidate(Cand)); + } else { + printCandidateDecision(TryCand, Cand); + } + } + + if (!shouldCheckPending(Zone, SchedModel)) + return; + + LLVM_DEBUG(dbgs() << "Pending Q:\n"); + ReadyQueue &PQ = Zone.Pending; + for (SUnit *SU : PQ) { + + SchedCandidate TryCand(ZonePolicy); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, + VGPRPressure, IsBottomUp); + // Pass SchedBoundary only when comparing nodes from the same boundary. + SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; + tryPendingCandidate(Cand, TryCand, ZoneArg); + if (TryCand.Reason != NoCand) { + // Initialize resource delta if needed in case future heuristics query it. + if (TryCand.ResDelta == SchedResourceDelta()) + TryCand.initResourceDelta(Zone.DAG, SchedModel); + LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); + IsPending = true; + Cand.setBest(TryCand); + } else { + printCandidateDecision(TryCand, Cand); } } } // This function is mostly cut and pasted from // GenericScheduler::pickNodeBidirectional() -SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { +SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode, + bool &PickedPending) { // Schedule as far as possible in the direction of no choice. This is most // efficient, but also provides the best heuristics for CriticalPSets. 
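// Note: a miniature of the two-tier scan in pickNodeFromQueue() above: the
// available queue always gets the full heuristics, the pending queue gets the
// reduced tryPendingCandidate() set, and the caller learns via an out
// parameter that the winner is not ready yet (so it must advance the cycle
// past hazards before scheduling it). Names here are illustrative:

#include <vector>

struct MiniCand {
  int Score = -1;
  bool isValid() const { return Score >= 0; }
};

MiniCand pickWithPending(const std::vector<MiniCand> &Available,
                         const std::vector<MiniCand> &Pending,
                         bool CheckPending, bool &PickedPending) {
  MiniCand Best;
  PickedPending = false;
  for (const MiniCand &C : Available) // full heuristics would run here
    if (!Best.isValid() || C.Score > Best.Score)
      Best = C;
  if (!CheckPending) // shouldCheckPending() analogue
    return Best;
  for (const MiniCand &C : Pending) // reduced heuristics run here
    if (C.Score > Best.Score) {
      Best = C;
      PickedPending = true; // caller bumps cycles / resolves hazards
    }
  return Best;
}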
- if (SUnit *SU = Bot.pickOnlyChoice()) { + if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) { IsTopNode = false; return SU; } - if (SUnit *SU = Top.pickOnlyChoice()) { + if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) { IsTopNode = true; return SU; } - // Set the bottom-up policy based on the state of the current bottom zone and - // the instructions outside the zone, including the top zone. + // Set the bottom-up policy based on the state of the current bottom zone + // and the instructions outside the zone, including the top zone. CandPolicy BotPolicy; setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); // Set the top-down policy based on the state of the current top zone and @@ -399,12 +469,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { CandPolicy TopPolicy; setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + bool BotPending = false; // See if BotCand is still valid (because we previously scheduled from Top). LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); if (!BotCand.isValid() || BotCand.SU->isScheduled || BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, + BotPending, /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { @@ -414,6 +486,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { SchedCandidate TCand; TCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, + BotPending, /*IsBottomUp=*/true); assert(TCand.SU == BotCand.SU && "Last pick result should correspond to re-picking right now"); @@ -421,12 +494,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { #endif } + bool TopPending = false; // Check if the top Q has a better candidate. LLVM_DEBUG(dbgs() << "Picking from Top:\n"); if (!TopCand.isValid() || TopCand.SU->isScheduled || TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, + TopPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { @@ -436,6 +511,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { SchedCandidate TCand; TCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, + TopPending, /*IsBottomUp=*/false); assert(TCand.SU == TopCand.SU && "Last pick result should correspond to re-picking right now"); @@ -446,12 +522,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { // Pick best from BotCand and TopCand. LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); - SchedCandidate Cand = BotCand; - TopCand.Reason = NoCand; - tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { - Cand.setBest(TopCand); + SchedCandidate Cand = BotPending ? TopCand : BotCand; + SchedCandidate TryCand = BotPending ? 
BotCand : TopCand; + PickedPending = BotPending && TopPending; + + TryCand.Reason = NoCand; + if (BotPending || TopPending) { + PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr); + } else { + tryCandidate(Cand, TryCand, nullptr); + } + + if (TryCand.Reason != NoCand) { + Cand.setBest(TryCand); + } + LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); IsTopNode = Cand.AtTop; @@ -466,35 +551,55 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); return nullptr; } + bool PickedPending; SUnit *SU; do { + PickedPending = false; if (RegionPolicy.OnlyTopDown) { - SU = Top.pickOnlyChoice(); + SU = pickOnlyChoice(Top, SchedModel); if (!SU) { CandPolicy NoPolicy; TopCand.reset(NoPolicy); pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, + PickedPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find a candidate"); SU = TopCand.SU; } IsTopNode = true; } else if (RegionPolicy.OnlyBottomUp) { - SU = Bot.pickOnlyChoice(); + SU = pickOnlyChoice(Bot, SchedModel); if (!SU) { CandPolicy NoPolicy; BotCand.reset(NoPolicy); pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, + PickedPending, /*IsBottomUp=*/true); assert(BotCand.Reason != NoCand && "failed to find a candidate"); SU = BotCand.SU; } IsTopNode = false; } else { - SU = pickNodeBidirectional(IsTopNode); + SU = pickNodeBidirectional(IsTopNode, PickedPending); } } while (SU->isScheduled); + if (PickedPending) { + unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle; + SchedBoundary &Zone = IsTopNode ? Top : Bot; + unsigned CurrentCycle = Zone.getCurrCycle(); + if (ReadyCycle > CurrentCycle) + Zone.bumpCycle(ReadyCycle); + + // FIXME: checkHazard() doesn't give information about which cycle the + // hazard will resolve so just keep bumping the cycle by 1. This could be + // made more efficient if checkHazard() returned more details. + while (Zone.checkHazard(SU)) + Zone.bumpCycle(Zone.getCurrCycle() + 1); + + Zone.releasePending(); + } + if (SU->isTopReady()) Top.removeReady(SU); if (SU->isBottomReady()) @@ -540,6 +645,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const { return *std::next(CurrentStage); } +bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return TryCand.Reason != NoCand; + + // Avoid exceeding the target's limit. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return TryCand.Reason != NoCand; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, TryCand, Cand, + ResourceDemand)) + return TryCand.Reason != NoCand; + } + + return false; +} + GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 8ea4267..975781f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -44,17 +44,32 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID); /// heuristics to determine excess/critical pressure sets. class GCNSchedStrategy : public GenericScheduler { protected: - SUnit *pickNodeBidirectional(bool &IsTopNode); + SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending); void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, - SchedCandidate &Cand, bool IsBottomUp); + SchedCandidate &Cand, bool &IsPending, + bool IsBottomUp); void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp); + /// Evaluates instructions in the pending queue using a subset of scheduling + /// heuristics. + /// + /// Instructions that cannot be issued due to hardware constraints are placed + /// in the pending queue rather than the available queue, making them normally + /// invisible to scheduling heuristics. However, in certain scenarios (such as + /// avoiding register spilling), it may be beneficial to consider scheduling + /// these not-yet-ready instructions. + bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; + + void printCandidateDecision(const SchedCandidate &Current, + const SchedCandidate &Preferred); + std::vector<unsigned> Pressure; std::vector<unsigned> MaxPressure; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..09ef6ac 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -45,6 +45,9 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom); + // 32-bit ABS is legal for AMDGPU except for R600 + setOperationAction(ISD::ABS, MVT::i32, Expand); + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. 
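As an aside on the ISD::ABS change above: marking the node Expand defers to generic legalization, which typically emits a branchless sign-mask sequence. A hedged plain-C++ illustration of that identity (not the actual R600 DAG output):

#include <cassert>
#include <cstdint>

// Y = X >> 31 (arithmetic shift), abs(X) = (X + Y) ^ Y. The addition is
// done in uint32_t so the INT32_MIN case wraps instead of overflowing,
// matching two's-complement hardware behavior.
int32_t expandedAbs(int32_t X) {
  uint32_t Y = static_cast<uint32_t>(X >> 31); // 0 or 0xFFFFFFFF
  return static_cast<int32_t>((static_cast<uint32_t>(X) + Y) ^ Y);
}

int main() {
  assert(expandedAbs(-5) == 5 && expandedAbs(7) == 7 && expandedAbs(0) == 0);
  return 0;
}

The loop that follows implements the custom extload handling that the preceding comment describes.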
for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d516330..50447f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9072,6 +9072,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, MachineOperand &Src1 = Inst.getOperand(2); const DebugLoc &DL = Inst.getDebugLoc(); + if (ST.useRealTrue16Insts()) { + Register SrcReg0, SrcReg1; + if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) { + SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0); + } else { + SrcReg0 = Src0.getReg(); + } + + if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) { + SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1); + } else { + SrcReg1 = Src1.getReg(); + } + + bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass); + bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass); + + auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg); + switch (Inst.getOpcode()) { + case AMDGPU::S_PACK_LL_B32_B16: + NewMI + .addReg(SrcReg0, 0, + isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, 0, + isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_LH_B32_B16: + NewMI + .addReg(SrcReg0, 0, + isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, 0, AMDGPU::hi16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_HL_B32_B16: + NewMI.addReg(SrcReg0, 0, AMDGPU::hi16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, 0, + isSrc1Reg16 ? 
AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_HH_B32_B16: + NewMI.addReg(SrcReg0, 0, AMDGPU::hi16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, 0, AMDGPU::hi16) + .addImm(AMDGPU::hi16); + break; + default: + llvm_unreachable("unhandled s_pack_* instruction"); + } + + MachineOperand &Dest = Inst.getOperand(0); + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + return; + } + switch (Inst.getOpcode()) { case AMDGPU::S_PACK_LL_B32_B16: { Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e979eeb..df27ec1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -879,6 +879,11 @@ public: MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; } + bool isMFMA(uint16_t Opcode) const { + return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64; + } + static bool isDOT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; } @@ -895,6 +900,10 @@ public: return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI); } + bool isMFMAorWMMA(uint16_t Opcode) const { + return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode); + } + static bool isSWMMAC(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC; } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index c684f9e..7431e11 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -47,9 +47,6 @@ private: const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - // Check if the machine instruction being processed is a supported packed - // instruction. - bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. void collectUnpackingCandidates(MachineInstr &BeginMI, @@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return true; } -// If support is extended to new operations, add tests in -// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. 
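The predicate removed below becomes redundant once the unpacked-opcode mapping reports unsupported instructions through a sentinel, as the later collectUnpackingCandidates hunk does. A hedged sketch of that convention (the opcode values are placeholders, not real AMDGPU encodings):

#include <cstdint>
#include <limits>

// The mapping helper returns uint16_t's max value for opcodes it cannot
// unpack, so "is this supported" and "what is the unpacked opcode"
// collapse into a single query.
constexpr uint16_t NoUnpackedOpcode = std::numeric_limits<uint16_t>::max();

uint16_t mapToUnpackedOpcodeSketch(uint16_t PackedOpcode) {
  switch (PackedOpcode) {
  case 1:       // stand-in for V_PK_ADD_F32
    return 101; // stand-in for the unpacked add opcode
  case 2:       // stand-in for V_PK_MUL_F32
    return 102; // stand-in for the unpacked mul opcode
  default:
    return NoUnpackedOpcode;
  }
}

bool isUnpackableSketch(uint16_t PackedOpcode) {
  return mapToUnpackedOpcodeSketch(PackedOpcode) != NoUnpackedOpcode;
}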
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { - if (!TII->isNeverCoissue(MI)) - return false; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("Fully covered switch"); -} - bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); @@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates( for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; + uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr); + bool IsUnpackable = + !(UnpackedOpCode == std::numeric_limits<uint16_t>::max()); if (Instr.isMetaInstruction()) continue; if ((Instr.isTerminator()) || - (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) || + (TII->isNeverCoissue(Instr) && !IsUnpackable) || (SIInstrInfo::modifiesModeRegister(Instr) && Instr.modifiesRegister(AMDGPU::EXEC, TRI))) return; @@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) return; } - if (!isUnpackingSupportedInstr(Instr)) + if (!IsUnpackable) continue; if (canUnpackingClobberRegister(Instr)) @@ -654,7 +637,6 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (TotalCyclesBetweenCandidates < NumMFMACycles - 1) InstrsToUnpack.insert(&Instr); } - return; } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { @@ -681,7 +663,6 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { HiDstOp.setIsRenamable(DstOp.isRenamable()); I.eraseFromParent(); - return; } MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, @@ -689,8 +670,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, bool IsHiBits) { MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0); - const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1); + const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0); + const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1); Register DstReg = I.getOperand(0).getReg(); unsigned OpCode = I.getOpcode(); Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1) @@ -704,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1); - addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2); + addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0); + addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1); if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - const MachineOperand *SrcMO3 = + const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src2); unsigned Src2Mods = TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm(); - addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3); + addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2); } NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 @@ -789,9 +770,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // TODO: Fold this into previous block, if possible. 
Evaluate and handle any // side effects. + + // Perform the extra MF scans only for supported archs + if (!ST.hasGFX940Insts()) + return Changed; for (MachineBasicBlock &MBB : MF) { - // Unpack packed instructions overlapped by MFMAs. This allows the compiler - // to co-issue unpacked instructions with MFMA + // Unpack packed instructions overlapped by MFMAs. This allows the + // compiler to co-issue unpacked instructions with MFMA auto SchedModel = TII->getSchedModel(); SetVector<MachineInstr *> InstrsToUnpack; for (auto &MI : make_early_inc_range(MBB.instrs())) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a01a5fd..5e3195b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1697,9 +1697,6 @@ LLVM_READNONE bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE -bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi); - -LLVM_READNONE bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi); LLVM_READNONE diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 9945ecc..0d7b6d1 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -161,8 +161,8 @@ namespace { friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) { return TE.PseudoOpc < PseudoOpc; } - friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc, - const NEONLdStTableEntry &TE) { + [[maybe_unused]] friend bool operator<(unsigned PseudoOpc, + const NEONLdStTableEntry &TE) { return PseudoOpc < TE.PseudoOpc; } }; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 67ea2dd..35e1127 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21287,21 +21287,28 @@ bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const { } void ARMTargetLowering::insertSSPDeclarations(Module &M) const { + // MSVC CRT provides functionalities for stack protection. RTLIB::LibcallImpl SecurityCheckCookieLibcall = getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); - if (SecurityCheckCookieLibcall == RTLIB::Unsupported) - return TargetLowering::insertSSPDeclarations(M); - // MSVC CRT has a global variable holding security cookie. - M.getOrInsertGlobal("__security_cookie", - PointerType::getUnqual(M.getContext())); + RTLIB::LibcallImpl SecurityCookieVar = + getLibcallImpl(RTLIB::STACK_CHECK_GUARD); + if (SecurityCheckCookieLibcall != RTLIB::Unsupported && + SecurityCookieVar != RTLIB::Unsupported) { + // MSVC CRT has a global variable holding security cookie. + M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar), + PointerType::getUnqual(M.getContext())); - // MSVC CRT has a function to validate security cookie. - FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( - getLibcallImplName(SecurityCheckCookieLibcall), - Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext())); - if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) - F->addParamAttr(0, Attribute::AttrKind::InReg); + // MSVC CRT has a function to validate security cookie. 
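The declarations materialized in this hunk follow the MSVC CRT stack-protector contract; a hedged sketch of that contract as plain C++ declarations (the real symbols are provided by the CRT, and the InReg parameter attribute below reflects the register-passing convention the CRT check expects):

// Illustrative only: what the module-level declarations amount to.
extern "C" {
// Global canary value, initialized by the CRT at startup.
extern void *__security_cookie;
// Called before returning; fails fast if the stack canary was clobbered.
void __security_check_cookie(void *Cookie);
}

The code that follows creates exactly these two declarations through the Module API, then falls through to the generic TargetLowering declarations.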
+ FunctionCallee SecurityCheckCookie = + M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall), + Type::getVoidTy(M.getContext()), + PointerType::getUnqual(M.getContext())); + if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) + F->addParamAttr(0, Attribute::AttrKind::InReg); + } + + TargetLowering::insertSSPDeclarations(M); } Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 228114c..44c4830 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -57,6 +57,7 @@ def ResBindTy : DXILOpParamType; def ResPropsTy : DXILOpParamType; def SplitDoubleTy : DXILOpParamType; def BinaryWithCarryTy : DXILOpParamType; +def DimensionsTy : DXILOpParamType; class DXILOpClass; @@ -901,6 +902,13 @@ def CheckAccessFullyMapped : DXILOp<71, checkAccessFullyMapped> { let attributes = [Attributes<DXIL1_0, [ReadOnly]>]; } +def GetDimensions : DXILOp<72, getDimensions> { + let Doc = "gets the dimensions of a buffer or texture"; + let arguments = [HandleTy, Int32Ty]; + let result = DimensionsTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def Barrier : DXILOp<80, barrier> { let Doc = "inserts a memory barrier in the shader"; let intrinsics = [ diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp index 1aed8f9..944b2e6 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp @@ -261,6 +261,12 @@ static StructType *getBinaryWithCarryType(LLVMContext &Context) { return StructType::create({Int32Ty, Int1Ty}, "dx.types.i32c"); } +static StructType *getDimensionsType(LLVMContext &Ctx) { + Type *Int32Ty = Type::getInt32Ty(Ctx); + return getOrCreateStructType("dx.types.Dimensions", + {Int32Ty, Int32Ty, Int32Ty, Int32Ty}, Ctx); +} + static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx, Type *OverloadTy) { switch (Kind) { @@ -318,6 +324,8 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx, return getSplitDoubleType(Ctx); case OpParamType::BinaryWithCarryTy: return getBinaryWithCarryType(Ctx); + case OpParamType::DimensionsTy: + return getDimensionsType(Ctx); } llvm_unreachable("Invalid parameter kind"); return nullptr; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 610d8b6..e46a393 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -627,6 +627,28 @@ public: }); } + [[nodiscard]] bool lowerGetDimensionsX(Function &F) { + IRBuilder<> &IRB = OpBuilder.getIRB(); + Type *Int32Ty = IRB.getInt32Ty(); + + return replaceFunction(F, [&](CallInst *CI) -> Error { + IRB.SetInsertPoint(CI); + Value *Handle = + createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType()); + Value *Undef = UndefValue::get(Int32Ty); + + Expected<CallInst *> OpCall = OpBuilder.tryCreateOp( + OpCode::GetDimensions, {Handle, Undef}, CI->getName(), Int32Ty); + if (Error E = OpCall.takeError()) + return E; + Value *Dim = IRB.CreateExtractValue(*OpCall, 0); + + CI->replaceAllUsesWith(Dim); + CI->eraseFromParent(); + return Error::success(); + }); + } + [[nodiscard]] bool lowerGetPointer(Function &F) { // These should have already been handled in DXILResourceAccess, so we can // just clean up the dead prototype. 
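dx.op.getDimensions always returns the full four-field dx.types.Dimensions struct defined earlier, so lowerGetDimensionsX calls the op with an undef second argument (the mip level, unused for buffers) and projects out only the first field. A hedged C++ analogy of that projection (GetDimensionsOp stands in for the emitted DXIL call):

#include <cstdint>

// Analog of the %dx.types.Dimensions return value.
struct Dimensions {
  uint32_t X, Y, Z, NumLevels;
};

// The scalar dx.resource.getdimensions.x intrinsic reduces to: invoke the
// full op, then extract element 0 and discard the rest.
uint32_t getDimensionsX(Dimensions (*GetDimensionsOp)(uint32_t),
                        uint32_t Handle) {
  return GetDimensionsOp(Handle).X;
}

The dispatch hunk that follows wires the new intrinsic to this lowering.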
@@ -934,6 +956,9 @@ public: case Intrinsic::dx_resource_updatecounter: HasErrors |= lowerUpdateCounter(F); break; + case Intrinsic::dx_resource_getdimensions_x: + HasErrors |= lowerGetDimensionsX(F); + break; case Intrinsic::ctpop: HasErrors |= lowerCtpopToCountBits(F); break; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 82c43ff..26a8728 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1165,12 +1165,15 @@ void DXILBitcodeWriter::writeValueSymbolTableForwardDecl() {} /// Returns the bit offset to backpatch with the location of the real VST. void DXILBitcodeWriter::writeModuleInfo() { // Emit various pieces of data attached to a module. - if (!M.getTargetTriple().empty()) - writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, - M.getTargetTriple().str(), 0 /*TODO*/); - const std::string &DL = M.getDataLayoutStr(); - if (!DL.empty()) - writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/); + + // We need to hardcode a triple and datalayout that's compatible with the + // historical DXIL triple and datalayout from DXC. + StringRef Triple = "dxil-ms-dx"; + StringRef DL = "e-m:e-p:32:32-i1:8-i8:8-i16:32-i32:32-i64:64-" + "f16:32-f32:32-f64:64-n8:16:32:64"; + writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, Triple, 0 /*TODO*/); + writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/); + if (!M.getModuleInlineAsm().empty()) writeStringRecord(Stream, bitc::MODULE_CODE_ASM, M.getModuleInlineAsm(), 0 /*TODO*/); diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp index 1eb03bf..725f2b1 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp @@ -149,11 +149,6 @@ public: std::string Data; llvm::raw_string_ostream OS(Data); - Triple OriginalTriple = M.getTargetTriple(); - // Set to DXIL triple when write to bitcode. - // Only the output bitcode need to be DXIL triple. - M.setTargetTriple(Triple("dxil-ms-dx")); - // Perform late legalization of lifetime intrinsics that would otherwise // fail the Module Verifier if performed in an earlier pass legalizeLifetimeIntrinsics(M); @@ -165,9 +160,6 @@ public: // not-so-legal legalizations removeLifetimeIntrinsics(M); - // Recover triple. 
- M.setTargetTriple(OriginalTriple); - Constant *ModuleConstant = ConstantDataArray::get(M.getContext(), arrayRefFromStringRef(Data)); auto *GV = new llvm::GlobalVariable(M, ModuleConstant->getType(), true, diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 5f180d6..3bd6ed4 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -66,6 +66,10 @@ public: void remapInstruction(MCInst &Instr) const; + Expected<bool> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address) const override; + private: bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address, uint64_t &BytesToSkip, raw_ostream &CS) const; @@ -567,6 +571,18 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, return Result; } +Expected<bool> HexagonDisassembler::onSymbolStart(SymbolInfoTy &Symbol, + uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address) const { + // At the start of a symbol, force a fresh packet by resetting any + // in-progress bundle state. This prevents packets from straddling label + // boundaries when data (e.g. jump tables) appears in between. + Size = 0; + resetBundle(); + return true; +} + static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, ArrayRef<MCPhysReg> Table) { if (RegNo < Table.size()) { diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 52e6b0b..68f5312 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -174,8 +174,8 @@ namespace { const TargetRegisterInfo *TRI; }; - raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) - LLVM_ATTRIBUTE_UNUSED; + [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const PrintRegSet &P); raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) { OS << '{'; for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R)) diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 14b6bb3..9087f9d 100644 --- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -272,15 +272,14 @@ namespace { OS << *I << ' ' << **I << '\n'; } - raw_ostream &operator<< (raw_ostream &OS, - const NodeVect &S) LLVM_ATTRIBUTE_UNUSED; + [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const NodeVect &S); raw_ostream &operator<< (raw_ostream &OS, const NodeVect &S) { dump_node_container(OS, S); return OS; } - raw_ostream &operator<< (raw_ostream &OS, - const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED; + [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const NodeToUsesMap &M); raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){ for (const auto &I : M) { const UseSet &Us = I.second; @@ -914,9 +913,8 @@ namespace { const NodeToValueMap ⤅ }; - raw_ostream &operator<< (raw_ostream &OS, - const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED ; - raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) { + [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const LocationAsBlock &Loc) { for (const auto &I : Loc.Map) { OS << I.first << " -> "; if (BasicBlock *B = cast_or_null<BasicBlock>(I.second)) diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index 14a7ae7..3900aac 
100644 --- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -132,8 +132,7 @@ namespace { const TargetRegisterInfo &TRI; friend raw_ostream &operator<< (raw_ostream &OS, const PrintFP &P); }; - raw_ostream &operator<<(raw_ostream &OS, - const PrintFP &P) LLVM_ATTRIBUTE_UNUSED; + [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P); raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) { OS << "{ SplitB:" << PrintMB(P.FP.SplitB) << ", PredR:" << printReg(P.FP.PredR, &P.TRI) diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index f9fdab4..9c81e96 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -51,11 +51,11 @@ private: const TargetRegisterInfo &TRI; }; - raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) - LLVM_ATTRIBUTE_UNUSED; - raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) { - return OS << printReg(PR.Reg.Reg, &PR.TRI, PR.Reg.SubReg); - } +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const PrintRegister &PR); +raw_ostream &operator<<(raw_ostream &OS, const PrintRegister &PR) { + return OS << printReg(PR.Reg.Reg, &PR.TRI, PR.Reg.SubReg); +} class HexagonGenPredicate : public MachineFunctionPass { public: diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index bfea50e..6b48a21 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -422,12 +422,12 @@ static MCTargetStreamer *createHexagonNullTargetStreamer(MCStreamer &S) { return new HexagonTargetStreamer(S); } -static void LLVM_ATTRIBUTE_UNUSED clearFeature(MCSubtargetInfo* STI, uint64_t F) { +[[maybe_unused]] static void clearFeature(MCSubtargetInfo *STI, uint64_t F) { if (STI->hasFeature(F)) STI->ToggleFeature(F); } -static bool LLVM_ATTRIBUTE_UNUSED checkFeature(MCSubtargetInfo* STI, uint64_t F) { +[[maybe_unused]] static bool checkFeature(MCSubtargetInfo *STI, uint64_t F) { return STI->hasFeature(F); } diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index 1ce8d7e3..df0c8c1 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -264,9 +264,10 @@ public: } // end anonymous namespace -static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED; +[[maybe_unused]] static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 7f1ff45..2fd7327 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -3176,9 +3176,10 @@ static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT, F64Regs); } -static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED; +[[maybe_unused]] static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + 
ISD::ArgFlagsTy ArgFlags, Type *OrigTy, + CCState &State); #include "MipsGenCallingConv.inc" diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 272c21f..2f1a7ad 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -749,7 +749,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTruncStoreAction(VT, MVT::i1, Expand); } - // Disable generations of extload/truncstore for v2i16/v2i8. The generic + // Disable generations of extload/truncstore for v2i32/v2i16/v2i8. The generic // expansion for these nodes when they are unaligned is incorrect if the // type is a vector. // @@ -757,7 +757,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // TargetLowering::expandUnalignedLoad/Store. setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, MVT::v2i8, Expand); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32, + {MVT::v2i8, MVT::v2i16}, Expand); setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); // Register custom handling for illegal type loads/stores. We'll try to custom // lower almost all illegal types and logic in the lowering will discard cases diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 8851a0f..e857b2d 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -3356,10 +3356,10 @@ bool RISCVAsmParser::parseDirectiveAttribute() { bool isValidInsnFormat(StringRef Format, const MCSubtargetInfo &STI) { return StringSwitch<bool>(Format) - .Cases("r", "r4", "i", "b", "sb", "u", "j", "uj", "s", true) - .Cases("cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj", + .Cases({"r", "r4", "i", "b", "sb", "u", "j", "uj", "s"}, true) + .Cases({"cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj"}, STI.hasFeature(RISCV::FeatureStdExtZca)) - .Cases("qc.eai", "qc.ei", "qc.eb", "qc.ej", "qc.es", + .Cases({"qc.eai", "qc.ei", "qc.eb", "qc.ej", "qc.es"}, !STI.hasFeature(RISCV::Feature64Bit)) .Default(false); } diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 662d3f6..b1794b7 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -717,6 +717,18 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) .clampScalar(0, sXLen, sXLen) .lower(); + LegalityPredicate InsertVectorEltPred = [=](const LegalityQuery &Query) { + LLT VecTy = Query.Types[0]; + LLT EltTy = Query.Types[1]; + return VecTy.getElementType() == EltTy; + }; + + getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) + .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST), + InsertVectorEltPred, typeIs(2, sXLen))) + .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), InsertVectorEltPred, + typeIs(2, sXLen))); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp index 50730c6..ab93bba 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -43,7 +43,7 @@ const llvm::StringRef RISCVLMULInstrument::DESC_NAME = "RISCV-LMUL"; bool 
RISCVLMULInstrument::isDataValid(llvm::StringRef Data) { // Return true if not one of the valid LMUL strings return StringSwitch<bool>(Data) - .Cases("M1", "M2", "M4", "M8", "MF2", "MF4", "MF8", true) + .Cases({"M1", "M2", "M4", "M8", "MF2", "MF4", "MF8"}, true) .Default(false); } diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 5ceb477..19992e6 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -695,6 +695,9 @@ def HasStdExtZvfbfa : Predicate<"Subtarget->hasStdExtZvfbfa()">, def FeatureStdExtZvfbfmin : RISCVExtension<1, 0, "Vector BF16 Converts", [FeatureStdExtZve32f]>; +def HasStdExtZvfbfmin : Predicate<"Subtarget->hasStdExtZvfbfmin()">, + AssemblerPredicate<(all_of FeatureStdExtZvfbfmin), + "'Zvfbfmin' (Vector BF16 Converts)">; def FeatureStdExtZvfbfwma : RISCVExtension<1, 0, "Vector BF16 widening mul-add", diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7123a2d..169465e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1672,6 +1672,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.useRVVForFixedLengthVectors()) setTargetDAGCombine(ISD::BITCAST); + setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64); + // Disable strict node mutation. IsStrictFPEnabled = true; EnableExtLdPromotion = true; @@ -24828,7 +24830,8 @@ bool RISCVTargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // instruction, as it is usually smaller than the alternative sequence. // TODO: Add vector division? bool OptSize = Attr.hasFnAttr(Attribute::MinSize); - return OptSize && !VT.isVector(); + return OptSize && !VT.isVector() && + VT.getSizeInBits() <= getMaxDivRemBitWidthSupported(); } bool RISCVTargetLowering::preferScalarizeSplat(SDNode *N) const { diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 1b7cb9b..636e31c 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -699,7 +699,8 @@ public: "Can't encode VTYPE for uninitialized or unknown"); if (TWiden != 0) return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt); - return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic); + return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, + AltFmt); } bool hasSEWLMULRatioOnly() const { return SEWLMULRatioOnly; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index ddb53a2..12f776b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3775,11 +3775,13 @@ std::string RISCVInstrInfo::createMIROperandComment( #define CASE_VFMA_OPCODE_VV(OP) \ CASE_VFMA_OPCODE_LMULS_MF4(OP, VV, E16): \ + case CASE_VFMA_OPCODE_LMULS_MF4(OP##_ALT, VV, E16): \ case CASE_VFMA_OPCODE_LMULS_MF2(OP, VV, E32): \ case CASE_VFMA_OPCODE_LMULS_M1(OP, VV, E64) #define CASE_VFMA_SPLATS(OP) \ CASE_VFMA_OPCODE_LMULS_MF4(OP, VFPR16, E16): \ + case CASE_VFMA_OPCODE_LMULS_MF4(OP##_ALT, VFPR16, E16): \ case CASE_VFMA_OPCODE_LMULS_MF2(OP, VFPR32, E32): \ case CASE_VFMA_OPCODE_LMULS_M1(OP, VFPR64, E64) // clang-format on @@ -4003,11 +4005,13 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, #define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16) \ + 
CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP##_ALT, NEWOP##_ALT, VV, E16) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32) \ CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64) #define CASE_VFMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VFPR16, E16) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP##_ALT, NEWOP##_ALT, VFPR16, E16) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VFPR32, E32) \ CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VFPR64, E64) // clang-format on @@ -4469,6 +4473,20 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const { CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2, E32) \ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E16) \ CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E32) \ + +#define CASE_FP_WIDEOP_OPCODE_LMULS_ALT(OP) \ + CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF4, E16): \ + case CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF2, E16): \ + case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M1, E16): \ + case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M2, E16): \ + case CASE_FP_WIDEOP_OPCODE_COMMON(OP, M4, E16) + +#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(OP) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4, E16) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2, E16) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M1, E16) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2, E16) \ + CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E16) // clang-format on MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, @@ -4478,6 +4496,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, switch (MI.getOpcode()) { default: return nullptr; + case CASE_FP_WIDEOP_OPCODE_LMULS_ALT(FWADD_ALT_WV): + case CASE_FP_WIDEOP_OPCODE_LMULS_ALT(FWSUB_ALT_WV): case CASE_FP_WIDEOP_OPCODE_LMULS(FWADD_WV): case CASE_FP_WIDEOP_OPCODE_LMULS(FWSUB_WV): { assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags) && @@ -4494,6 +4514,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI, llvm_unreachable("Unexpected opcode"); CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWADD_WV) CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWSUB_WV) + CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(FWADD_ALT_WV) + CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_ALT(FWSUB_ALT_WV) } // clang-format on diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 65865ce..eb3c9b0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -5862,20 +5862,6 @@ multiclass VPatConversionWF_VF<string intrinsic, string instruction, } } -multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction, - bit isSEWAware = 0> { - foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in - { - defvar fvti = fvtiToFWti.Vti; - defvar fwti = fvtiToFWti.Wti; - let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, - GetVTypePredicates<fwti>.Predicates) in - defm : VPatConversion<intrinsic, instruction, "V", - fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW, - fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>; - } -} - multiclass VPatConversionVI_WF<string intrinsic, string instruction> { foreach vtiToWti = AllWidenableIntToFloatVectors in { defvar vti = vtiToWti.Vti; @@ -5969,20 +5955,6 @@ multiclass VPatConversionVF_WF_RTZ<string intrinsic, string instruction, } } -multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction, - bit isSEWAware = 0> { - foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { - defvar fvti = fvtiToFWti.Vti; - defvar fwti = fvtiToFWti.Wti; - 
let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, - GetVTypePredicates<fwti>.Predicates) in - defm : VPatConversionRoundingMode<intrinsic, instruction, "W", - fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW, - fvti.LMul, fvti.RegClass, fwti.RegClass, - isSEWAware>; - } -} - multiclass VPatCompare_VI<string intrinsic, string inst, ImmLeaf ImmType> { foreach vti = AllIntegerVectors in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index 0be9eab..9358486 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -36,7 +36,7 @@ defm VFWMACCBF16_V : VWMAC_FV_V_F<"vfwmaccbf16", 0b111011>; //===----------------------------------------------------------------------===// // Pseudo instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { +let Predicates = [HasStdExtZvfbfmin] in { defm PseudoVFWCVTBF16_F_F : VPseudoVWCVTD_V; defm PseudoVFNCVTBF16_F_F : VPseudoVNCVTD_W_RM; } @@ -44,10 +44,364 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { let mayRaiseFPException = true, Predicates = [HasStdExtZvfbfwma] in defm PseudoVFWMACCBF16 : VPseudoVWMAC_VV_VF_BF_RM; +defset list<VTypeInfoToWide> AllWidenableIntToBF16Vectors = { + def : VTypeInfoToWide<VI8MF8, VBF16MF4>; + def : VTypeInfoToWide<VI8MF4, VBF16MF2>; + def : VTypeInfoToWide<VI8MF2, VBF16M1>; + def : VTypeInfoToWide<VI8M1, VBF16M2>; + def : VTypeInfoToWide<VI8M2, VBF16M4>; + def : VTypeInfoToWide<VI8M4, VBF16M8>; +} + +multiclass VPseudoVALU_VV_VF_RM_BF16 { + foreach m = MxListF in { + defm "" : VPseudoBinaryFV_VV_RM<m, 16/*sew*/>, + SchedBinary<"WriteVFALUV", "ReadVFALUV", "ReadVFALUV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>, + SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVALU_VF_RM_BF16 { + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>, + SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVFWALU_VV_VF_RM_BF16 { + foreach m = MxListFW in { + defm "" : VPseudoBinaryW_VV_RM<m, sew=16>, + SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxListFW in { + defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>, + SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVFWALU_WV_WF_RM_BF16 { + foreach m = MxListFW in { + defm "" : VPseudoBinaryW_WV_RM<m, sew=16>, + SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + defvar f = SCALAR_F16; + foreach m = f.MxListFW in { + defm "" : VPseudoBinaryW_WF_RM<m, f, sew=f.SEW>, + SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVFMUL_VV_VF_RM_BF16 { + foreach m = MxListF in { + defm "" : VPseudoBinaryFV_VV_RM<m, 16/*sew*/>, + SchedBinary<"WriteVFMulV", "ReadVFMulV", "ReadVFMulV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>, + SchedBinary<"WriteVFMulF", "ReadVFMulV", "ReadVFMulF", m.MX, + 
f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVWMUL_VV_VF_RM_BF16 { + foreach m = MxListFW in { + defm "" : VPseudoBinaryW_VV_RM<m, sew=16>, + SchedBinary<"WriteVFWMulV", "ReadVFWMulV", "ReadVFWMulV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxListFW in { + defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>, + SchedBinary<"WriteVFWMulF", "ReadVFWMulV", "ReadVFWMulF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVMAC_VV_VF_AAXA_RM_BF16 { + foreach m = MxListF in { + defm "" : VPseudoTernaryV_VV_AAXA_RM<m, 16/*sew*/>, + SchedTernary<"WriteVFMulAddV", "ReadVFMulAddV", "ReadVFMulAddV", + "ReadVFMulAddV", m.MX, 16/*sew*/>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoTernaryV_VF_AAXA_RM<m, f, f.SEW>, + SchedTernary<"WriteVFMulAddF", "ReadVFMulAddV", "ReadVFMulAddF", + "ReadVFMulAddV", m.MX, f.SEW>; + } +} + +multiclass VPseudoVWMAC_VV_VF_RM_BF16 { + foreach m = MxListFW in { + defm "" : VPseudoTernaryW_VV_RM<m, sew=16>, + SchedTernary<"WriteVFWMulAddV", "ReadVFWMulAddV", + "ReadVFWMulAddV", "ReadVFWMulAddV", m.MX, 16/*sew*/>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxListFW in { + defm "" : VPseudoTernaryW_VF_RM<m, f, sew=f.SEW>, + SchedTernary<"WriteVFWMulAddF", "ReadVFWMulAddV", + "ReadVFWMulAddF", "ReadVFWMulAddV", m.MX, f.SEW>; + } +} + +multiclass VPseudoVRCP_V_BF16 { + foreach m = MxListF in { + defvar mx = m.MX; + let VLMul = m.value in { + def "_V_" # mx # "_E16" + : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/, + forcePassthruRead=true>; + def "_V_" # mx # "_E16_MASK" + : VPseudoUnaryMask<m.vrclass, m.vrclass>, + RISCVMaskedPseudo<MaskIdx = 2>, + SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/, + forcePassthruRead=true>; + } + } +} + +multiclass VPseudoVRCP_V_RM_BF16 { + foreach m = MxListF in { + defvar mx = m.MX; + let VLMul = m.value in { + def "_V_" # mx # "_E16" + : VPseudoUnaryNoMaskRoundingMode<m.vrclass, m.vrclass>, + SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/, + forcePassthruRead=true>; + def "_V_" # mx # "_E16_MASK" + : VPseudoUnaryMaskRoundingMode<m.vrclass, m.vrclass>, + RISCVMaskedPseudo<MaskIdx = 2>, + SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, 16/*sew*/, + forcePassthruRead=true>; + } + } +} + +multiclass VPseudoVMAX_VV_VF_BF16 { + foreach m = MxListF in { + defm "" : VPseudoBinaryV_VV<m, sew=16>, + SchedBinary<"WriteVFMinMaxV", "ReadVFMinMaxV", "ReadVFMinMaxV", + m.MX, 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF<m, f, f.SEW>, + SchedBinary<"WriteVFMinMaxF", "ReadVFMinMaxV", "ReadVFMinMaxF", + m.MX, f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVSGNJ_VV_VF_BF16 { + foreach m = MxListF in { + defm "" : VPseudoBinaryV_VV<m, sew=16>, + SchedBinary<"WriteVFSgnjV", "ReadVFSgnjV", "ReadVFSgnjV", m.MX, + 16/*sew*/, forcePassthruRead=true>; + } + + defvar f = SCALAR_F16; + foreach m = f.MxList in { + defm "" : VPseudoBinaryV_VF<m, f, f.SEW>, + SchedBinary<"WriteVFSgnjF", "ReadVFSgnjV", "ReadVFSgnjF", m.MX, + f.SEW, forcePassthruRead=true>; + } +} + +multiclass VPseudoVWCVTF_V_BF16 { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxListW in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=8, + TargetConstraintType=3>, + SchedUnary<"WriteVFWCvtIToFV", "ReadVFWCvtIToFV", m.MX, 8/*sew*/, + forcePassthruRead=true>; +} + +multiclass VPseudoVWCVTD_V_BF16 { + 
defvar constraint = "@earlyclobber $rd"; + foreach m = MxListFW in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=16, + TargetConstraintType=3>, + SchedUnary<"WriteVFWCvtFToFV", "ReadVFWCvtFToFV", m.MX, 16/*sew*/, + forcePassthruRead=true>; +} + +multiclass VPseudoVNCVTD_W_BF16 { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxListFW in + defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint, sew=16, + TargetConstraintType=2>, + SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, 16/*sew*/, + forcePassthruRead=true>; +} + +multiclass VPseudoVNCVTD_W_RM_BF16 { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxListFW in + defm _W : VPseudoConversionRoundingMode<m.vrclass, m.wvrclass, m, + constraint, sew=16, + TargetConstraintType=2>, + SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, 16/*sew*/, + forcePassthruRead=true>; +} + +let Predicates = [HasStdExtZvfbfa], AltFmtType = IS_ALTFMT in { +let mayRaiseFPException = true in { +defm PseudoVFADD_ALT : VPseudoVALU_VV_VF_RM_BF16; +defm PseudoVFSUB_ALT : VPseudoVALU_VV_VF_RM_BF16; +defm PseudoVFRSUB_ALT : VPseudoVALU_VF_RM_BF16; +} + +let mayRaiseFPException = true in { +defm PseudoVFWADD_ALT : VPseudoVFWALU_VV_VF_RM_BF16; +defm PseudoVFWSUB_ALT : VPseudoVFWALU_VV_VF_RM_BF16; +defm PseudoVFWADD_ALT : VPseudoVFWALU_WV_WF_RM_BF16; +defm PseudoVFWSUB_ALT : VPseudoVFWALU_WV_WF_RM_BF16; +} + +let mayRaiseFPException = true in +defm PseudoVFMUL_ALT : VPseudoVFMUL_VV_VF_RM_BF16; + +let mayRaiseFPException = true in +defm PseudoVFWMUL_ALT : VPseudoVWMUL_VV_VF_RM_BF16; + +let mayRaiseFPException = true in { +defm PseudoVFMACC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFNMACC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFMSAC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFNMSAC_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFMADD_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFNMADD_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFMSUB_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +defm PseudoVFNMSUB_ALT : VPseudoVMAC_VV_VF_AAXA_RM_BF16; +} + +let mayRaiseFPException = true in { +defm PseudoVFWMACC_ALT : VPseudoVWMAC_VV_VF_RM_BF16; +defm PseudoVFWNMACC_ALT : VPseudoVWMAC_VV_VF_RM_BF16; +defm PseudoVFWMSAC_ALT : VPseudoVWMAC_VV_VF_RM_BF16; +defm PseudoVFWNMSAC_ALT : VPseudoVWMAC_VV_VF_RM_BF16; +} + +let mayRaiseFPException = true in +defm PseudoVFRSQRT7_ALT : VPseudoVRCP_V_BF16; + +let mayRaiseFPException = true in +defm PseudoVFREC7_ALT : VPseudoVRCP_V_RM_BF16; + +let mayRaiseFPException = true in { +defm PseudoVFMIN_ALT : VPseudoVMAX_VV_VF_BF16; +defm PseudoVFMAX_ALT : VPseudoVMAX_VV_VF_BF16; +} + +defm PseudoVFSGNJ_ALT : VPseudoVSGNJ_VV_VF_BF16; +defm PseudoVFSGNJN_ALT : VPseudoVSGNJ_VV_VF_BF16; +defm PseudoVFSGNJX_ALT : VPseudoVSGNJ_VV_VF_BF16; + +let mayRaiseFPException = true in { +defm PseudoVMFEQ_ALT : VPseudoVCMPM_VV_VF; +defm PseudoVMFNE_ALT : VPseudoVCMPM_VV_VF; +defm PseudoVMFLT_ALT : VPseudoVCMPM_VV_VF; +defm PseudoVMFLE_ALT : VPseudoVCMPM_VV_VF; +defm PseudoVMFGT_ALT : VPseudoVCMPM_VF; +defm PseudoVMFGE_ALT : VPseudoVCMPM_VF; +} + +defm PseudoVFCLASS_ALT : VPseudoVCLS_V; + +defm PseudoVFMERGE_ALT : VPseudoVMRG_FM; + +defm PseudoVFMV_V_ALT : VPseudoVMV_F; + +let mayRaiseFPException = true in { +defm PseudoVFWCVT_F_XU_ALT : VPseudoVWCVTF_V_BF16; +defm PseudoVFWCVT_F_X_ALT : VPseudoVWCVTF_V_BF16; + +defm PseudoVFWCVT_F_F_ALT : VPseudoVWCVTD_V_BF16; +} // mayRaiseFPException = true + +let mayRaiseFPException = true in { +let hasSideEffects = 0, 
hasPostISelHook = 1 in { +defm PseudoVFNCVT_XU_F_ALT : VPseudoVNCVTI_W_RM; +defm PseudoVFNCVT_X_F_ALT : VPseudoVNCVTI_W_RM; +} + +defm PseudoVFNCVT_RTZ_XU_F_ALT : VPseudoVNCVTI_W; +defm PseudoVFNCVT_RTZ_X_F_ALT : VPseudoVNCVTI_W; + +defm PseudoVFNCVT_F_F_ALT : VPseudoVNCVTD_W_RM_BF16; + +defm PseudoVFNCVT_ROD_F_F_ALT : VPseudoVNCVTD_W_BF16; +} // mayRaiseFPException = true + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + defvar f = SCALAR_F16; + let HasSEWOp = 1, BaseInstr = VFMV_F_S in + def "PseudoVFMV_" # f.FX # "_S_ALT" : + RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew)>, + Sched<[WriteVMovFS, ReadVMovFS]>; + let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1, + Constraints = "$rd = $passthru" in + def "PseudoVFMV_S_" # f.FX # "_ALT" : + RISCVVPseudo<(outs VR:$rd), + (ins VR:$passthru, f.fprclass:$rs1, AVL:$vl, sew:$sew)>, + Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>; +} + +defm PseudoVFSLIDE1UP_ALT : VPseudoVSLD1_VF<"@earlyclobber $rd">; +defm PseudoVFSLIDE1DOWN_ALT : VPseudoVSLD1_VF; +} // Predicates = [HasStdExtZvfbfa], AltFmtType = IS_ALTFMT + //===----------------------------------------------------------------------===// // Patterns //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { +multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction, + bit isSEWAware = 0> { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in + { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + defm : VPatConversion<intrinsic, instruction, "V", + fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW, + fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>; + } +} + +multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction, + bit isSEWAware = 0> { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + defm : VPatConversionRoundingMode<intrinsic, instruction, "W", + fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW, + fvti.LMul, fvti.RegClass, fwti.RegClass, + isSEWAware>; + } +} + +let Predicates = [HasStdExtZvfbfmin] in { defm : VPatConversionWF_VF_BF<"int_riscv_vfwcvtbf16_f_f_v", "PseudoVFWCVTBF16_F_F", isSEWAware=1>; defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w", @@ -56,7 +410,6 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; - let Predicates = [HasVInstructionsBF16Minimal] in def : Pat<(fwti.Vector (any_riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1), (fvti.Mask VMV0:$vm), @@ -66,18 +419,16 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; - let Predicates = [HasVInstructionsBF16Minimal] in - def : Pat<(fvti.Vector (any_riscv_fpround_vl - (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask VMV0:$vm), VLOpFrag)), - (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, fvti.Log2SEW, TA_MA)>; - let Predicates = [HasVInstructionsBF16Minimal] in + def : Pat<(fvti.Vector (any_riscv_fpround_vl + (fwti.Vector fwti.RegClass:$rs1), + (fwti.Mask VMV0:$vm), VLOpFrag)), + (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + 
(fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, + (fwti.Mask VMV0:$vm), + // Value to indicate no rounding mode change in + // RISCVInsertReadWriteCSR + FRM_DYN, + GPR:$vl, fvti.Log2SEW, TA_MA)>; def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW) (fvti.Vector (IMPLICIT_DEF)), @@ -87,6 +438,130 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { FRM_DYN, fvti.AVL, fvti.Log2SEW, TA_MA)>; } + + defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllBF16Vectors>; + defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER", + AllBF16Vectors, uimm5>; + defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16", + eew=16, vtilist=AllBF16Vectors>; + defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllBF16Vectors, uimm5>; + defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllBF16Vectors, uimm5>; + + foreach fvti = AllBF16Vectors in { + defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM", + fvti.Vector, + fvti.Vector, fvti.Vector, fvti.Mask, + fvti.Log2SEW, fvti.LMul, fvti.RegClass, + fvti.RegClass, fvti.RegClass>; + defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE", + "V"#fvti.ScalarSuffix#"M", + fvti.Vector, + fvti.Vector, fvti.Scalar, fvti.Mask, + fvti.Log2SEW, fvti.LMul, fvti.RegClass, + fvti.RegClass, fvti.ScalarRegClass>; + defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX); + def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$passthru), + (fvti.Vector fvti.RegClass:$rs2), + (fvti.Scalar (fpimm0)), + (fvti.Mask VMV0:$vm), VLOpFrag)), + (instr fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; + + defvar ivti = GetIntVTypeInfo<fvti>.Vti; + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1, + fvti.RegClass:$rs2)), + (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX) + (fvti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), + fvti.AVL, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), + (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), + fvti.RegClass:$rs2)), + (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) + (fvti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), + (SplatFPOp (fvti.Scalar fpimm0)), + fvti.RegClass:$rs2)), + (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) + (fvti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), + (SplatFPOp fvti.ScalarRegClass:$rs1), + fvti.RegClass:$rs2)), + (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) + (fvti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs2, + (fvti.Scalar fvti.ScalarRegClass:$rs1), + (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), + fvti.RegClass:$rs1, + fvti.RegClass:$rs2, + fvti.RegClass:$passthru, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX) + fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), + GPR:$vl, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), + (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), + fvti.RegClass:$rs2, + 
fvti.RegClass:$passthru, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) + fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), + GPR:$vl, fvti.Log2SEW)>; + + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), + (SplatFPOp (fvti.Scalar fpimm0)), + fvti.RegClass:$rs2, + fvti.RegClass:$passthru, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) + fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), + GPR:$vl, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), + (SplatFPOp fvti.ScalarRegClass:$rs1), + fvti.RegClass:$rs2, + fvti.RegClass:$passthru, + VLOpFrag)), + (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) + fvti.RegClass:$passthru, fvti.RegClass:$rs2, + (fvti.Scalar fvti.ScalarRegClass:$rs1), + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector + (riscv_vrgather_vv_vl fvti.RegClass:$rs2, + (ivti.Vector fvti.RegClass:$rs1), + fvti.RegClass:$passthru, + (fvti.Mask VMV0:$vm), + VLOpFrag)), + (!cast<Instruction>("PseudoVRGATHER_VV_"# fvti.LMul.MX#"_E"# fvti.SEW#"_MASK") + fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(fvti.Vector (riscv_vrgather_vx_vl fvti.RegClass:$rs2, GPR:$rs1, + fvti.RegClass:$passthru, + (fvti.Mask VMV0:$vm), + VLOpFrag)), + (!cast<Instruction>("PseudoVRGATHER_VX_"# fvti.LMul.MX#"_MASK") + fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$rs1, + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(fvti.Vector + (riscv_vrgather_vx_vl fvti.RegClass:$rs2, + uimm5:$imm, + fvti.RegClass:$passthru, + (fvti.Mask VMV0:$vm), + VLOpFrag)), + (!cast<Instruction>("PseudoVRGATHER_VI_"# fvti.LMul.MX#"_MASK") + fvti.RegClass:$passthru, fvti.RegClass:$rs2, uimm5:$imm, + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; + } } let Predicates = [HasStdExtZvfbfwma] in { @@ -97,3 +572,224 @@ let Predicates = [HasStdExtZvfbfwma] in { defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16", AllWidenableBF16ToFloatVectors>; } + +multiclass VPatConversionVI_VF_BF16<string intrinsic, string instruction> { + foreach fvti = AllBF16Vectors in { + defvar ivti = GetIntVTypeInfo<fvti>.Vti; + let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, + GetVTypePredicates<ivti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "V", + ivti.Vector, fvti.Vector, ivti.Mask, fvti.Log2SEW, + fvti.LMul, ivti.RegClass, fvti.RegClass>; + } +} + +multiclass VPatConversionWF_VI_BF16<string intrinsic, string instruction, + bit isSEWAware = 0> { + foreach vtiToWti = AllWidenableIntToBF16Vectors in { + defvar vti = vtiToWti.Vti; + defvar fwti = vtiToWti.Wti; + let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates, + GetVTypePredicates<fwti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "V", + fwti.Vector, vti.Vector, fwti.Mask, vti.Log2SEW, + vti.LMul, fwti.RegClass, vti.RegClass, isSEWAware>; + } +} + +multiclass VPatConversionWF_VF_BF16<string intrinsic, string instruction, + bit isSEWAware = 0> { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates, + GetVTypeMinimalPredicates<fwti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "V", + fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW, + fvti.LMul, 
fwti.RegClass, fvti.RegClass, isSEWAware>; + } +} + +multiclass VPatConversionVI_WF_BF16<string intrinsic, string instruction> { + foreach vtiToWti = AllWidenableIntToBF16Vectors in { + defvar vti = vtiToWti.Vti; + defvar fwti = vtiToWti.Wti; + let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates, + GetVTypePredicates<fwti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "W", + vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW, + vti.LMul, vti.RegClass, fwti.RegClass>; + } +} + +multiclass VPatConversionVI_WF_RM_BF16<string intrinsic, string instruction> { + foreach vtiToWti = AllWidenableIntToBF16Vectors in { + defvar vti = vtiToWti.Vti; + defvar fwti = vtiToWti.Wti; + let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates, + GetVTypePredicates<fwti>.Predicates) in + defm : VPatConversionRoundingMode<intrinsic, instruction, "W", + vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW, + vti.LMul, vti.RegClass, fwti.RegClass>; + } +} + +multiclass VPatConversionVF_WF_BF16<string intrinsic, string instruction, + bit isSEWAware = 0> { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar fwti = fvtiToFWti.Wti; + let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, + GetVTypePredicates<fwti>.Predicates) in + defm : VPatConversion<intrinsic, instruction, "W", + fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW, + fvti.LMul, fvti.RegClass, fwti.RegClass, isSEWAware>; + } +} + +let Predicates = [HasStdExtZvfbfa] in { +defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfadd", "PseudoVFADD_ALT", + AllBF16Vectors, isSEWAware = 1>; +defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfsub", "PseudoVFSUB_ALT", + AllBF16Vectors, isSEWAware = 1>; +defm : VPatBinaryV_VX_RM<"int_riscv_vfrsub", "PseudoVFRSUB_ALT", + AllBF16Vectors, isSEWAware = 1>; +defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwadd", "PseudoVFWADD_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwsub", "PseudoVFWSUB_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatBinaryW_WV_WX_RM<"int_riscv_vfwadd_w", "PseudoVFWADD_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatBinaryW_WV_WX_RM<"int_riscv_vfwsub_w", "PseudoVFWSUB_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX_RM<"int_riscv_vfmul", "PseudoVFMUL_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryW_VV_VX_RM<"int_riscv_vfwmul", "PseudoVFWMUL_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmacc", "PseudoVFMACC_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmacc", "PseudoVFNMACC_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmsac", "PseudoVFMSAC_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmsac", "PseudoVFNMSAC_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmadd", "PseudoVFMADD_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmadd", "PseudoVFNMADD_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfmsub", "PseudoVFMSUB_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryV_VV_VX_AAXA_RM<"int_riscv_vfnmsub", "PseudoVFNMSUB_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmacc", "PseudoVFWMACC_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : 
VPatTernaryW_VV_VX_RM<"int_riscv_vfwnmacc", "PseudoVFWNMACC_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmsac", "PseudoVFWMSAC_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwnmsac", "PseudoVFWNMSAC_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatUnaryV_V<"int_riscv_vfrsqrt7", "PseudoVFRSQRT7_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatUnaryV_V_RM<"int_riscv_vfrec7", "PseudoVFREC7_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfmin", "PseudoVFMIN_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfmax", "PseudoVFMAX_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnj", "PseudoVFSGNJ_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnjn", "PseudoVFSGNJN_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryV_VV_VX<"int_riscv_vfsgnjx", "PseudoVFSGNJX_ALT", + AllBF16Vectors, isSEWAware=1>; +defm : VPatBinaryM_VV_VX<"int_riscv_vmfeq", "PseudoVMFEQ_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VV_VX<"int_riscv_vmfle", "PseudoVMFLE_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VV_VX<"int_riscv_vmflt", "PseudoVMFLT_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VV_VX<"int_riscv_vmfne", "PseudoVMFNE_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VX<"int_riscv_vmfgt", "PseudoVMFGT_ALT", AllBF16Vectors>; +defm : VPatBinaryM_VX<"int_riscv_vmfge", "PseudoVMFGE_ALT", AllBF16Vectors>; +defm : VPatBinarySwappedM_VV<"int_riscv_vmfgt", "PseudoVMFLT_ALT", AllBF16Vectors>; +defm : VPatBinarySwappedM_VV<"int_riscv_vmfge", "PseudoVMFLE_ALT", AllBF16Vectors>; +defm : VPatConversionVI_VF_BF16<"int_riscv_vfclass", "PseudoVFCLASS_ALT">; +foreach vti = AllBF16Vectors in { + let Predicates = GetVTypePredicates<vti>.Predicates in + defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE_ALT", + "V"#vti.ScalarSuffix#"M", + vti.Vector, + vti.Vector, vti.Scalar, vti.Mask, + vti.Log2SEW, vti.LMul, vti.RegClass, + vti.RegClass, vti.ScalarRegClass>; +} +defm : VPatConversionWF_VI_BF16<"int_riscv_vfwcvt_f_xu_v", "PseudoVFWCVT_F_XU_ALT", + isSEWAware=1>; +defm : VPatConversionWF_VI_BF16<"int_riscv_vfwcvt_f_x_v", "PseudoVFWCVT_F_X_ALT", + isSEWAware=1>; +defm : VPatConversionWF_VF_BF16<"int_riscv_vfwcvt_f_f_v", "PseudoVFWCVT_F_F_ALT", + isSEWAware=1>; +defm : VPatConversionVI_WF_RM_BF16<"int_riscv_vfncvt_xu_f_w", "PseudoVFNCVT_XU_F_ALT">; +defm : VPatConversionVI_WF_RM_BF16<"int_riscv_vfncvt_x_f_w", "PseudoVFNCVT_X_F_ALT">; +defm : VPatConversionVI_WF_BF16<"int_riscv_vfncvt_rtz_xu_f_w", "PseudoVFNCVT_RTZ_XU_F_ALT">; +defm : VPatConversionVI_WF_BF16<"int_riscv_vfncvt_rtz_x_f_w", "PseudoVFNCVT_RTZ_X_F_ALT">; +defm : VPatConversionVF_WF_RM<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F_ALT", + AllWidenableBF16ToFloatVectors, isSEWAware=1>; +defm : VPatConversionVF_WF_BF16<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F_ALT", + isSEWAware=1>; +defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP_ALT", AllBF16Vectors>; +defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN_ALT", AllBF16Vectors>; + +foreach fvti = AllBF16Vectors in { + defvar ivti = GetIntVTypeInfo<fvti>.Vti; + let Predicates = GetVTypePredicates<ivti>.Predicates in { + // 13.16. Vector Floating-Point Move Instruction + // If we're splatting fpimm0, use vmv.v.x vd, x0. 
+ def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)), + (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX) + $passthru, 0, GPR:$vl, fvti.Log2SEW, TU_MU)>; + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), VLOpFrag)), + (!cast<Instruction>("PseudoVMV_V_X_"#fvti.LMul.MX) + $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>; + } + + let Predicates = GetVTypePredicates<fvti>.Predicates in { + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), + (!cast<Instruction>("PseudoVFMV_V_ALT_" # fvti.ScalarSuffix # "_" # + fvti.LMul.MX) + $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), + GPR:$vl, fvti.Log2SEW, TU_MU)>; + } +} + +foreach vti = NoGroupBF16Vectors in { + let Predicates = GetVTypePredicates<vti>.Predicates in { + def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), + (vti.Scalar (fpimm0)), + VLOpFrag)), + (PseudoVMV_S_X $passthru, (XLenVT X0), GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), + (vti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), + VLOpFrag)), + (PseudoVMV_S_X $passthru, GPR:$imm, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), + vti.ScalarRegClass:$rs1, + VLOpFrag)), + (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix#"_ALT") + vti.RegClass:$passthru, + (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>; + } + + defvar vfmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_", + vti.ScalarSuffix, + "_S_ALT")); + // Only pattern-match extract-element operations where the index is 0. Any + // other index will have been custom-lowered to slide the vector correctly + // into place. + let Predicates = GetVTypePredicates<vti>.Predicates in + def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)), + (vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>; +} +} // Predicates = [HasStdExtZvfbfa] diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index f863392a..637d61fe 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -270,7 +270,7 @@ class SiFive7AnyToGPRBypass<SchedRead read, int cycles = 2> // and floating point computation. // The V pipeline is modeled by the VCQ, VA, VL, and VS resources. There can // be one or two VA (Vector Arithmetic). -multiclass SiFive7ProcResources<bit extraVALU = false> { +multiclass SiFive7ProcResources<bit dualVALU = false> { let BufferSize = 0 in { def PipeA : ProcResource<1>; def PipeB : ProcResource<1>; @@ -279,7 +279,7 @@ multiclass SiFive7ProcResources<bit extraVALU = false> { def FDiv : ProcResource<1>; // FP Division/Sqrt // Arithmetic sequencer(s) - if extraVALU then { + if dualVALU then { // VA1 can handle any vector airthmetic instruction. def VA1 : ProcResource<1>; // VA2 generally can only handle simple vector arithmetic. @@ -305,7 +305,7 @@ multiclass SiFive7ProcResources<bit extraVALU = false> { def PipeAB : ProcResGroup<[!cast<ProcResource>(NAME#"PipeA"), !cast<ProcResource>(NAME#"PipeB")]>; - if extraVALU then + if dualVALU then def VA1OrVA2 : ProcResGroup<[!cast<ProcResource>(NAME#"VA1"), !cast<ProcResource>(NAME#"VA2")]>; } @@ -1550,10 +1550,10 @@ multiclass SiFive7ReadAdvance { /// This multiclass is a "bundle" of (1) processor resources (i.e. 
pipes) and /// (2) WriteRes entries. It's parameterized by config values that will /// eventually be supplied by different SchedMachineModels. -multiclass SiFive7SchedResources<int vlen, bit extraVALU, +multiclass SiFive7SchedResources<int vlen, bit dualVALU, SiFive7FPLatencies fpLatencies, bit hasFastGather> { - defm SiFive7 : SiFive7ProcResources<extraVALU>; + defm SiFive7 : SiFive7ProcResources<dualVALU>; // Pull out defs from SiFive7ProcResources so we can refer to them by name. defvar SiFive7PipeA = !cast<ProcResource>(NAME # SiFive7PipeA); @@ -1562,10 +1562,10 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU, defvar SiFive7IDiv = !cast<ProcResource>(NAME # SiFive7IDiv); defvar SiFive7FDiv = !cast<ProcResource>(NAME # SiFive7FDiv); // Pass SiFive7VA for VA1 and VA1OrVA2 if there is only 1 VALU. - defvar SiFive7VA1 = !if (extraVALU, + defvar SiFive7VA1 = !if (dualVALU, !cast<ProcResource>(NAME # SiFive7VA1), !cast<ProcResource>(NAME # SiFive7VA)); - defvar SiFive7VA1OrVA2 = !if (extraVALU, + defvar SiFive7VA1OrVA2 = !if (dualVALU, !cast<ProcResGroup>(NAME # SiFive7VA1OrVA2), !cast<ProcResource>(NAME # SiFive7VA)); defvar SiFive7VA = !cast<ProcResource>(NAME # SiFive7VA); @@ -1608,7 +1608,7 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel { HasStdExtZknh, HasStdExtZksed, HasStdExtZksh, HasStdExtZkr]; int VLEN = vlen; - bit HasExtraVALU = false; + bit HasDualVALU = false; SiFive7FPLatencies FPLatencies; bit HasFastGather = false; @@ -1635,7 +1635,7 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> { } def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> { - let HasExtraVALU = true; + let HasDualVALU = true; let FPLatencies = SiFive7LowFPLatencies; let HasFastGather = true; } @@ -1643,7 +1643,7 @@ def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> { /// Binding models to their scheduling resources. foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in { let SchedModel = model in - defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU, + defm model.Name : SiFive7SchedResources<model.VLEN, model.HasDualVALU, model.FPLatencies, model.HasFastGather>; } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 6acf799..334db4b 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -288,9 +288,12 @@ public: bool hasVInstructionsI64() const { return HasStdExtZve64x; } bool hasVInstructionsF16Minimal() const { return HasStdExtZvfhmin; } bool hasVInstructionsF16() const { return HasStdExtZvfh; } - bool hasVInstructionsBF16Minimal() const { return HasStdExtZvfbfmin; } + bool hasVInstructionsBF16Minimal() const { + return HasStdExtZvfbfmin || HasStdExtZvfbfa; + } bool hasVInstructionsF32() const { return HasStdExtZve32f; } bool hasVInstructionsF64() const { return HasStdExtZve64d; } + bool hasVInstructionsBF16() const { return HasStdExtZvfbfa; } // F16 and F64 both require F32. 
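The RISCVSubtarget.h hunk above widens the minimal-BF16 predicate so that Zvfbfa alone now satisfies it, treating Zvfbfa as subsuming Zvfbfmin's conversion support. A self-contained sketch of the resulting query structure (an illustrative stand-in, not the real RISCVSubtarget class):

#include <cassert>

// Illustrative mirror of the predicate wiring in the hunk above: either
// extension satisfies the "minimal" query, while full BF16 vector
// arithmetic requires Zvfbfa itself.
struct SubtargetSketch {
  bool HasStdExtZvfbfmin = false;
  bool HasStdExtZvfbfa = false;
  bool hasVInstructionsBF16Minimal() const {
    return HasStdExtZvfbfmin || HasStdExtZvfbfa;
  }
  bool hasVInstructionsBF16() const { return HasStdExtZvfbfa; }
};

int main() {
  SubtargetSketch OnlyAlt;
  OnlyAlt.HasStdExtZvfbfa = true;
  assert(OnlyAlt.hasVInstructionsBF16Minimal() && OnlyAlt.hasVInstructionsBF16());

  SubtargetSketch OnlyMin;
  OnlyMin.HasStdExtZvfbfmin = true;
  assert(OnlyMin.hasVInstructionsBF16Minimal() && !OnlyMin.hasVInstructionsBF16());
  return 0;
}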
bool hasVInstructionsAnyF() const { return hasVInstructionsF32(); } bool hasVInstructionsFullMultiply() const { return HasStdExtV; } diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 56a6168..640b014 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -78,6 +78,8 @@ public: void outputExecutionModeFromNumthreadsAttribute( const MCRegister &Reg, const Attribute &Attr, SPIRV::ExecutionMode::ExecutionMode EM); + void outputExecutionModeFromEnableMaximalReconvergenceAttr( + const MCRegister &Reg, const SPIRVSubtarget &ST); void outputExecutionMode(const Module &M); void outputAnnotations(const Module &M); void outputModuleSections(); @@ -495,6 +497,20 @@ void SPIRVAsmPrinter::outputExecutionModeFromNumthreadsAttribute( outputMCInst(Inst); } +void SPIRVAsmPrinter::outputExecutionModeFromEnableMaximalReconvergenceAttr( + const MCRegister &Reg, const SPIRVSubtarget &ST) { + assert(ST.canUseExtension(SPIRV::Extension::SPV_KHR_maximal_reconvergence) && + "Function called when SPV_KHR_maximal_reconvergence is not enabled."); + + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionMode); + Inst.addOperand(MCOperand::createReg(Reg)); + unsigned EM = + static_cast<unsigned>(SPIRV::ExecutionMode::MaximallyReconvergesKHR); + Inst.addOperand(MCOperand::createImm(EM)); + outputMCInst(Inst); +} + void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode"); if (Node) { @@ -551,6 +567,10 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (Attribute Attr = F.getFnAttribute("hlsl.numthreads"); Attr.isValid()) outputExecutionModeFromNumthreadsAttribute( FReg, Attr, SPIRV::ExecutionMode::LocalSize); + if (Attribute Attr = F.getFnAttribute("enable-maximal-reconvergence"); + Attr.getValueAsBool()) { + outputExecutionModeFromEnableMaximalReconvergenceAttr(FReg, *ST); + } if (MDNode *Node = F.getMetadata("work_group_size_hint")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::LocalSizeHint, 3, 1); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 5f3ed86..96f5dee 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -153,7 +153,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension:: SPV_EXT_relaxed_printf_string_address_space}, {"SPV_INTEL_predicated_io", - SPIRV::Extension::Extension::SPV_INTEL_predicated_io}}; + SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, + {"SPV_KHR_maximal_reconvergence", + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index c6c6182..a151fd2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -1392,19 +1392,19 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) { Constant *AggrConst = nullptr; Type *ResTy = nullptr; if (auto *COp = dyn_cast<ConstantVector>(Op)) { - AggrConst = cast<Constant>(COp); + AggrConst = COp; ResTy = COp->getType(); } else if (auto *COp = dyn_cast<ConstantArray>(Op)) { - AggrConst = cast<Constant>(COp); + AggrConst = COp; ResTy = B.getInt32Ty(); } else if (auto *COp = dyn_cast<ConstantStruct>(Op)) { - AggrConst = 
cast<Constant>(COp); + AggrConst = COp; ResTy = B.getInt32Ty(); } else if (auto *COp = dyn_cast<ConstantDataArray>(Op)) { - AggrConst = cast<Constant>(COp); + AggrConst = COp; ResTy = B.getInt32Ty(); } else if (auto *COp = dyn_cast<ConstantAggregateZero>(Op)) { - AggrConst = cast<Constant>(COp); + AggrConst = COp; ResTy = Op->getType()->isVectorTy() ? COp->getType() : B.getInt32Ty(); } if (AggrConst) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index a466ab2..a0cff4d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3765,7 +3765,6 @@ void SPIRVInstructionSelector::decorateUsesAsNonUniform( SPIRV::Decoration::NonUniformEXT, {}); } } - return; } bool SPIRVInstructionSelector::extractSubvector( diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 5144fb1..61a0bbe 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1200,6 +1200,23 @@ void addOpAccessChainReqs(const MachineInstr &Instr, return; } + bool IsNonUniform = + hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI); + + auto FirstIndexReg = Instr.getOperand(3).getReg(); + bool FirstIndexIsConstant = + Subtarget.getInstrInfo()->isConstantInstr(*MRI.getVRegDef(FirstIndexReg)); + + if (StorageClass == SPIRV::StorageClass::StorageClass::StorageBuffer) { + if (IsNonUniform) + Handler.addRequirements( + SPIRV::Capability::StorageBufferArrayNonUniformIndexingEXT); + else if (!FirstIndexIsConstant) + Handler.addRequirements( + SPIRV::Capability::StorageBufferArrayDynamicIndexing); + return; + } + Register PointeeTypeReg = ResTypeInst->getOperand(2).getReg(); MachineInstr *PointeeType = MRI.getUniqueVRegDef(PointeeTypeReg); if (PointeeType->getOpcode() != SPIRV::OpTypeImage && @@ -1208,27 +1225,25 @@ void addOpAccessChainReqs(const MachineInstr &Instr, return; } - bool IsNonUniform = - hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI); if (isUniformTexelBuffer(PointeeType)) { if (IsNonUniform) Handler.addRequirements( SPIRV::Capability::UniformTexelBufferArrayNonUniformIndexingEXT); - else + else if (!FirstIndexIsConstant) Handler.addRequirements( SPIRV::Capability::UniformTexelBufferArrayDynamicIndexingEXT); } else if (isInputAttachment(PointeeType)) { if (IsNonUniform) Handler.addRequirements( SPIRV::Capability::InputAttachmentArrayNonUniformIndexingEXT); - else + else if (!FirstIndexIsConstant) Handler.addRequirements( SPIRV::Capability::InputAttachmentArrayDynamicIndexingEXT); } else if (isStorageTexelBuffer(PointeeType)) { if (IsNonUniform) Handler.addRequirements( SPIRV::Capability::StorageTexelBufferArrayNonUniformIndexingEXT); - else + else if (!FirstIndexIsConstant) Handler.addRequirements( SPIRV::Capability::StorageTexelBufferArrayDynamicIndexingEXT); } else if (isSampledImage(PointeeType) || @@ -1237,14 +1252,14 @@ void addOpAccessChainReqs(const MachineInstr &Instr, if (IsNonUniform) Handler.addRequirements( SPIRV::Capability::SampledImageArrayNonUniformIndexingEXT); - else + else if (!FirstIndexIsConstant) Handler.addRequirements( SPIRV::Capability::SampledImageArrayDynamicIndexing); } else if (isStorageImage(PointeeType)) { if (IsNonUniform) Handler.addRequirements( SPIRV::Capability::StorageImageArrayNonUniformIndexingEXT); - else + else if (!FirstIndexIsConstant) Handler.addRequirements( SPIRV::Capability::StorageImageArrayDynamicIndexing); } @@ 
-2155,6 +2170,9 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::LocalSize, ST); } + if (F.getFnAttribute("enable-maximal-reconvergence").getValueAsBool()) { + MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_maximal_reconvergence); + } if (F.getMetadata("work_group_size_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 2625642..7d08b29 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -386,6 +386,7 @@ defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; +defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -698,7 +699,7 @@ defm IntersectionNV: ExecutionModelOperand<5314, [RayTracingNV]>; defm AnyHitNV: ExecutionModelOperand<5315, [RayTracingNV]>; defm ClosestHitNV: ExecutionModelOperand<5316, [RayTracingNV]>; defm MissNV: ExecutionModelOperand<5317, [RayTracingNV]>; -defm CallableNV: ExecutionModelOperand<5318, [RayTracingNV]>; +defm CallableNV : ExecutionModelOperand<5318, [RayTracingNV]>; //===----------------------------------------------------------------------===// // Multiclass used to define MemoryModel enum values and at the same time @@ -805,6 +806,7 @@ defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; +defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; //===----------------------------------------------------------------------===// // Multiclass used to define StorageClass enum values and at the same time diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index 2934c88..fa08d44 100644 --- a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -246,8 +246,7 @@ SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, } } -static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI) -{ +[[maybe_unused]] static bool verifyLeafProcRegUse(MachineRegisterInfo *MRI) { for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) if (MRI->isPhysRegUsed(reg)) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp index d9c8e22..6e99fc3 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp @@ -23,7 +23,7 @@ std::optional<wasm::ValType> WebAssembly::parseType(StringRef Type) { .Case("i64", wasm::ValType::I64) .Case("f32", wasm::ValType::F32) .Case("f64", wasm::ValType::F64) - .Cases("v128", "i8x16", "i16x8", "i32x4", "i64x2", "f32x4", "f64x2", + 
.Cases({"v128", "i8x16", "i16x8", "i32x4", "i64x2", "f32x4", "f64x2"}, wasm::ValType::V128) .Case("funcref", wasm::ValType::FUNCREF) .Case("externref", wasm::ValType::EXTERNREF) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 0f6e1ca..ed54404d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1763,6 +1763,26 @@ defm RELAXED_DOT : "i16x8.relaxed_dot_i8x16_i7x16_s\t$dst, $lhs, $rhs", "i16x8.relaxed_dot_i8x16_i7x16_s", 0x112>; +def : Pat< + (v8i16 (add + (wasm_shuffle + (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)), + (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)), + (i32 0), (i32 1), (i32 4), (i32 5), + (i32 8), (i32 9), (i32 12), (i32 13), + (i32 16), (i32 17), (i32 20), (i32 21), + (i32 24), (i32 25), (i32 28), (i32 29)), + (wasm_shuffle + (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)), + (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)), + (i32 2), (i32 3), (i32 6), (i32 7), + (i32 10), (i32 11), (i32 14), (i32 15), + (i32 18), (i32 19), (i32 22), (i32 23), + (i32 26), (i32 27), (i32 30), (i32 31))) + ), + (v8i16 (RELAXED_DOT v16i8:$lhs, v16i8:$rhs)) +>; + defm RELAXED_DOT_ADD : RELAXED_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, V128:$acc), (outs), (ins), @@ -1771,6 +1791,18 @@ defm RELAXED_DOT_ADD : "i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc", "i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>; +def : Pat< + (v4i32 (add + (v4i32 (int_wasm_extadd_pairwise_signed + (v8i16 (int_wasm_relaxed_dot_i8x16_i7x16_signed v16i8:$lhs, v16i8:$rhs)))), + (v4i32 V128:$acc))), + (v4i32 (RELAXED_DOT_ADD v16i8:$lhs, v16i8:$rhs, (v4i32 V128:$acc))) + >; + +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs), + (v16i8 V128:$rhs))), + (RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>; + //===----------------------------------------------------------------------===// // Relaxed BFloat16 dot product //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index a8908d4..ac251fd 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3514,15 +3514,16 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, // xacquire <insn> ; xacquire must be accompanied by 'lock' bool IsPrefix = StringSwitch<bool>(Name) - .Cases("cs", "ds", "es", "fs", "gs", "ss", true) - .Cases("rex64", "data32", "data16", "addr32", "addr16", true) - .Cases("xacquire", "xrelease", true) - .Cases("acquire", "release", isParsingIntelSyntax()) + .Cases({"cs", "ds", "es", "fs", "gs", "ss"}, true) + .Cases({"rex64", "data32", "data16", "addr32", "addr16"}, true) + .Cases({"xacquire", "xrelease"}, true) + .Cases({"acquire", "release"}, isParsingIntelSyntax()) .Default(false); auto isLockRepeatNtPrefix = [](StringRef N) { return StringSwitch<bool>(N) - .Cases("lock", "rep", "repe", "repz", "repne", "repnz", "notrack", true) + .Cases({"lock", "rep", "repe", "repz", "repne", "repnz", "notrack"}, + true) .Default(false); }; diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index b81641f..28fa2cd 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -414,8 +414,6 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget 
&STI, getActionDefinitionsBuilder(G_SEXT_INREG).lower(); - getActionDefinitionsBuilder(G_IS_FPCLASS).lower(); - // fp constants getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({s32, s64}) diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 6db780f..8e08d16 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1338,6 +1338,10 @@ def ProcessorFeatures { list<SubtargetFeature> PTLFeatures = !listremove(ARLSFeatures, [FeatureWIDEKL]); + // Novalake + list<SubtargetFeature> NVLFeatures = + !listconcat(PTLFeatures, [FeaturePREFETCHI]); + // Clearwaterforest list<SubtargetFeature> CWFAdditionalFeatures = [FeaturePREFETCHI, FeatureAVXVNNIINT16, @@ -1883,6 +1887,9 @@ foreach P = ["pantherlake", "wildcatlake"] in { def : ProcModel<P, AlderlakePModel, ProcessorFeatures.PTLFeatures, ProcessorFeatures.ADLTuning>; } +def : ProcModel<"novalake", AlderlakePModel, ProcessorFeatures.NVLFeatures, + ProcessorFeatures.ADLTuning>; + def : ProcModel<"clearwaterforest", AlderlakePModel, ProcessorFeatures.CWFFeatures, ProcessorFeatures.ADLTuning>; def : ProcModel<"emeraldrapids", SapphireRapidsModel, diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index e0991aa..9f88fda 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -602,8 +602,7 @@ namespace { friend bool operator<(const TableEntry &TE, unsigned V) { return TE.from < V; } - friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V, - const TableEntry &TE) { + [[maybe_unused]] friend bool operator<(unsigned V, const TableEntry &TE) { return V < TE.from; } }; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c32b1a6..b05d7c7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29755,65 +29755,30 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low = nullptr) { - unsigned NumElts = VT.getVectorNumElements(); - // For vXi8 we will unpack the low and high half of each 128 bit lane to widen // to a vXi16 type. Do the multiplies, shift the results and pack the half // lane results back together. // We'll take different approaches for signed and unsigned. - // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes - // and use pmullw to calculate the full 16-bit product. + // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to + // words and use pmullw to calculate the full 16-bit product. // For signed we'll use punpcklbw/punpckbw to extend the bytes to words and // shift them left into the upper byte of each word. This allows us to use // pmulhw to calculate the full 16-bit product. This trick means we don't // need to sign extend the bytes to use pmullw. - - MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); SDValue Zero = DAG.getConstant(0, dl, VT); - SDValue ALo, AHi; + SDValue ALo, AHi, BLo, BHi; if (IsSigned) { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A)); - AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A)); - } else { - ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero)); - AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero)); - } - - SDValue BLo, BHi; - if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { - // If the RHS is a constant, manually unpackl/unpackh and extend. 
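The signed-multiply trick described above rests on a pmulhw identity: placing each byte in the upper half of a 16-bit lane makes the high half of the 16x16 product equal to the exact i8 x i8 product, so no sign extension is needed. A scalar C++ check of the identity over all byte pairs (illustration only, no SSE intrinsics):

#include <cassert>
#include <cstdint>

// pmulhw keeps the high 16 bits of the signed 32-bit product of two words.
static int16_t pmulhw(int16_t A, int16_t B) {
  return static_cast<int16_t>((int32_t(A) * int32_t(B)) >> 16);
}

int main() {
  for (int A = -128; A <= 127; ++A)
    for (int B = -128; B <= 127; ++B) {
      // The zero-interleave unpack places each byte in the upper half of
      // a word, i.e. it computes A*256 and B*256.
      int16_t HiA = static_cast<int16_t>(A * 256);
      int16_t HiB = static_cast<int16_t>(B * 256);
      // (A*2^8)*(B*2^8) == (A*B)*2^16, so the high half is exactly A*B,
      // and A*B always fits in 16 bits for byte inputs.
      assert(pmulhw(HiA, HiB) == A * B);
    }
  return 0;
}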
- SmallVector<SDValue, 16> LoOps, HiOps; - for (unsigned i = 0; i != NumElts; i += 16) { - for (unsigned j = 0; j != 8; ++j) { - SDValue LoOp = B.getOperand(i + j); - SDValue HiOp = B.getOperand(i + j + 8); - - if (IsSigned) { - LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16); - HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16); - LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp, - DAG.getConstant(8, dl, MVT::i16)); - HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp, - DAG.getConstant(8, dl, MVT::i16)); - } else { - LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16); - HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16); - } - - LoOps.push_back(LoOp); - HiOps.push_back(HiOp); - } - } - - BLo = DAG.getBuildVector(ExVT, dl, LoOps); - BHi = DAG.getBuildVector(ExVT, dl, HiOps); - } else if (IsSigned) { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B)); + AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B)); } else { + ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero)); BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero)); + AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero)); } @@ -29826,7 +29791,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, if (Low) *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi); - return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true); + return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true); } static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, @@ -44848,10 +44813,16 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } case X86ISD::PCMPGT: // icmp sgt(0, R) == ashr(R, BitWidth-1). - // iff we only need the sign bit then we can use R directly. - if (OriginalDemandedBits.isSignMask() && - ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) - return TLO.CombineTo(Op, Op.getOperand(1)); + if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) { + // iff we only need the signbit then we can use R directly. + if (OriginalDemandedBits.isSignMask()) + return TLO.CombineTo(Op, Op.getOperand(1)); + // otherwise we just need R's signbit for the comparison. + APInt SignMask = APInt::getSignMask(BitWidth); + if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts, + Known, TLO, Depth + 1)) + return true; + } break; case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); @@ -47761,6 +47732,15 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, DL, DAG, Subtarget)) return V; + // If the sign bit is known then BLENDV can be folded away. 
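BLENDV selects each lane purely on the sign bit of the condition, which is why a condition whose sign bit is known lets the combine below collapse the node to one of its inputs. A one-lane scalar model (hypothetical helper; the real node operates per vector lane):

#include <cassert>
#include <cstdint>

// One lane of (V)BLENDV: the condition's sign bit selects the source.
static int32_t blendvLane(int32_t Cond, int32_t LHS, int32_t RHS) {
  return Cond < 0 ? LHS : RHS;
}

int main() {
  // computeKnownBits proving Cond negative in every lane folds the blend
  // to LHS; proving it non-negative folds it to RHS.
  assert(blendvLane(-1, 10, 20) == 10);        // sign bit known set
  assert(blendvLane(INT32_MAX, 10, 20) == 20); // sign bit known clear
  return 0;
}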
+ if (N->getOpcode() == X86ISD::BLENDV) { + KnownBits KnownCond = DAG.computeKnownBits(Cond); + if (KnownCond.isNegative()) + return LHS; + if (KnownCond.isNonNegative()) + return RHS; + } + if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) { SmallVector<int, 64> CondMask; if (createShuffleMaskFromVSELECT(CondMask, Cond, @@ -58342,11 +58322,12 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) { } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) { SDValue Src = Op1; SDValue Op10 = Op1.getOperand(0); - if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) { - // res, flags2 = sub 0, (and (xor X, -1), Y) + if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1)) && + llvm::isOneConstant(Op1.getOperand(1))) { + // res, flags2 = sub 0, (and (xor X, -1), 1) // cload/cstore ..., cond_ne, flag2 // -> - // res, flags2 = sub 0, (and X, Y) + // res, flags2 = sub 0, (and X, 1) // cload/cstore ..., cond_e, flag2 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0), Op1.getOperand(1)); diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 6dd43b2..37d7772 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -606,16 +606,24 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { void X86TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. - if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || - Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { + RTLIB::LibcallImpl SecurityCheckCookieLibcall = + getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); + + RTLIB::LibcallImpl SecurityCookieVar = + getLibcallImpl(RTLIB::STACK_CHECK_GUARD); + if (SecurityCheckCookieLibcall != RTLIB::Unsupported && + SecurityCookieVar != RTLIB::Unsupported) { + // MSVC CRT provides functionalities for stack protection. // MSVC CRT has a global variable holding security cookie. - M.getOrInsertGlobal("__security_cookie", + M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar), PointerType::getUnqual(M.getContext())); // MSVC CRT has a function to validate security cookie. - FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( - "__security_check_cookie", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(M.getContext())); + FunctionCallee SecurityCheckCookie = + M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall), + Type::getVoidTy(M.getContext()), + PointerType::getUnqual(M.getContext())); + if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { F->setCallingConv(CallingConv::X86_FastCall); F->addParamAttr(0, Attribute::AttrKind::InReg); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 0fd44b7..ec31675 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1256,8 +1256,17 @@ def : Pat<(i64 (X86Wrapper tconstpool :$dst)), (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>; def : Pat<(i64 (X86Wrapper tjumptable :$dst)), (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>; -def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), - (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; + +// If the globaladdr is an absolute_symbol, don't bother using the sign extending +// instruction since there's no benefit to using it with absolute symbols. 
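The PatLeaf below exists because MOV64ri32 materializes its 64-bit result by sign-extending a 32-bit immediate, which is only profitable for addresses guaranteed to round-trip through that extension (as the kernel code model guarantees); an absolute_symbol range provides no such guarantee. A small check of what "representable as a sign-extended imm32" means (two's-complement narrowing assumed):

#include <cassert>
#include <cstdint>

// An address is MOV64ri32-compatible only if truncating it to 32 bits
// and sign-extending back reproduces it.
static bool fitsSExtImm32(uint64_t Addr) {
  return static_cast<uint64_t>(
             static_cast<int64_t>(static_cast<int32_t>(Addr))) == Addr;
}

int main() {
  assert(fitsSExtImm32(0xFFFFFFFF80000000ULL));  // top 2 GiB (kernel code model)
  assert(fitsSExtImm32(0x000000007FFFFFFFULL));  // low 2 GiB
  assert(!fitsSExtImm32(0x0000000080000000ULL)); // sign-extends to the top range instead
  return 0;
}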
+def globalAddrNoAbsSym : PatLeaf<(tglobaladdr:$dst), [{ + auto *GA = cast<GlobalAddressSDNode>(N); + return !GA->getGlobal()->getAbsoluteSymbolRange(); +}]>; +def : Pat<(i64 (X86Wrapper globalAddrNoAbsSym:$dst)), + (MOV64ri32 tglobaladdr:$dst)>, + Requires<[KernelCode]>; + def : Pat<(i64 (X86Wrapper texternalsym:$dst)), (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; def : Pat<(i64 (X86Wrapper mcsym:$dst)), diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1d2cd39..5c23f91 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -10809,39 +10809,27 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB, if (!ST.hasSSE1()) return; - // PXOR is safe to use because it doesn't affect flags. - BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg) - .addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); + BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg); } else if (X86::VR256RegClass.contains(Reg)) { // YMM# if (!ST.hasAVX()) return; - // VPXOR is safe to use because it doesn't affect flags. - BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg) - .addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); + BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg); } else if (X86::VR512RegClass.contains(Reg)) { // ZMM# if (!ST.hasAVX512()) return; - // VPXORY is safe to use because it doesn't affect flags. - BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg) - .addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); + BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg); } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) || X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg)) { if (!ST.hasVLX()) return; - // KXOR is safe to use because it doesn't affect flags. - unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk; - BuildMI(MBB, Iter, DL, get(Op), Reg) - .addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); + unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W; + BuildMI(MBB, Iter, DL, get(Op), Reg); } } diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 481a9be..713d504 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1928,6 +1928,17 @@ static void addConstantComments(const MachineInstr *MI, #define INSTR_CASE(Prefix, Instr, Suffix, Postfix) \ case X86::Prefix##Instr##Suffix##rm##Postfix: +#define CASE_AVX512_ARITH_RM(Instr) \ + INSTR_CASE(V, Instr, Z128, ) \ + INSTR_CASE(V, Instr, Z128, k) \ + INSTR_CASE(V, Instr, Z128, kz) \ + INSTR_CASE(V, Instr, Z256, ) \ + INSTR_CASE(V, Instr, Z256, k) \ + INSTR_CASE(V, Instr, Z256, kz) \ + INSTR_CASE(V, Instr, Z, ) \ + INSTR_CASE(V, Instr, Z, k) \ + INSTR_CASE(V, Instr, Z, kz) + #define CASE_ARITH_RM(Instr) \ INSTR_CASE(, Instr, , ) /* SSE */ \ INSTR_CASE(V, Instr, , ) /* AVX-128 */ \ @@ -1943,40 +1954,26 @@ static void addConstantComments(const MachineInstr *MI, INSTR_CASE(V, Instr, Z, kz) // TODO: Add additional instructions when useful. 
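The comment-printing hunk below merges what used to be a PMADDUBSW-only block into the shared arithmetic cases, so every matched PMUL*/PMADD* load from the constant pool gets a bracketed vector comment, no longer gated on scalar width. A hedged sketch of just the formatting step, with std::ostringstream standing in for raw_string_ostream and a plain int vector standing in for the Constant:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Formats a constant-pool vector as "[a,b,c,...]", the shape of the asm
// comments emitted for the multiply instructions handled below.
static std::string formatConstantComment(const std::vector<int> &Elts) {
  std::ostringstream OS;
  OS << "[";
  for (size_t I = 0; I != Elts.size(); ++I) {
    if (I)
      OS << ",";
    OS << Elts[I];
  }
  OS << "]";
  return OS.str();
}

int main() {
  std::cout << formatConstantComment({1, 2, 3, 4}) << "\n"; // prints [1,2,3,4]
  return 0;
}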
- CASE_ARITH_RM(PMADDUBSW) { - unsigned SrcIdx = getSrcIdx(MI, 1); - if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - if (C->getType()->getScalarSizeInBits() == 8) { - std::string Comment; - raw_string_ostream CS(Comment); - unsigned VectorWidth = - X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); - CS << "["; - printConstant(C, VectorWidth, CS); - CS << "]"; - OutStreamer.AddComment(CS.str()); - } - } - break; - } - + CASE_ARITH_RM(PMADDUBSW) CASE_ARITH_RM(PMADDWD) + CASE_ARITH_RM(PMULDQ) + CASE_ARITH_RM(PMULUDQ) + CASE_ARITH_RM(PMULLD) + CASE_AVX512_ARITH_RM(PMULLQ) CASE_ARITH_RM(PMULLW) CASE_ARITH_RM(PMULHW) CASE_ARITH_RM(PMULHUW) CASE_ARITH_RM(PMULHRSW) { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - if (C->getType()->getScalarSizeInBits() == 16) { - std::string Comment; - raw_string_ostream CS(Comment); - unsigned VectorWidth = - X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); - CS << "["; - printConstant(C, VectorWidth, CS); - CS << "]"; - OutStreamer.AddComment(CS.str()); - } + std::string Comment; + raw_string_ostream CS(Comment); + unsigned VectorWidth = + X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); + CS << "["; + printConstant(C, VectorWidth, CS); + CS << "]"; + OutStreamer.AddComment(CS.str()); } break; } diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp index 89d5e0d..f6cea85 100644 --- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp +++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp @@ -22,13 +22,13 @@ StringRef ARM::getArchSynonym(StringRef Arch) { .Case("v5e", "v5te") .Case("v6j", "v6") .Case("v6hl", "v6k") - .Cases("v6m", "v6sm", "v6s-m", "v6-m") - .Cases("v6z", "v6zk", "v6kz") - .Cases("v7", "v7a", "v7hl", "v7l", "v7-a") + .Cases({"v6m", "v6sm", "v6s-m"}, "v6-m") + .Cases({"v6z", "v6zk"}, "v6kz") + .Cases({"v7", "v7a", "v7hl", "v7l"}, "v7-a") .Case("v7r", "v7-r") .Case("v7m", "v7-m") .Case("v7em", "v7e-m") - .Cases("v8", "v8a", "v8l", "aarch64", "arm64", "v8-a") + .Cases({"v8", "v8a", "v8l", "aarch64", "arm64"}, "v8-a") .Case("v8.1a", "v8.1-a") .Case("v8.2a", "v8.2-a") .Case("v8.3a", "v8.3-a") @@ -39,7 +39,7 @@ StringRef ARM::getArchSynonym(StringRef Arch) { .Case("v8.8a", "v8.8-a") .Case("v8.9a", "v8.9-a") .Case("v8r", "v8-r") - .Cases("v9", "v9a", "v9-a") + .Cases({"v9", "v9a"}, "v9-a") .Case("v9.1a", "v9.1-a") .Case("v9.2a", "v9.2-a") .Case("v9.3a", "v9.3-a") diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 928e779..6065575 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -70,8 +70,8 @@ using namespace llvm; -static std::unique_ptr<llvm::MemoryBuffer> - LLVM_ATTRIBUTE_UNUSED getProcCpuinfoContent() { +[[maybe_unused]] static std::unique_ptr<llvm::MemoryBuffer> +getProcCpuinfoContent() { const char *CPUInfoFile = "/proc/cpuinfo"; if (const char *CpuinfoIntercept = std::getenv("LLVM_CPUINFO")) CPUInfoFile = CpuinfoIntercept; @@ -1152,6 +1152,20 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family, break; } break; + case 0x12: + switch (Model) { + // Novalake: + case 0x1: + case 0x3: + CPU = "novalake"; + *Type = X86::INTEL_COREI7; + *Subtype = X86::INTEL_COREI7_NOVALAKE; + break; + default: // Unknown family 0x12 CPU. + break; + } + break; + default: break; // Unknown. 
} diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index 9268df2..31126cc 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -887,7 +887,7 @@ void RISCVISAInfo::updateImplication() { } static constexpr StringLiteral CombineIntoExts[] = { - {"b"}, {"zk"}, {"zkn"}, {"zks"}, {"zvkn"}, + {"a"}, {"b"}, {"zk"}, {"zkn"}, {"zks"}, {"zvkn"}, {"zvknc"}, {"zvkng"}, {"zvks"}, {"zvksc"}, {"zvksg"}, }; diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp index 950bb2b..d765d9c 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -548,8 +548,11 @@ std::string Triple::computeDataLayout(StringRef ABIName) const { case Triple::csky: return computeCSKYDataLayout(*this); case Triple::dxil: + // TODO: We need to align vectors on the element size generally, but for now + // we hard code this for 3-element 32- and 64-bit vectors as a workaround. + // See https://github.com/llvm/llvm-project/issues/123968 return "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-" - "f32:32-f64:64-n8:16:32:64"; + "f32:32-f64:64-n8:16:32:64-v48:16:16-v96:32:32-v192:64:64"; case Triple::hexagon: return "e-m:e-p:32:32:32-a:0-n16:32-" "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-" diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index f021094..1068ce4 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -579,87 +579,89 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { } static Triple::ArchType parseArch(StringRef ArchName) { - auto AT = StringSwitch<Triple::ArchType>(ArchName) - .Cases("i386", "i486", "i586", "i686", Triple::x86) - // FIXME: Do we need to support these? 
- .Cases("i786", "i886", "i986", Triple::x86) - .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) - .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) - .Cases("powerpcle", "ppcle", "ppc32le", Triple::ppcle) - .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) - .Cases("powerpc64le", "ppc64le", Triple::ppc64le) - .Case("xscale", Triple::arm) - .Case("xscaleeb", Triple::armeb) - .Case("aarch64", Triple::aarch64) - .Case("aarch64_be", Triple::aarch64_be) - .Case("aarch64_32", Triple::aarch64_32) - .Case("arc", Triple::arc) - .Case("arm64", Triple::aarch64) - .Case("arm64_32", Triple::aarch64_32) - .Case("arm64e", Triple::aarch64) - .Case("arm64ec", Triple::aarch64) - .Case("arm", Triple::arm) - .Case("armeb", Triple::armeb) - .Case("thumb", Triple::thumb) - .Case("thumbeb", Triple::thumbeb) - .Case("avr", Triple::avr) - .Case("m68k", Triple::m68k) - .Case("msp430", Triple::msp430) - .Cases("mips", "mipseb", "mipsallegrex", "mipsisa32r6", - "mipsr6", Triple::mips) - .Cases("mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el", - Triple::mipsel) - .Cases("mips64", "mips64eb", "mipsn32", "mipsisa64r6", - "mips64r6", "mipsn32r6", Triple::mips64) - .Cases("mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el", - "mipsn32r6el", Triple::mips64el) - .Case("r600", Triple::r600) - .Case("amdgcn", Triple::amdgcn) - .Case("riscv32", Triple::riscv32) - .Case("riscv64", Triple::riscv64) - .Case("riscv32be", Triple::riscv32be) - .Case("riscv64be", Triple::riscv64be) - .Case("hexagon", Triple::hexagon) - .Cases("s390x", "systemz", Triple::systemz) - .Case("sparc", Triple::sparc) - .Case("sparcel", Triple::sparcel) - .Cases("sparcv9", "sparc64", Triple::sparcv9) - .Case("tce", Triple::tce) - .Case("tcele", Triple::tcele) - .Case("xcore", Triple::xcore) - .Case("nvptx", Triple::nvptx) - .Case("nvptx64", Triple::nvptx64) - .Case("amdil", Triple::amdil) - .Case("amdil64", Triple::amdil64) - .Case("hsail", Triple::hsail) - .Case("hsail64", Triple::hsail64) - .Case("spir", Triple::spir) - .Case("spir64", Triple::spir64) - .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv) - .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", - "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", - "spirv32v1.6", Triple::spirv32) - .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", - "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", - "spirv64v1.6", Triple::spirv64) - .StartsWith("kalimba", Triple::kalimba) - .Case("lanai", Triple::lanai) - .Case("renderscript32", Triple::renderscript32) - .Case("renderscript64", Triple::renderscript64) - .Case("shave", Triple::shave) - .Case("ve", Triple::ve) - .Case("wasm32", Triple::wasm32) - .Case("wasm64", Triple::wasm64) - .Case("csky", Triple::csky) - .Case("loongarch32", Triple::loongarch32) - .Case("loongarch64", Triple::loongarch64) - .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", - "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", - "dxilv1.8", Triple::dxil) - // Note: Cases has max limit of 10. - .Case("dxilv1.9", Triple::dxil) - .Case("xtensa", Triple::xtensa) - .Default(Triple::UnknownArch); + auto AT = + StringSwitch<Triple::ArchType>(ArchName) + .Cases({"i386", "i486", "i586", "i686"}, Triple::x86) + // FIXME: Do we need to support these? 
+ .Cases({"i786", "i886", "i986"}, Triple::x86) + .Cases({"amd64", "x86_64", "x86_64h"}, Triple::x86_64) + .Cases({"powerpc", "powerpcspe", "ppc", "ppc32"}, Triple::ppc) + .Cases({"powerpcle", "ppcle", "ppc32le"}, Triple::ppcle) + .Cases({"powerpc64", "ppu", "ppc64"}, Triple::ppc64) + .Cases({"powerpc64le", "ppc64le"}, Triple::ppc64le) + .Case("xscale", Triple::arm) + .Case("xscaleeb", Triple::armeb) + .Case("aarch64", Triple::aarch64) + .Case("aarch64_be", Triple::aarch64_be) + .Case("aarch64_32", Triple::aarch64_32) + .Case("arc", Triple::arc) + .Case("arm64", Triple::aarch64) + .Case("arm64_32", Triple::aarch64_32) + .Case("arm64e", Triple::aarch64) + .Case("arm64ec", Triple::aarch64) + .Case("arm", Triple::arm) + .Case("armeb", Triple::armeb) + .Case("thumb", Triple::thumb) + .Case("thumbeb", Triple::thumbeb) + .Case("avr", Triple::avr) + .Case("m68k", Triple::m68k) + .Case("msp430", Triple::msp430) + .Cases({"mips", "mipseb", "mipsallegrex", "mipsisa32r6", "mipsr6"}, + Triple::mips) + .Cases({"mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el"}, + Triple::mipsel) + .Cases({"mips64", "mips64eb", "mipsn32", "mipsisa64r6", "mips64r6", + "mipsn32r6"}, + Triple::mips64) + .Cases({"mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el", + "mipsn32r6el"}, + Triple::mips64el) + .Case("r600", Triple::r600) + .Case("amdgcn", Triple::amdgcn) + .Case("riscv32", Triple::riscv32) + .Case("riscv64", Triple::riscv64) + .Case("riscv32be", Triple::riscv32be) + .Case("riscv64be", Triple::riscv64be) + .Case("hexagon", Triple::hexagon) + .Cases({"s390x", "systemz"}, Triple::systemz) + .Case("sparc", Triple::sparc) + .Case("sparcel", Triple::sparcel) + .Cases({"sparcv9", "sparc64"}, Triple::sparcv9) + .Case("tce", Triple::tce) + .Case("tcele", Triple::tcele) + .Case("xcore", Triple::xcore) + .Case("nvptx", Triple::nvptx) + .Case("nvptx64", Triple::nvptx64) + .Case("amdil", Triple::amdil) + .Case("amdil64", Triple::amdil64) + .Case("hsail", Triple::hsail) + .Case("hsail64", Triple::hsail64) + .Case("spir", Triple::spir) + .Case("spir64", Triple::spir64) + .Cases({"spirv", "spirv1.5", "spirv1.6"}, Triple::spirv) + .Cases({"spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", + "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", "spirv32v1.6"}, + Triple::spirv32) + .Cases({"spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", + "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", "spirv64v1.6"}, + Triple::spirv64) + .StartsWith("kalimba", Triple::kalimba) + .Case("lanai", Triple::lanai) + .Case("renderscript32", Triple::renderscript32) + .Case("renderscript64", Triple::renderscript64) + .Case("shave", Triple::shave) + .Case("ve", Triple::ve) + .Case("wasm32", Triple::wasm32) + .Case("wasm64", Triple::wasm64) + .Case("csky", Triple::csky) + .Case("loongarch32", Triple::loongarch32) + .Case("loongarch64", Triple::loongarch64) + .Cases({"dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", + "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", "dxilv1.8", + "dxilv1.9"}, + Triple::dxil) + .Case("xtensa", Triple::xtensa) + .Default(Triple::UnknownArch); // Some architectures require special parsing logic just to compute the // ArchType result. 
@@ -1071,7 +1073,7 @@ Triple::Triple(std::string &&Str) : Data(std::move(Str)) { .StartsWith("mips64", Triple::GNUABI64) .StartsWith("mipsisa64", Triple::GNUABI64) .StartsWith("mipsisa32", Triple::GNU) - .Cases("mips", "mipsel", "mipsr6", "mipsr6el", Triple::GNU) + .Cases({"mips", "mipsel", "mipsr6", "mipsr6el"}, Triple::GNU) .Default(UnknownEnvironment); } } diff --git a/llvm/lib/TargetParser/Unix/Host.inc b/llvm/lib/TargetParser/Unix/Host.inc index aeb2f59..38b942d 100644 --- a/llvm/lib/TargetParser/Unix/Host.inc +++ b/llvm/lib/TargetParser/Unix/Host.inc @@ -59,10 +59,30 @@ static std::string updateTripleOSVersion(std::string TargetTripleString) { if (TT.getOS() == Triple::AIX && !TT.getOSMajorVersion()) { struct utsname name; if (uname(&name) != -1) { + std::string release = name.release; + + if (strcmp(name.sysname, "OS400") == 0) { + /* + PASE uses different versioning system than AIX. + The following table shows the currently supported PASE + releases and the corresponding AIX release: + -------------------------- + PASE | AIX + -------------------------- + V7R4 | 7.2 (TL2) + -------------------------- + V7R5 | 7.2 (TL5) + -------------------------- + V7R6 | 7.3 (TL1) + -------------------------- + */ + release = (release == "4" || release == "5") ? "2" : "3"; + } + std::string NewOSName = std::string(Triple::getOSTypeName(Triple::AIX)); NewOSName += name.version; NewOSName += '.'; - NewOSName += name.release; + NewOSName += release; NewOSName += ".0.0"; TT.setOSName(NewOSName); return TT.str(); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index e382cfe..dd13ce3 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -176,6 +176,8 @@ constexpr FeatureBitset FeaturesArrowlakeS = FeatureSM4; constexpr FeatureBitset FeaturesPantherlake = (FeaturesArrowlakeS ^ FeatureWIDEKL); +constexpr FeatureBitset FeaturesNovalake = + FeaturesPantherlake | FeaturePREFETCHI; constexpr FeatureBitset FeaturesClearwaterforest = (FeaturesSierraforest ^ FeatureWIDEKL) | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; @@ -379,6 +381,8 @@ constexpr ProcInfo Processors[] = { // Pantherlake microarchitecture based processors. { {"pantherlake"}, CK_Lunarlake, FEATURE_AVX2, FeaturesPantherlake, 'p', false }, { {"wildcatlake"}, CK_Lunarlake, FEATURE_AVX2, FeaturesPantherlake, 'p', false }, + // Novalake microarchitecture based processors. + { {"novalake"}, CK_Novalake, FEATURE_AVX2, FeaturesNovalake, 'r', false }, // Sierraforest microarchitecture based processors. { {"sierraforest"}, CK_Sierraforest, FEATURE_AVX2, FeaturesSierraforest, 'p', false }, // Grandridge microarchitecture based processors. diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp index b73a0ce..4645670 100644 --- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp +++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -147,7 +147,7 @@ public: private: // Only add checks if the module has the cfguard=2 flag. - int cfguard_module_flag = 0; + int CFGuardModuleFlag = 0; StringRef GuardFnName; Mechanism GuardMechanism = Mechanism::Check; FunctionType *GuardFnType = nullptr; @@ -162,9 +162,7 @@ public: static char ID; // Default constructor required for the INITIALIZE_PASS macro. 
- CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) { - initializeCFGuardPass(*PassRegistry::getPassRegistry()); - } + CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) {} bool doInitialization(Module &M) override { return Impl.doInitialization(M); } bool runOnFunction(Function &F) override { return Impl.runOnFunction(F); } @@ -173,7 +171,6 @@ public: } // end anonymous namespace void CFGuardImpl::insertCFGuardCheck(CallBase *CB) { - assert(CB->getModule()->getTargetTriple().isOSWindows() && "Only applicable for Windows targets"); assert(CB->isIndirectCall() && @@ -202,7 +199,6 @@ void CFGuardImpl::insertCFGuardCheck(CallBase *CB) { } void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) { - assert(CB->getModule()->getTargetTriple().isOSWindows() && "Only applicable for Windows targets"); assert(CB->isIndirectCall() && @@ -236,14 +232,13 @@ void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) { } bool CFGuardImpl::doInitialization(Module &M) { - // Check if this module has the cfguard flag and read its value. if (auto *MD = mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard"))) - cfguard_module_flag = MD->getZExtValue(); + CFGuardModuleFlag = MD->getZExtValue(); // Skip modules for which CFGuard checks have been disabled. - if (cfguard_module_flag != 2) + if (CFGuardModuleFlag != 2) return false; // Set up prototypes for the guard check and dispatch functions. @@ -264,9 +259,8 @@ bool CFGuardImpl::doInitialization(Module &M) { } bool CFGuardImpl::runOnFunction(Function &F) { - // Skip modules for which CFGuard checks have been disabled. - if (cfguard_module_flag != 2) + if (CFGuardModuleFlag != 2) return false; SmallVector<CallBase *, 8> IndirectCalls; @@ -286,19 +280,16 @@ bool CFGuardImpl::runOnFunction(Function &F) { } // If no checks are needed, return early. - if (IndirectCalls.empty()) { + if (IndirectCalls.empty()) return false; - } // For each indirect call/invoke, add the appropriate dispatch or check. 
if (GuardMechanism == Mechanism::Dispatch) { - for (CallBase *CB : IndirectCalls) { + for (CallBase *CB : IndirectCalls) insertCFGuardDispatch(CB); - } } else { - for (CallBase *CB : IndirectCalls) { + for (CallBase *CB : IndirectCalls) insertCFGuardCheck(CB); - } } return true; diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp index f166fef..cf7e450 100644 --- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp @@ -153,26 +153,23 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C, bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine(); bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe); if (IsCallerPresplitCoroutine && HasAttr) { - BranchProbability MinBranchProbability( - static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution), - MinBlockCounterExecution); - auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller); - auto Prob = BranchProbability::getBranchProbability( - BFI.getBlockFreq(CB->getParent()).getFrequency(), - BFI.getEntryFreq().getFrequency()); + auto BlockFreq = BFI.getBlockFreq(CB->getParent()).getFrequency(); + auto EntryFreq = BFI.getEntryFreq().getFrequency(); + uint64_t MinFreq = + static_cast<uint64_t>(EntryFreq * CoroElideBranchRatio); - if (Prob < MinBranchProbability) { + if (BlockFreq < MinFreq) { ORE.emit([&]() { return OptimizationRemarkMissed( DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller) << "'" << ore::NV("callee", Callee->getName()) << "' not elided in '" << ore::NV("caller", Caller->getName()) - << "' because of low probability: " - << ore::NV("probability", Prob) << " (threshold: " - << ore::NV("threshold", MinBranchProbability) << ")"; + << "' because of low frequency: " + << ore::NV("block_freq", BlockFreq) + << " (threshold: " << ore::NV("min_freq", MinFreq) << ")"; }); continue; } @@ -188,7 +185,8 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C, return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller) << "'" << ore::NV("callee", Callee->getName()) << "' elided in '" << ore::NV("caller", Caller->getName()) - << "' (probability: " << ore::NV("probability", Prob) << ")"; + << "' (block_freq: " << ore::NV("block_freq", BlockFreq) + << ")"; }); FAM.invalidate(*Caller, PreservedAnalyses::none()); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 5066a99..894d83f 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -6150,3 +6150,42 @@ void MemProfContextDisambiguation::run( IndexCallsiteContextGraph CCG(Index, isPrevailing); CCG.process(); } + +// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline +// when we don't have an index that has recorded that we are linking with +// allocation libraries containing the necessary APIs for downstream +// transformations. +PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) { + // The profile matcher applies hotness attributes directly for allocations, + // and those will cause us to generate calls to the hot/cold interfaces + // unconditionally. If supports-hot-cold-new was not enabled in the LTO + // link then assume we don't want these calls (e.g. not linking with + // the appropriate library, or otherwise trying to disable this behavior). 
+ bool Changed = false; + for (auto &F : M) { + for (auto &BB : F) { + for (auto &I : BB) { + auto *CI = dyn_cast<CallBase>(&I); + if (!CI) + continue; + if (CI->hasFnAttr("memprof")) { + CI->removeFnAttr("memprof"); + Changed = true; + } + if (!CI->hasMetadata(LLVMContext::MD_callsite)) { + assert(!CI->hasMetadata(LLVMContext::MD_memprof)); + continue; + } + // Strip off all memprof metadata as it is no longer needed. + // Importantly, this avoids the addition of new memprof attributes + // after inlining propagation. + CI->setMetadata(LLVMContext::MD_memprof, nullptr); + CI->setMetadata(LLVMContext::MD_callsite, nullptr); + Changed = true; + } + } + } + if (!Changed) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 4c9b10a..cdc559b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -156,9 +156,9 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); Type *Ty = CI.getType(); - if (auto *SrcC = dyn_cast<Constant>(Src)) - if (Constant *Res = ConstantFoldCastOperand(CI.getOpcode(), SrcC, Ty, DL)) - return replaceInstUsesWith(CI, Res); + if (Value *Res = + simplifyCastInst(CI.getOpcode(), Src, Ty, SQ.getWithInstruction(&CI))) + return replaceInstUsesWith(CI, Res); // Try to eliminate a cast of a cast. if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 07ad65c..fba1ccf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1481,13 +1481,13 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp, return new ICmpInst(Pred, Y, ConstantInt::get(SrcTy, C.logBase2())); } - if (Cmp.isEquality() && Trunc->hasOneUse()) { + if (Cmp.isEquality() && (Trunc->hasOneUse() || Trunc->hasNoUnsignedWrap())) { // Canonicalize to a mask and wider compare if the wide type is suitable: // (trunc X to i8) == C --> (X & 0xff) == (zext C) if (!SrcTy->isVectorTy() && shouldChangeType(DstBits, SrcBits)) { Constant *Mask = ConstantInt::get(SrcTy, APInt::getLowBitsSet(SrcBits, DstBits)); - Value *And = Builder.CreateAnd(X, Mask); + Value *And = Trunc->hasNoUnsignedWrap() ? X : Builder.CreateAnd(X, Mask); Constant *WideC = ConstantInt::get(SrcTy, C.zext(SrcBits)); return new ICmpInst(Pred, And, WideC); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 09cb225..a8eb9b9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3757,6 +3757,10 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder, // (x < y) ? -1 : zext(x > y) // (x > y) ? 1 : sext(x != y) // (x > y) ? 1 : sext(x < y) +// (x == y) ? 0 : (x > y ? 1 : -1) +// (x == y) ? 0 : (x < y ? -1 : 1) +// Special case: x == C ? 0 : (x > C - 1 ? 1 : -1) +// Special case: x == C ? 0 : (x < C + 1 ? -1 : 1) // Into ucmp/scmp(x, y), where signedness is determined by the signedness // of the comparison in the original sequence. 
Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { @@ -3849,6 +3853,44 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { } } + // Special cases with constants: x == C ? 0 : (x > C-1 ? 1 : -1) + if (Pred == ICmpInst::ICMP_EQ && match(TV, m_Zero())) { + const APInt *C; + if (match(RHS, m_APInt(C))) { + CmpPredicate InnerPred; + Value *InnerRHS; + const APInt *InnerTV, *InnerFV; + if (match(FV, + m_Select(m_ICmp(InnerPred, m_Specific(LHS), m_Value(InnerRHS)), + m_APInt(InnerTV), m_APInt(InnerFV)))) { + + // x == C ? 0 : (x > C-1 ? 1 : -1) + if (ICmpInst::isGT(InnerPred) && InnerTV->isOne() && + InnerFV->isAllOnes()) { + IsSigned = ICmpInst::isSigned(InnerPred); + bool CanSubOne = IsSigned ? !C->isMinSignedValue() : !C->isMinValue(); + if (CanSubOne) { + APInt Cminus1 = *C - 1; + if (match(InnerRHS, m_SpecificInt(Cminus1))) + Replace = true; + } + } + + // x == C ? 0 : (x < C+1 ? -1 : 1) + if (ICmpInst::isLT(InnerPred) && InnerTV->isAllOnes() && + InnerFV->isOne()) { + IsSigned = ICmpInst::isSigned(InnerPred); + bool CanAddOne = IsSigned ? !C->isMaxSignedValue() : !C->isMaxValue(); + if (CanAddOne) { + APInt Cplus1 = *C + 1; + if (match(InnerRHS, m_SpecificInt(Cplus1))) + Replace = true; + } + } + } + } + } + Intrinsic::ID IID = IsSigned ? Intrinsic::scmp : Intrinsic::ucmp; if (Replace) return replaceInstUsesWith( diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 511bca4..2646334 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -605,17 +605,16 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, return Mapping; } -namespace llvm { -void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, - bool IsKasan, uint64_t *ShadowBase, - int *MappingScale, bool *OrShadowOffset) { +void llvm::getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, + bool IsKasan, uint64_t *ShadowBase, + int *MappingScale, bool *OrShadowOffset) { auto Mapping = getShadowMapping(TargetTriple, LongSize, IsKasan); *ShadowBase = Mapping.Offset; *MappingScale = Mapping.Scale; *OrShadowOffset = Mapping.OrShadowOffset; } -void removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) { +void llvm::removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) { // Sanitizer checks read from shadow, which invalidates memory(argmem: *). // // This is not only true for sanitized functions, because AttrInfer can @@ -668,8 +667,6 @@ ASanAccessInfo::ASanAccessInfo(bool IsWrite, bool CompileKernel, AccessSizeIndex(AccessSizeIndex), IsWrite(IsWrite), CompileKernel(CompileKernel) {} -} // namespace llvm - static uint64_t getRedzoneSizeForScale(int MappingScale) { // Redzone used for stack and globals is at least 32 bytes. // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. 
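For the foldSelectToCmp special cases added above, a sketch (not from the patch) of the source-level shape being matched; with C == 5 the inner comparison uses C - 1, so the whole expression is a signed three-way compare:

// x == C ? 0 : (x > C-1 ? 1 : -1), shown here with C == 5.
int threeWayCompare(int x) {
  return x == 5 ? 0 : (x > 4 ? 1 : -1);
}
// InstCombine can now rewrite the nested selects into a single scmp(x, 5),
// which yields 0 on equality, 1 if x is greater, and -1 if x is smaller.

The CanSubOne and CanAddOne guards exist because the rewrite is only valid when C - 1 or C + 1 does not wrap for the comparison's signedness.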
@@ -677,11 +674,10 @@ static uint64_t getRedzoneSizeForScale(int MappingScale) { } static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) { - if (TargetTriple.isOSEmscripten()) { + if (TargetTriple.isOSEmscripten()) return kAsanEmscriptenCtorAndDtorPriority; - } else { + else return kAsanCtorAndDtorPriority; - } } static Twine genName(StringRef suffix) { @@ -848,6 +844,7 @@ struct AddressSanitizer { bool maybeInsertAsanInitAtFunctionEntry(Function &F); bool maybeInsertDynamicShadowAtFunctionEntry(Function &F); void markEscapedLocalAllocas(Function &F); + void markCatchParametersAsUninteresting(Function &F); private: friend struct FunctionStackPoisoner; @@ -3001,6 +2998,22 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) { } } } +// Mitigation for https://github.com/google/sanitizers/issues/749 +// We don't instrument Windows catch-block parameters to avoid +// interfering with exception handling assumptions. +void AddressSanitizer::markCatchParametersAsUninteresting(Function &F) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (auto *CatchPad = dyn_cast<CatchPadInst>(&I)) { + // Mark the parameters to a catch-block as uninteresting to avoid + // instrumenting them. + for (Value *Operand : CatchPad->arg_operands()) + if (auto *AI = dyn_cast<AllocaInst>(Operand)) + ProcessedAllocas[AI] = false; + } + } + } +} bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) { bool ShouldInstrument = @@ -3045,6 +3058,9 @@ bool AddressSanitizer::instrumentFunction(Function &F, // can be passed to that intrinsic. markEscapedLocalAllocas(F); + if (TargetTriple.isOSWindows()) + markCatchParametersAsUninteresting(F); + // We want to instrument every address only once per basic block (unless there // are calls between uses). SmallPtrSet<Value *, 16> TempsToInstrument; diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 7c78eb3..72e8e50 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -396,9 +396,8 @@ class CHR { } // end anonymous namespace -static inline -raw_ostream LLVM_ATTRIBUTE_UNUSED &operator<<(raw_ostream &OS, - const CHRStats &Stats) { +[[maybe_unused]] static inline raw_ostream &operator<<(raw_ostream &OS, + const CHRStats &Stats) { Stats.print(OS); return OS; } @@ -425,8 +424,8 @@ static bool shouldApply(Function &F, ProfileSummaryInfo &PSI) { return PSI.isFunctionEntryHot(&F); } -static void LLVM_ATTRIBUTE_UNUSED dumpIR(Function &F, const char *Label, - CHRStats *Stats) { +[[maybe_unused]] static void dumpIR(Function &F, const char *Label, + CHRStats *Stats) { StringRef FuncName = F.getName(); StringRef ModuleName = F.getParent()->getName(); (void)(FuncName); // Unused in release build. @@ -1622,7 +1621,7 @@ static void insertTrivialPHIs(CHRScope *Scope, } // Assert that all the CHR regions of the scope have a biased branch or select. -static void LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static void assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) { #ifndef NDEBUG auto HasBiasedBranchOrSelect = [](RegInfo &RI, CHRScope *Scope) { @@ -1644,8 +1643,9 @@ assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) { // Assert that all the condition values of the biased branches and selects have // been hoisted to the pre-entry block or outside of the scope. 
-static void LLVM_ATTRIBUTE_UNUSED assertBranchOrSelectConditionHoisted( - CHRScope *Scope, BasicBlock *PreEntryBlock) { +[[maybe_unused]] static void +assertBranchOrSelectConditionHoisted(CHRScope *Scope, + BasicBlock *PreEntryBlock) { CHR_DEBUG(dbgs() << "Biased regions condition values \n"); for (RegInfo &RI : Scope->CHRRegions) { Region *R = RI.R; @@ -2007,8 +2007,8 @@ void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) { } } -static void LLVM_ATTRIBUTE_UNUSED -dumpScopes(SmallVectorImpl<CHRScope *> &Scopes, const char *Label) { +[[maybe_unused]] static void dumpScopes(SmallVectorImpl<CHRScope *> &Scopes, + const char *Label) { dbgs() << Label << " " << Scopes.size() << "\n"; for (CHRScope *Scope : Scopes) { dbgs() << *Scope << "\n"; @@ -2092,8 +2092,6 @@ bool CHR::run() { return Changed; } -namespace llvm { - ControlHeightReductionPass::ControlHeightReductionPass() { parseCHRFilterFiles(); } @@ -2116,5 +2114,3 @@ PreservedAnalyses ControlHeightReductionPass::run( return PreservedAnalyses::all(); return PreservedAnalyses::none(); } - -} // namespace llvm diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 09db464..386e48f 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -326,8 +326,7 @@ const unsigned BBState::OverflowOccurredValue = 0xffffffff; namespace llvm { -raw_ostream &operator<<(raw_ostream &OS, - BBState &BBState) LLVM_ATTRIBUTE_UNUSED; +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, BBState &BBState); } // end namespace llvm diff --git a/llvm/lib/Transforms/ObjCARC/PtrState.h b/llvm/lib/Transforms/ObjCARC/PtrState.h index 232db2b..5cc4212 100644 --- a/llvm/lib/Transforms/ObjCARC/PtrState.h +++ b/llvm/lib/Transforms/ObjCARC/PtrState.h @@ -47,8 +47,7 @@ enum Sequence { S_MovableRelease ///< objc_release(x), !clang.imprecise_release. }; -raw_ostream &operator<<(raw_ostream &OS, - const Sequence S) LLVM_ATTRIBUTE_UNUSED; +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const Sequence S); /// Unidirectional information about either a /// retain-decrement-use-release sequence or release-use-decrement-retain diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 7ad710d..6141b6d 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -77,6 +77,7 @@ #include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" @@ -805,9 +806,8 @@ tryToMergePartialOverlappingStores(StoreInst *KillingI, StoreInst *DeadI, return nullptr; } -namespace { // Returns true if \p I is an intrinsic that does not read or write memory. -bool isNoopIntrinsic(Instruction *I) { +static bool isNoopIntrinsic(Instruction *I) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { case Intrinsic::lifetime_start: @@ -828,7 +828,7 @@ bool isNoopIntrinsic(Instruction *I) { } // Check if we can ignore \p D for DSE. 
-bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { +static bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { Instruction *DI = D->getMemoryInst(); // Calls that only access inaccessible memory cannot read or write any memory // locations we consider for elimination. @@ -856,6 +856,8 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) { return false; } +namespace { + // A memory location wrapper that represents a MemoryLocation, `MemLoc`, // defined by `MemDef`. struct MemoryLocationWrapper { @@ -889,23 +891,25 @@ struct MemoryDefWrapper { SmallVector<MemoryLocationWrapper, 1> DefinedLocations; }; -bool hasInitializesAttr(Instruction *I) { - CallBase *CB = dyn_cast<CallBase>(I); - return CB && CB->getArgOperandWithAttribute(Attribute::Initializes); -} - struct ArgumentInitInfo { unsigned Idx; bool IsDeadOrInvisibleOnUnwind; ConstantRangeList Inits; }; +} // namespace + +static bool hasInitializesAttr(Instruction *I) { + CallBase *CB = dyn_cast<CallBase>(I); + return CB && CB->getArgOperandWithAttribute(Attribute::Initializes); +} // Return the intersected range list of the initializes attributes of "Args". // "Args" are call arguments that alias to each other. // If any argument in "Args" doesn't have dead_on_unwind attr and // "CallHasNoUnwindAttr" is false, return empty. -ConstantRangeList getIntersectedInitRangeList(ArrayRef<ArgumentInitInfo> Args, - bool CallHasNoUnwindAttr) { +static ConstantRangeList +getIntersectedInitRangeList(ArrayRef<ArgumentInitInfo> Args, + bool CallHasNoUnwindAttr) { if (Args.empty()) return {}; @@ -925,6 +929,8 @@ ConstantRangeList getIntersectedInitRangeList(ArrayRef<ArgumentInitInfo> Args, return IntersectedIntervals; } +namespace { + struct DSEState { Function &F; AliasAnalysis &AA; @@ -2328,10 +2334,11 @@ struct DSEState { // change state: whether make any change. bool eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper); }; +} // namespace // Return true if "Arg" is function local and isn't captured before "CB". 
-bool isFuncLocalAndNotCaptured(Value *Arg, const CallBase *CB, - EarliestEscapeAnalysis &EA) { +static bool isFuncLocalAndNotCaptured(Value *Arg, const CallBase *CB, + EarliestEscapeAnalysis &EA) { const Value *UnderlyingObj = getUnderlyingObject(Arg); return isIdentifiedFunctionLocal(UnderlyingObj) && capturesNothing( @@ -2627,7 +2634,6 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, return MadeChange; } -} // end anonymous namespace //===----------------------------------------------------------------------===// // DSE Pass @@ -2728,8 +2734,6 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, false) -namespace llvm { -LLVM_ABI FunctionPass *createDeadStoreEliminationPass() { +LLVM_ABI FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSELegacyPass(); } -} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 1c88532..b9534def 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -73,24 +73,17 @@ #include <utility> using namespace llvm; +using namespace llvm::GVNExpression; #define DEBUG_TYPE "gvn-sink" STATISTIC(NumRemoved, "Number of instructions removed"); -namespace llvm { -namespace GVNExpression { - LLVM_DUMP_METHOD void Expression::dump() const { print(dbgs()); dbgs() << "\n"; } -} // end namespace GVNExpression -} // end namespace llvm - -namespace { - static bool isMemoryInst(const Instruction *I) { return isa<LoadInst>(I) || isa<StoreInst>(I) || (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) || @@ -99,6 +92,8 @@ static bool isMemoryInst(const Instruction *I) { //===----------------------------------------------------------------------===// +namespace { + /// Candidate solution for sinking. There may be different ways to /// sink instructions, differing in the number of instructions sunk, /// the number of predecessors sunk from and the number of PHIs @@ -125,14 +120,6 @@ struct SinkingInstructionCandidate { } }; -#ifndef NDEBUG -raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) { - OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks - << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">"; - return OS; -} -#endif - //===----------------------------------------------------------------------===// /// Describes a PHI node that may or may not exist. 
These track the PHIs @@ -256,8 +243,18 @@ public: return Values == Other.Values && Blocks == Other.Blocks; } }; +} // namespace -template <typename ModelledPHI> struct DenseMapInfo { +#ifndef NDEBUG +static raw_ostream &operator<<(raw_ostream &OS, + const SinkingInstructionCandidate &C) { + OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks + << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">"; + return OS; +} +#endif + +template <> struct llvm::DenseMapInfo<ModelledPHI> { static inline ModelledPHI &getEmptyKey() { static ModelledPHI Dummy = ModelledPHI::createDummy(0); return Dummy; @@ -275,7 +272,9 @@ template <typename ModelledPHI> struct DenseMapInfo { } }; -using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>; +using ModelledPHISet = DenseSet<ModelledPHI>; + +namespace { //===----------------------------------------------------------------------===// // ValueTable @@ -290,7 +289,7 @@ using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>; /// /// This class also contains fields for discriminators used when determining /// equivalence of instructions with sideeffects. -class InstructionUseExpr : public GVNExpression::BasicExpression { +class InstructionUseExpr : public BasicExpression { unsigned MemoryUseOrder = -1; bool Volatile = false; ArrayRef<int> ShuffleMask; @@ -298,7 +297,7 @@ class InstructionUseExpr : public GVNExpression::BasicExpression { public: InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R, BumpPtrAllocator &A) - : GVNExpression::BasicExpression(I->getNumUses()) { + : BasicExpression(I->getNumUses()) { allocateOperands(R, A); setOpcode(I->getOpcode()); setType(I->getType()); @@ -315,8 +314,8 @@ public: void setVolatile(bool V) { Volatile = V; } hash_code getHashValue() const override { - return hash_combine(GVNExpression::BasicExpression::getHashValue(), - MemoryUseOrder, Volatile, ShuffleMask); + return hash_combine(BasicExpression::getHashValue(), MemoryUseOrder, + Volatile, ShuffleMask); } template <typename Function> hash_code getHashValue(Function MapFn) { @@ -332,7 +331,7 @@ using BasicBlocksSet = SmallPtrSet<const BasicBlock *, 32>; class ValueTable { DenseMap<Value *, uint32_t> ValueNumbering; - DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering; + DenseMap<Expression *, uint32_t> ExpressionNumbering; DenseMap<size_t, uint32_t> HashNumbering; BumpPtrAllocator Allocator; ArrayRecycler<Value *> Recycler; @@ -594,6 +593,7 @@ private: } } }; +} // namespace std::optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(LockstepReverseIterator<false> &LRI, @@ -851,8 +851,6 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, NumRemoved += Insts.size() - 1; } -} // end anonymous namespace - PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { GVNSink G; if (!G.run(F)) diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index d99f1eb..ddb99a5 100644 --- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -75,8 +75,6 @@ static cl::opt<bool> "expressed as branches by widenable conditions"), cl::init(true)); -namespace { - // Get the condition of \p I. It can either be a guard or a conditional branch. 
static Value *getCondition(Instruction *I) { if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) { @@ -130,6 +128,8 @@ findInsertionPointForWideCondition(Instruction *WCOrGuard) { return std::nullopt; } +namespace { + class GuardWideningImpl { DominatorTree &DT; PostDominatorTree *PDT; @@ -328,7 +328,7 @@ public: /// The entry point for this pass. bool run(); }; -} +} // namespace static bool isSupportedGuardInstruction(const Instruction *Insn) { if (isGuard(Insn)) diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index c327311..7ebcc21 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -53,6 +53,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" @@ -117,6 +118,10 @@ static cl::opt<bool> LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), cl::desc("Predicate conditions in read only loops")); +static cl::opt<bool> LoopPredicationTraps( + "indvars-predicate-loop-traps", cl::Hidden, cl::init(true), + cl::desc("Predicate conditions that trap in loops with only local writes")); + static cl::opt<bool> AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true), cl::desc("Allow widening of indvars to eliminate s/zext")); @@ -1704,6 +1709,24 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { return Changed; } +static bool crashingBBWithoutEffect(const BasicBlock &BB) { + return llvm::all_of(BB, [](const Instruction &I) { + // TODO: for now this is overly restrictive, to make sure nothing in this + // BB can depend on the loop body. + // It's not enough to check for !I.mayHaveSideEffects(), because e.g. a + // load does not have a side effect, but we could have + // %a = load ptr, ptr %ptr + // %b = load i32, ptr %a + // Now if the loop stored a non-nullptr to %a, we could cause a nullptr + // dereference by skipping over loop iterations. + if (const auto *CB = dyn_cast<CallBase>(&I)) { + if (CB->onlyAccessesInaccessibleMemory()) + return true; + } + return isa<UnreachableInst>(I); + }); +} + bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -1816,11 +1839,25 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { // suggestions on how to improve this? I can obviously bail out for outer // loops, but that seems less than ideal. MemorySSA can find memory writes, // is that enough for *all* side effects? + bool HasThreadLocalSideEffects = false; for (BasicBlock *BB : L->blocks()) for (auto &I : *BB) // TODO:isGuaranteedToTransfer - if (I.mayHaveSideEffects()) - return false; + if (I.mayHaveSideEffects()) { + if (!LoopPredicationTraps) + return false; + HasThreadLocalSideEffects = true; + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { + // Simple stores cannot be observed by other threads. + // If HasThreadLocalSideEffects is set, we check + // crashingBBWithoutEffect to make sure that the crashing BB cannot + // observe them either. + if (!SI->isSimple()) + return false; + } else { + return false; + } + } bool Changed = false; // Finally, do the actual predication for all predicatable blocks. 
A couple
@@ -1840,6 +1877,19 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
     const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
     auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+    if (HasThreadLocalSideEffects) {
+      const BasicBlock *Unreachable = nullptr;
+      for (const BasicBlock *Succ : BI->successors()) {
+        if (isa<UnreachableInst>(Succ->getTerminator()))
+          Unreachable = Succ;
+      }
+      // Exit BBs which have one branch back into the loop and another one to
+      // a trap can still be optimized, because local side effects cannot
+      // be observed in the exit case (the trap). We could be smarter about
+      // this, but for now let's pattern-match common cases that directly trap.
+      if (Unreachable == nullptr || !crashingBBWithoutEffect(*Unreachable))
+        return Changed;
+    }
     Value *NewCond;
     if (ExitCount == ExactBTC) {
       NewCond = L->contains(BI->getSuccessor(0)) ?
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 3c14036e..6fb8197 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -26,8 +26,6 @@
 using namespace llvm;

-namespace llvm {
-
 static cl::opt<unsigned>
     JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
                            cl::desc("Only split jump tables with size less or "
@@ -43,8 +41,8 @@ static cl::opt<unsigned> FunctionSizeThreshold(
                  "or equal than this threshold."),
     cl::init(50));

+namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
 } // end namespace llvm

 #define DEBUG_TYPE "jump-table-to-switch"
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9655173..b2c526b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -116,8 +116,6 @@ STATISTIC(NumIntAssociationsHoisted,
 STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
                                     "reassociated and hoisted out of the loop");

-namespace llvm {
-
 /// Memory promotion is enabled by default.
 static cl::opt<bool>
     DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
@@ -156,7 +154,7 @@ static cl::opt<unsigned> IntAssociationUpperLimit(
 // which may not be precise, since optimizeUses is capped. The result is
 // correct, but we may not get as "far up" as possible to get which access is
 // clobbering the one queried.
-cl::opt<unsigned> SetLicmMssaOptCap(
+cl::opt<unsigned> llvm::SetLicmMssaOptCap(
     "licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
     cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
              "for faster compile. Caps the MemorySSA clobbering calls."));
@@ -164,15 +162,15 @@ cl::opt<unsigned> SetLicmMssaOptCap(
 // Experimentally, memory promotion carries less importance than sinking and
 // hoisting. Limit when we do promotion when using MemorySSA, in order to save
 // compile time.
-cl::opt<unsigned> SetLicmMssaNoAccForPromotionCap(
+cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
     "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
     cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
              "effect.
When MSSA in LICM is enabled, then this is the maximum " "number of accesses allowed to be present in a loop in order to " "enable memory promotion.")); +namespace llvm { extern cl::opt<bool> ProfcheckDisableMetadataFixes; - } // end namespace llvm static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); @@ -1120,11 +1118,10 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, return false; } -namespace { /// Return true if-and-only-if we know how to (mechanically) both hoist and /// sink a given instruction out of a loop. Does not address legality /// concerns such as aliasing or speculation safety. -bool isHoistableAndSinkableInst(Instruction &I) { +static bool isHoistableAndSinkableInst(Instruction &I) { // Only these instructions are hoistable/sinkable. return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) || isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) || @@ -1136,8 +1133,8 @@ bool isHoistableAndSinkableInst(Instruction &I) { } /// Return true if I is the only Instruction with a MemoryAccess in L. -bool isOnlyMemoryAccess(const Instruction *I, const Loop *L, - const MemorySSAUpdater &MSSAU) { +static bool isOnlyMemoryAccess(const Instruction *I, const Loop *L, + const MemorySSAUpdater &MSSAU) { for (auto *BB : L->getBlocks()) if (auto *Accs = MSSAU.getMemorySSA()->getBlockAccesses(BB)) { int NotAPhi = 0; @@ -1151,7 +1148,6 @@ bool isOnlyMemoryAccess(const Instruction *I, const Loop *L, } return true; } -} static MemoryAccess *getClobberingMemoryAccess(MemorySSA &MSSA, BatchAAResults &BAA, diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp index 73f1942..7706de8 100644 --- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp +++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp @@ -21,8 +21,7 @@ #define DEBUG_TYPE "loop-bound-split" -namespace llvm { - +using namespace llvm; using namespace PatternMatch; namespace { @@ -358,8 +357,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI, IRBuilder<> Builder(&PostLoopPreHeader->front()); // Update phi nodes in header of post-loop. - bool isExitingLatch = - (L.getExitingBlock() == L.getLoopLatch()) ? true : false; + bool isExitingLatch = L.getExitingBlock() == L.getLoopLatch(); Value *ExitingCondLCSSAPhi = nullptr; for (PHINode &PN : L.getHeader()->phis()) { // Create LCSSA phi node in preheader of post-loop. 
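A pattern repeated in the JumpTableToSwitch and LICM hunks above (and in several passes below) is to stop reopening namespace llvm around definitions and instead qualify the defined name. A minimal sketch of the idiom, with invented names (SomeCap and some-cap are illustrative, not from the patch):

#include "llvm/Support/CommandLine.h"

// Declaration, normally in a header:
namespace llvm {
extern cl::opt<unsigned> SomeCap;
} // end namespace llvm

// Definition: qualifying the name requires a prior declaration, so a typo or
// type mismatch becomes a compile error instead of silently creating a
// brand-new entity in namespace llvm.
llvm::cl::opt<unsigned> llvm::SomeCap(
    "some-cap", llvm::cl::init(100), llvm::cl::Hidden,
    llvm::cl::desc("Illustrative option; not part of this patch"));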
@@ -472,8 +470,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
 PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
                                           LoopStandardAnalysisResults &AR,
                                           LPMUpdater &U) {
-  Function &F = *L.getHeader()->getParent();
-  (void)F;
+  [[maybe_unused]] Function &F = *L.getHeader()->getParent();
   LLVM_DEBUG(dbgs() << "Splitting bound of loop in " << F.getName() << ": " << L
                     << "\n");

@@ -486,5 +483,3 @@ PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,

   return getLoopPassPreservedAnalyses();
 }
-
-} // end namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 20733032..19eccb9 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -368,7 +368,7 @@ private:
     Valid = false;
   }

-  bool reportInvalidCandidate(llvm::Statistic &Stat) const {
+  bool reportInvalidCandidate(Statistic &Stat) const {
     using namespace ore;
     assert(L && Preheader && "Fusion candidate not initialized properly!");
 #if LLVM_ENABLE_STATS
@@ -445,6 +445,7 @@ struct FusionCandidateCompare {
            "No dominance relationship between these fusion candidates!");
   }
 };
+} // namespace

 using LoopVector = SmallVector<Loop *, 4>;

@@ -461,9 +462,15 @@ using LoopVector = SmallVector<Loop *, 4>;
 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;

-#if !defined(NDEBUG)
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
-                                     const FusionCandidate &FC) {
+#ifndef NDEBUG
+static void printLoopVector(const LoopVector &LV) {
+  dbgs() << "****************************\n";
+  for (const Loop *L : LV)
+    printLoop(*L, dbgs());
+  dbgs() << "****************************\n";
+}
+
+static raw_ostream &operator<<(raw_ostream &OS, const FusionCandidate &FC) {
   if (FC.isValid())
     OS << FC.Preheader->getName();
   else
@@ -472,8 +479,8 @@ static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
   return OS;
 }

-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
-                                     const FusionCandidateSet &CandSet) {
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const FusionCandidateSet &CandSet) {
   for (const FusionCandidate &FC : CandSet)
     OS << FC << '\n';

@@ -489,7 +496,9 @@ printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
     dbgs() << "****************************\n";
   }
 }
-#endif
+#endif // NDEBUG
+
+namespace {

 /// Collect all loops in function at the same nest level, starting at the
 /// outermost level.
@@ -550,15 +559,6 @@ private:
   LoopsOnLevelTy LoopsOnLevel;
 };

-#ifndef NDEBUG
-static void printLoopVector(const LoopVector &LV) {
-  dbgs() << "****************************\n";
-  for (auto *L : LV)
-    printLoop(*L, dbgs());
-  dbgs() << "****************************\n";
-}
-#endif
-
 struct LoopFuser {
 private:
   // Sets of control flow equivalent fusion candidates for a given nest level.
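The LoopBoundSplit hunk above also swaps the manual (void)F silencing for [[maybe_unused]], mirroring the LLVM_ATTRIBUTE_UNUSED replacements elsewhere in this patch. A short before/after sketch, assuming the usual behavior that LLVM_DEBUG compiles away under NDEBUG:

// Before: F's only use is inside LLVM_DEBUG, which expands to nothing in
// release builds, so a cast-to-void is needed to silence -Wunused-variable.
//   Function &F = *L.getHeader()->getParent();
//   (void)F;

// After: the standard C++17 attribute states the intent at the declaration
// itself and needs no separate statement.
[[maybe_unused]] Function &F = *L.getHeader()->getParent();
LLVM_DEBUG(dbgs() << "Splitting bound of loop in " << F.getName() << "\n");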
@@ -1850,7 +1850,7 @@ private: /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> template <typename RemarkKind> void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, - llvm::Statistic &Stat) { + Statistic &Stat) { assert(FC0.Preheader && FC1.Preheader && "Expecting valid fusion candidates"); using namespace ore; diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 32078b1..d827e64 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -8,7 +8,6 @@ #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -16,8 +15,6 @@ using namespace llvm; -namespace llvm { - /// Explicitly specialize the pass manager's run method to handle loop nest /// structure updates. PreservedAnalyses @@ -185,7 +182,6 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM, } return PA; } -} // namespace llvm void FunctionToLoopPassAdaptor::printPipeline( raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { @@ -193,6 +189,7 @@ void FunctionToLoopPassAdaptor::printPipeline( Pass->printPipeline(OS, MapClassName2PassName); OS << ')'; } + PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, FunctionAnalysisManager &AM) { // Before we even compute any loop analyses, first run a miniature function @@ -219,9 +216,6 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, // Get the analysis results needed by loop passes. MemorySSA *MSSA = UseMemorySSA ? (&AM.getResult<MemorySSAAnalysis>(F).getMSSA()) : nullptr; - BlockFrequencyInfo *BFI = UseBlockFrequencyInfo && F.hasProfileData() - ? (&AM.getResult<BlockFrequencyAnalysis>(F)) - : nullptr; LoopStandardAnalysisResults LAR = {AM.getResult<AAManager>(F), AM.getResult<AssumptionAnalysis>(F), AM.getResult<DominatorTreeAnalysis>(F), @@ -229,7 +223,6 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, AM.getResult<ScalarEvolutionAnalysis>(F), AM.getResult<TargetLibraryAnalysis>(F), AM.getResult<TargetIRAnalysis>(F), - BFI, MSSA}; // Setup the loop analysis manager from its proxy. 
It is important that
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 448dc2b..f3e6cbf 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -540,8 +540,6 @@ bool LoopVersioningLICM::run(DominatorTree *DT) {
   return Changed;
 }

-namespace llvm {
-
 PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
                                               LoopStandardAnalysisResults &LAR,
                                               LPMUpdater &U) {
@@ -556,4 +554,3 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
 }
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 7cae94eb..3487e81 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -97,6 +97,12 @@ static cl::opt<MatrixLayoutTy> MatrixLayout(
 static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt",
                                             cl::init(false));

+static cl::opt<unsigned> SplitMatmulRemainderOverThreshold(
+    "matrix-split-matmul-remainder-over-threshold", cl::Hidden,
+    cl::desc("Illegal remainder vectors over this size in bits should be split "
+             "in the inner loop of matmul"),
+    cl::init(0));
+
 /// Helper function to either return Scope, if it is a subprogram or the
 /// attached subprogram for a local scope.
 static DISubprogram *getSubprogram(DIScope *Scope) {
@@ -115,18 +121,16 @@ static bool isSplat(Value *V) {

 /// Match any mul operation (fp or integer).
 template <typename LTy, typename RTy>
-auto m_AnyMul(const LTy &L, const RTy &R) {
+static auto m_AnyMul(const LTy &L, const RTy &R) {
   return m_CombineOr(m_Mul(L, R), m_FMul(L, R));
 }

 /// Match any add operation (fp or integer).
 template <typename LTy, typename RTy>
-auto m_AnyAdd(const LTy &L, const RTy &R) {
+static auto m_AnyAdd(const LTy &L, const RTy &R) {
   return m_CombineOr(m_Add(L, R), m_FAdd(L, R));
 }

-namespace {
-
 // Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
 // the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
 // assuming \p Stride elements between the starts of two consecutive vectors.
@@ -167,9 +171,9 @@ namespace {
 //          v_2_0 |v_2_1 |v_2_2 |v_2_3
 //          v_3_0 {v_3_1 {v_3_2 v_3_3
 //
-Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
-                         unsigned NumElements, Type *EltType,
-                         IRBuilder<> &Builder) {
+static Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
+                                unsigned NumElements, Type *EltType,
+                                IRBuilder<> &Builder) {

   assert((!isa<ConstantInt>(Stride) ||
           cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
@@ -338,6 +342,8 @@ computeShapeInfoForInst(Instruction *I,
   return std::nullopt;
 }

+namespace {
+
 /// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
 ///
 /// Currently, the lowering for each matrix intrinsic is done as follows:
@@ -371,7 +377,8 @@ class LowerMatrixIntrinsics {
   LoopInfo *LI = nullptr;
   OptimizationRemarkEmitter *ORE = nullptr;

-  /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation.
+  /// Contains estimates of the number of operations (loads, stores, compute)
+  /// required to lower a matrix operation.
   struct OpInfoTy {
     /// Number of stores emitted to generate this matrix.
unsigned NumStores = 0;
@@ -1719,6 +1726,31 @@ public:
     ToRemove.push_back(MatMul);
   }

+  /// Given \p Remainder iterations of the matmul inner loop, potentially
+  /// lower \p BlockSize, which is used for the underlying vector.
+  unsigned capBlockSize(unsigned BlockSize, unsigned Remainder, Type *EltType) {
+    if (BlockSize <= Remainder)
+      return BlockSize;
+
+    // If the remainder is also a legal type just use it.
+    auto *VecTy = FixedVectorType::get(EltType, Remainder);
+    if (TTI.isTypeLegal(VecTy))
+      return Remainder;
+
+    // Similarly, keep the remainder if the vector is small enough that we
+    // don't want to split further.
+    if (VecTy->getPrimitiveSizeInBits() <= SplitMatmulRemainderOverThreshold)
+      return Remainder;
+
+    // Gradually lower the vectorization factor to cover the
+    // remainder.
+    do {
+      BlockSize /= 2;
+    } while (BlockSize > Remainder);
+    return BlockSize;
+  }
+
   /// Compute \p Result += \p A * \p B for input matrices with left-associating
   /// addition.
   ///
@@ -1756,10 +1788,8 @@ public:
       bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));

       for (unsigned I = 0; I < R; I += BlockSize) {
-        // Gradually lower the vectorization factor to cover the remainder.
-        while (I + BlockSize > R)
-          BlockSize /= 2;
-
+        // Lower block size to make sure we stay within bounds.
+        BlockSize = capBlockSize(BlockSize, R - I, Result.getElementType());
         Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder)
                              : nullptr;
         for (unsigned K = 0; K < M; ++K) {
@@ -1784,9 +1814,8 @@ public:
     unsigned BlockSize = VF;
     bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
     for (unsigned J = 0; J < C; J += BlockSize) {
-      // Gradually lower the vectorization factor to cover the remainder.
-      while (J + BlockSize > C)
-        BlockSize /= 2;
+      // Lower the vectorization factor to cover the remainder.
+      BlockSize = capBlockSize(BlockSize, C - J, Result.getElementType());

       Value *Sum = nullptr;
       for (unsigned K = 0; K < M; ++K) {
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 80aa98d..5a8f18a 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -160,9 +160,6 @@ static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
 //===----------------------------------------------------------------------===//
 // Anchor methods.
-namespace llvm { -namespace GVNExpression { - Expression::~Expression() = default; BasicExpression::~BasicExpression() = default; CallExpression::~CallExpression() = default; @@ -171,9 +168,6 @@ StoreExpression::~StoreExpression() = default; AggregateValueExpression::~AggregateValueExpression() = default; PHIExpression::~PHIExpression() = default; -} // end namespace GVNExpression -} // end namespace llvm - namespace { // Tarjan's SCC finding algorithm with Nuutila's improvements diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index ba58b8e..6d7ce36 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -2623,32 +2623,32 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { namespace { - class ReassociateLegacyPass : public FunctionPass { - ReassociatePass Impl; +class ReassociateLegacyPass : public FunctionPass { + ReassociatePass Impl; - public: - static char ID; // Pass identification, replacement for typeid +public: + static char ID; // Pass identification, replacement for typeid - ReassociateLegacyPass() : FunctionPass(ID) { - initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry()); - } + ReassociateLegacyPass() : FunctionPass(ID) { + initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; - FunctionAnalysisManager DummyFAM; - auto PA = Impl.run(F, DummyFAM); - return !PA.areAllPreserved(); - } + FunctionAnalysisManager DummyFAM; + auto PA = Impl.run(F, DummyFAM); + return !PA.areAllPreserved(); + } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - }; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; } // end anonymous namespace diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 30b27cb..7646624 100644 --- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -107,9 +107,7 @@ PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) { return PA; } -namespace llvm { - -void initializeRegToMemWrapperPassPass(PassRegistry &); +namespace { class RegToMemWrapperPass : public FunctionPass { public: @@ -136,7 +134,7 @@ public: return N != 0 || Changed; } }; -} // namespace llvm +} // namespace INITIALIZE_PASS_BEGIN(RegToMemWrapperPass, "reg2mem", "", true, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 578fec7..5c60fad 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -344,6 +344,12 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, uint64_t SliceSizeInBits, Instruction *OldInst, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL) { + // If we want allocas to be migrated using this helper then we need to ensure + // that the BaseFragments map code still works. 
A simple solution would be
+  // to choose to always clone alloca dbg_assigns (rather than sometimes
+  // "stealing" them).
+  assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
+
   auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
   // Nothing to do if OldInst has no linked dbg.assign intrinsics.
   if (DVRAssignMarkerRange.empty())
     return;
@@ -429,11 +435,22 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
       Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
     }

-    ::Value *NewValue = Value ? Value : DbgAssign->getValue();
-    DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
-        DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
-                            Dest, DIExpression::get(Expr->getContext(), {}),
-                            DbgAssign->getDebugLoc())));
+    DbgVariableRecord *NewAssign;
+    if (IsSplit) {
+      ::Value *NewValue = Value ? Value : DbgAssign->getValue();
+      NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
+          DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
+                              Dest, DIExpression::get(Expr->getContext(), {}),
+                              DbgAssign->getDebugLoc())));
+    } else {
+      // The store is not split; simply steal the existing dbg_assign.
+      NewAssign = DbgAssign;
+      NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
+      NewAssign->setAddress(Dest);
+      if (Value)
+        NewAssign->replaceVariableLocationOp(0u, Value);
+      assert(Expr == NewAssign->getExpression());
+    }

     // If we've updated the value but the original dbg.assign has an arglist
     // then kill it now - we can't use the requested new value.
@@ -464,9 +481,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
     // noted as slightly offset (in code) from the store. In practice this
     // should have little effect on the debugging experience due to the fact
     // that all the split stores should get the same line number.
-    NewAssign->moveBefore(DbgAssign->getIterator());
-
-    NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
+    if (NewAssign != DbgAssign) {
+      NewAssign->moveBefore(DbgAssign->getIterator());
+      NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
+    }
     LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
   };

@@ -551,12 +569,10 @@ public:
   }

   /// Support comparison with a single offset to allow binary searches.
-  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
-                                              uint64_t RHSOffset) {
+  [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
     return LHS.beginOffset() < RHSOffset;
   }
-  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
-                                              const Slice &RHS) {
+  [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
     return LHSOffset < RHS.beginOffset();
   }
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index aae5d60..25a531c 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -50,9 +50,7 @@ using namespace llvm;

 #define DEBUG_TYPE "scalarizer"

-namespace {
-
-BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
+static BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
   BasicBlock *BB = Itr->getParent();
   if (isa<PHINode>(Itr))
     Itr = BB->getFirstInsertionPt();
@@ -76,6 +74,8 @@ using ScatterMap = std::map<std::pair<Value *, Type *>, ValueVector>;
 // along with a pointer to their scattered forms.
 using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;

+namespace {
+
 struct VectorSplit {
   // The type of the vector.
FixedVectorType *VecTy = nullptr; @@ -196,6 +196,7 @@ struct VectorLayout { // The size of each (non-remainder) fragment in bytes. uint64_t SplitSize = 0; }; +} // namespace static bool isStructOfMatchingFixedVectors(Type *Ty) { if (!isa<StructType>(Ty)) @@ -268,6 +269,7 @@ static Value *concatenate(IRBuilder<> &Builder, ArrayRef<Value *> Fragments, return Res; } +namespace { class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> { public: ScalarizerVisitor(DominatorTree *DT, const TargetTransformInfo *TTI, diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index e4ba70d..5af6c96 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -3611,8 +3610,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, AAResults &AA, TargetTransformInfo &TTI, bool Trivial, bool NonTrivial, ScalarEvolution *SE, - MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) { + MemorySSAUpdater *MSSAU, LPMUpdater &LoopUpdater) { assert(L.isRecursivelyLCSSAForm(DT, LI) && "Loops must be in LCSSA form before unswitching."); @@ -3652,35 +3650,6 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, if (F->hasOptSize()) return false; - // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, - // of the loops L is nested in, and of the loops nested in L are all cold. - auto IsLoopNestCold = [&](const Loop *L) { - // Check L and all of its parent loops. - auto *Parent = L; - while (Parent) { - if (!PSI->isColdBlock(Parent->getHeader(), BFI)) - return false; - Parent = Parent->getParentLoop(); - } - // Next check all loops nested within L. - SmallVector<const Loop *, 4> Worklist; - llvm::append_range(Worklist, L->getSubLoops()); - while (!Worklist.empty()) { - auto *CurLoop = Worklist.pop_back_val(); - if (!PSI->isColdBlock(CurLoop->getHeader(), BFI)) - return false; - llvm::append_range(Worklist, CurLoop->getSubLoops()); - } - return true; - }; - - // Skip cold loops in cold loop nests, as unswitching them brings little - // benefit but increases the code size - if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { - LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); - return false; - } - // Perform legality checks. if (!isSafeForNoNTrivialUnswitching(L, LI)) return false; @@ -3705,11 +3674,6 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, LPMUpdater &U) { Function &F = *L.getHeader()->getParent(); (void)F; - ProfileSummaryInfo *PSI = nullptr; - if (auto OuterProxy = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR) - .getCachedResult<ModuleAnalysisManagerFunctionProxy>(F)) - PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); @@ -3720,7 +3684,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, - &AR.SE, MSSAU ? 
&*MSSAU : nullptr, PSI, AR.BFI, U)) + &AR.SE, MSSAU ? &*MSSAU : nullptr, U)) return PreservedAnalyses::all(); if (AR.MSSA && VerifyMemorySSA) diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index ebcbd2b..fa66a03 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -149,8 +149,6 @@ bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) { return Impl.runImpl(F, TTI); } -namespace llvm { - bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) { if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence(&F)) { LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because " @@ -328,11 +326,11 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( return true; } -FunctionPass *createSpeculativeExecutionPass() { +FunctionPass *llvm::createSpeculativeExecutionPass() { return new SpeculativeExecutionLegacyPass(); } -FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() { +FunctionPass *llvm::createSpeculativeExecutionIfHasBranchDivergencePass() { return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true); } @@ -362,4 +360,3 @@ void SpeculativeExecutionPass::printPipeline( OS << "only-if-divergent-target"; OS << '>'; } -} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 7d01709..e94ad19 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -716,8 +716,6 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) { return Ret; } -namespace llvm { - PreservedAnalyses StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) { const DataLayout *DL = &F.getDataLayout(); @@ -735,5 +733,3 @@ StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) { PA.preserve<TargetIRAnalysis>(); return PA; } - -} // namespace llvm diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 1d83ddc..89d41f3e 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -192,7 +192,7 @@ struct AllocaDerivedValueTracker { SmallPtrSet<Instruction *, 32> AllocaUsers; SmallPtrSet<Instruction *, 32> EscapePoints; }; -} +} // namespace static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) { if (F.callsFunctionThatReturnsTwice()) @@ -967,7 +967,7 @@ struct TailCallElim : public FunctionPass { /*BFI=*/nullptr); } }; -} +} // namespace char TailCallElim::ID = 0; INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination", diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index b18acea..4fe736a 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -1106,7 +1106,6 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } Phi.replaceAllUsesWith(RdxResult); - continue; } } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 9693ae6..4947d03 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" #include 
"llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -634,18 +635,10 @@ private: /// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV /// changes. bool mergeInValue(ValueLatticeElement &IV, Value *V, - ValueLatticeElement MergeWithV, + const ValueLatticeElement &MergeWithV, ValueLatticeElement::MergeOptions Opts = { /*MayIncludeUndef=*/false, /*CheckWiden=*/false}); - bool mergeInValue(Value *V, ValueLatticeElement MergeWithV, - ValueLatticeElement::MergeOptions Opts = { - /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) { - assert(!V->getType()->isStructTy() && - "non-structs should use markConstant"); - return mergeInValue(ValueState[V], V, MergeWithV, Opts); - } - /// getValueState - Return the ValueLatticeElement object that corresponds to /// the value. This function handles the case when the value hasn't been seen /// yet by properly seeding constants etc. @@ -768,6 +761,7 @@ private: void handleCallArguments(CallBase &CB); void handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx); + bool isInstFullyOverDefined(Instruction &Inst); private: friend class InstVisitor<SCCPInstVisitor>; @@ -987,7 +981,7 @@ public: void trackValueOfArgument(Argument *A) { if (A->getType()->isStructTy()) return (void)markOverdefined(A); - mergeInValue(A, getArgAttributeVL(A)); + mergeInValue(ValueState[A], A, getArgAttributeVL(A)); } bool isStructLatticeConstant(Function *F, StructType *STy); @@ -1128,8 +1122,7 @@ bool SCCPInstVisitor::isStructLatticeConstant(Function *F, StructType *STy) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i)); assert(It != TrackedMultipleRetVals.end()); - ValueLatticeElement LV = It->second; - if (!SCCPSolver::isConstant(LV)) + if (!SCCPSolver::isConstant(It->second)) return false; } return true; @@ -1160,7 +1153,7 @@ Constant *SCCPInstVisitor::getConstantOrNull(Value *V) const { std::vector<Constant *> ConstVals; auto *ST = cast<StructType>(V->getType()); for (unsigned I = 0, E = ST->getNumElements(); I != E; ++I) { - ValueLatticeElement LV = LVs[I]; + const ValueLatticeElement &LV = LVs[I]; ConstVals.push_back(SCCPSolver::isConstant(LV) ? getConstant(LV, ST->getElementType(I)) : UndefValue::get(ST->getElementType(I))); @@ -1225,7 +1218,7 @@ void SCCPInstVisitor::visitInstruction(Instruction &I) { } bool SCCPInstVisitor::mergeInValue(ValueLatticeElement &IV, Value *V, - ValueLatticeElement MergeWithV, + const ValueLatticeElement &MergeWithV, ValueLatticeElement::MergeOptions Opts) { if (IV.mergeIn(MergeWithV, Opts)) { pushUsersToWorkList(V); @@ -1264,7 +1257,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI, return; } - ValueLatticeElement BCValue = getValueState(BI->getCondition()); + const ValueLatticeElement &BCValue = getValueState(BI->getCondition()); ConstantInt *CI = getConstantInt(BCValue, BI->getCondition()->getType()); if (!CI) { // Overdefined condition variables, and branches on unfoldable constant @@ -1326,7 +1319,7 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI, // the target as executable. if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) { // Casts are folded by visitCastInst. 
- ValueLatticeElement IBRValue = getValueState(IBR->getAddress()); + const ValueLatticeElement &IBRValue = getValueState(IBR->getAddress()); BlockAddress *Addr = dyn_cast_or_null<BlockAddress>( getConstant(IBRValue, IBR->getAddress()->getType())); if (!Addr) { // Overdefined or unknown condition? @@ -1383,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // 7. If a conditional branch has a value that is overdefined, make all // successors executable. void SCCPInstVisitor::visitPHINode(PHINode &PN) { - // If this PN returns a struct, just mark the result overdefined. - // TODO: We could do a lot better than this if code actually uses this. - if (PN.getType()->isStructTy()) - return (void)markOverdefined(&PN); - - if (getValueState(&PN).isOverdefined()) - return; // Quick exit - // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. if (PN.getNumIncomingValues() > 64) return (void)markOverdefined(&PN); - unsigned NumActiveIncoming = 0; + if (isInstFullyOverDefined(PN)) + return; + SmallVector<unsigned> FeasibleIncomingIndices; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) + continue; + FeasibleIncomingIndices.push_back(i); + } // Look at all of the executable operands of the PHI node. If any of them // are overdefined, the PHI becomes overdefined as well. If they are all // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is a constant // range. If there are no executable operands, the PHI remains unknown. - ValueLatticeElement PhiState = getValueState(&PN); - for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { - if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) - continue; - - ValueLatticeElement IV = getValueState(PN.getIncomingValue(i)); - PhiState.mergeIn(IV); - NumActiveIncoming++; - if (PhiState.isOverdefined()) - break; + if (StructType *STy = dyn_cast<StructType>(PN.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + ValueLatticeElement PhiState = getStructValueState(&PN, i); + if (PhiState.isOverdefined()) + continue; + for (unsigned j : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = + getStructValueState(PN.getIncomingValue(j), i); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i); + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); + } + } else { + ValueLatticeElement PhiState = getValueState(&PN); + for (unsigned i : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + // We allow up to 1 range extension per active incoming value and one + // additional extension. Note that we manually adjust the number of range + // extensions to match the number of active incoming values. This helps to + // limit multiple extensions caused by the same incoming value, if other + // incoming values are equal. 
+ ValueLatticeElement &PhiStateRef = ValueState[&PN]; + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); } - - // We allow up to 1 range extension per active incoming value and one - // additional extension. Note that we manually adjust the number of range - // extensions to match the number of active incoming values. This helps to - // limit multiple extensions caused by the same incoming value, if other - // incoming values are equal. - mergeInValue(&PN, PhiState, - ValueLatticeElement::MergeOptions().setMaxWidenSteps( - NumActiveIncoming + 1)); - ValueLatticeElement &PhiStateRef = getValueState(&PN); - PhiStateRef.setNumRangeExtensions( - std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions())); } void SCCPInstVisitor::visitReturnInst(ReturnInst &I) { @@ -1481,7 +1491,7 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { } } - ValueLatticeElement OpSt = getValueState(I.getOperand(0)); + const ValueLatticeElement &OpSt = getValueState(I.getOperand(0)); if (OpSt.isUnknownOrUndef()) return; @@ -1496,9 +1506,9 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { if (I.getDestTy()->isIntOrIntVectorTy() && I.getSrcTy()->isIntOrIntVectorTy() && I.getOpcode() != Instruction::BitCast) { - auto &LV = getValueState(&I); ConstantRange OpRange = OpSt.asConstantRange(I.getSrcTy(), /*UndefAllowed=*/false); + auto &LV = getValueState(&I); Type *DestTy = I.getDestTy(); ConstantRange Res = ConstantRange::getEmpty(DestTy->getScalarSizeInBits()); @@ -1516,19 +1526,24 @@ void SCCPInstVisitor::handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx) { Value *LHS = WO->getLHS(), *RHS = WO->getRHS(); - ValueLatticeElement L = getValueState(LHS); - ValueLatticeElement R = getValueState(RHS); + Type *Ty = LHS->getType(); + addAdditionalUser(LHS, &EVI); addAdditionalUser(RHS, &EVI); - if (L.isUnknownOrUndef() || R.isUnknownOrUndef()) - return; // Wait to resolve. - Type *Ty = LHS->getType(); + const ValueLatticeElement &L = getValueState(LHS); + if (L.isUnknownOrUndef()) + return; // Wait to resolve. ConstantRange LR = L.asConstantRange(Ty, /*UndefAllowed=*/false); + + const ValueLatticeElement &R = getValueState(RHS); + if (R.isUnknownOrUndef()) + return; // Wait to resolve. + ConstantRange RR = R.asConstantRange(Ty, /*UndefAllowed=*/false); if (Idx == 0) { ConstantRange Res = LR.binaryOp(WO->getBinaryOp(), RR); - mergeInValue(&EVI, ValueLatticeElement::getRange(Res)); + mergeInValue(ValueState[&EVI], &EVI, ValueLatticeElement::getRange(Res)); } else { assert(Idx == 1 && "Index can only be 0 or 1"); ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion( @@ -1560,7 +1575,7 @@ void SCCPInstVisitor::visitExtractValueInst(ExtractValueInst &EVI) { if (auto *WO = dyn_cast<WithOverflowInst>(AggVal)) return handleExtractOfWithOverflow(EVI, WO, i); ValueLatticeElement EltVal = getStructValueState(AggVal, i); - mergeInValue(getValueState(&EVI), &EVI, EltVal); + mergeInValue(ValueState[&EVI], &EVI, EltVal); } else { // Otherwise, must be extracting from an array. 
return (void)markOverdefined(&EVI); @@ -1616,14 +1631,18 @@ void SCCPInstVisitor::visitSelectInst(SelectInst &I) { if (ValueState[&I].isOverdefined()) return (void)markOverdefined(&I); - ValueLatticeElement CondValue = getValueState(I.getCondition()); + const ValueLatticeElement &CondValue = getValueState(I.getCondition()); if (CondValue.isUnknownOrUndef()) return; if (ConstantInt *CondCB = getConstantInt(CondValue, I.getCondition()->getType())) { Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue(); - mergeInValue(&I, getValueState(OpVal)); + const ValueLatticeElement &OpValState = getValueState(OpVal); + // Safety: ValueState[&I] doesn't invalidate OpValState since it is already + // in the map. + assert(ValueState.contains(&I) && "&I is not in ValueState map."); + mergeInValue(ValueState[&I], &I, OpValState); return; } @@ -1721,7 +1740,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { // being a special floating value. ValueLatticeElement NewV; NewV.markConstant(C, /*MayIncludeUndef=*/true); - return (void)mergeInValue(&I, NewV); + return (void)mergeInValue(ValueState[&I], &I, NewV); } } @@ -1741,7 +1760,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { R = A.overflowingBinaryOp(BO->getOpcode(), B, OBO->getNoWrapKind()); else R = A.binaryOp(BO->getOpcode(), B); - mergeInValue(&I, ValueLatticeElement::getRange(R)); + mergeInValue(ValueState[&I], &I, ValueLatticeElement::getRange(R)); // TODO: Currently we do not exploit special values that produce something // better than overdefined with an overdefined operand for vector or floating @@ -1767,7 +1786,7 @@ void SCCPInstVisitor::visitCmpInst(CmpInst &I) { if (C) { ValueLatticeElement CV; CV.markConstant(C); - mergeInValue(&I, CV); + mergeInValue(ValueState[&I], &I, CV); return; } @@ -1802,7 +1821,7 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { Operands.reserve(I.getNumOperands()); for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { - ValueLatticeElement State = getValueState(I.getOperand(i)); + const ValueLatticeElement &State = getValueState(I.getOperand(i)); if (State.isUnknownOrUndef()) return; // Operands are not resolved yet. @@ -1881,14 +1900,13 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) { if (ValueState[&I].isOverdefined()) return (void)markOverdefined(&I); - ValueLatticeElement PtrVal = getValueState(I.getOperand(0)); + const ValueLatticeElement &PtrVal = getValueState(I.getOperand(0)); if (PtrVal.isUnknownOrUndef()) return; // The pointer is not resolved yet! - ValueLatticeElement &IV = ValueState[&I]; - if (SCCPSolver::isConstant(PtrVal)) { Constant *Ptr = getConstant(PtrVal, I.getOperand(0)->getType()); + ValueLatticeElement &IV = ValueState[&I]; // load null is undefined. if (isa<ConstantPointerNull>(Ptr)) { @@ -1916,7 +1934,7 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) { } // Fall back to metadata. - mergeInValue(&I, getValueFromMetadata(&I)); + mergeInValue(ValueState[&I], &I, getValueFromMetadata(&I)); } void SCCPInstVisitor::visitCallBase(CallBase &CB) { @@ -1944,7 +1962,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) { return markOverdefined(&CB); // Can't handle struct args. if (A.get()->getType()->isMetadataTy()) continue; // Carried in CB, not allowed in Operands. - ValueLatticeElement State = getValueState(A); + const ValueLatticeElement &State = getValueState(A); if (State.isUnknownOrUndef()) return; // Operands are not resolved yet. 
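The SCCPSolver.cpp hunks above and below repeatedly rewrite mergeInValue(&I, X) into mergeInValue(ValueState[&I], &I, X) and fetch operand lattice values by const reference. A minimal sketch of the reference-invalidation hazard this ordering guards against, assuming only a DenseMap-backed state table like the solver's (the function and variable names here are hypothetical, not from the patch):

  #include <cassert>
  #include "llvm/ADT/DenseMap.h"

  struct Lattice { int State = 0; };

  // Hazard: DenseMap::operator[] may insert, and an insertion may grow and
  // rehash the table, invalidating references obtained earlier.
  void hazard(llvm::DenseMap<int *, Lattice> &Map, int *Op, int *I) {
    const Lattice &OpState = Map[Op]; // reference into the map
    Map[I].State |= OpState.State;    // if I is a new key, OpState dangles
  }

  // The pattern used in the patch: taking Map[I] while OpState is live is
  // safe only when I is already a key, so no insertion (or rehash) happens.
  void safe(llvm::DenseMap<int *, Lattice> &Map, int *Op, int *I) {
    const Lattice &OpState = Map[Op];
    assert(Map.contains(I) && "I must already be in the map");
    Map[I].State |= OpState.State; // pure lookup; OpState stays valid
  }

This mirrors the "Safety" comment added in visitSelectInst below, where the assert documents that &I is already present before ValueState[&I] is evaluated.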
@@ -1964,7 +1982,7 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) { } // Fall back to metadata. - mergeInValue(&CB, getValueFromMetadata(&CB)); + mergeInValue(ValueState[&CB], &CB, getValueFromMetadata(&CB)); } void SCCPInstVisitor::handleCallArguments(CallBase &CB) { @@ -1992,10 +2010,11 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) { mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg, getMaxWidenStepsOpts()); } - } else - mergeInValue(&*AI, - getValueState(*CAI).intersect(getArgAttributeVL(&*AI)), - getMaxWidenStepsOpts()); + } else { + ValueLatticeElement CallArg = + getValueState(*CAI).intersect(getArgAttributeVL(&*AI)); + mergeInValue(ValueState[&*AI], &*AI, CallArg, getMaxWidenStepsOpts()); + } } } } @@ -2076,7 +2095,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { if (II->getIntrinsicID() == Intrinsic::vscale) { unsigned BitWidth = CB.getType()->getScalarSizeInBits(); const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth); - return (void)mergeInValue(II, ValueLatticeElement::getRange(Result)); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); } if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { @@ -2094,7 +2114,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { ConstantRange Result = ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges); - return (void)mergeInValue(II, ValueLatticeElement::getRange(Result)); + return (void)mergeInValue(ValueState[II], II, + ValueLatticeElement::getRange(Result)); } } @@ -2121,10 +2142,25 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return handleCallOverdefined(CB); // Not tracking this callee. // If so, propagate the return value of the callee into this call result. - mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts()); + mergeInValue(ValueState[&CB], &CB, TFRVI->second, getMaxWidenStepsOpts()); } } +bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) { + // For struct types, we handle each member separately. + // A struct value is not considered overdefined as long as + // at least one of its members is not overdefined. + if (StructType *STy = dyn_cast<StructType>(Inst.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) { + if (!getStructValueState(&Inst, i).isOverdefined()) + return false; + } + return true; + } + + return getValueState(&Inst).isOverdefined(); +} + void SCCPInstVisitor::solve() { // Process the work lists until they are empty! while (!BBWorkList.empty() || !InstWorkList.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b96d29e..280eb20 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7957,9 +7957,9 @@ bool VPRecipeBuilder::getScaledReductions( auto CollectExtInfo = [this, &Exts, &ExtOpTypes, &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool { for (const auto &[I, OpI] : enumerate(Ops)) { - auto *CI = dyn_cast<ConstantInt>(OpI); - if (I > 0 && CI && - canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) { + const APInt *C; + if (I > 0 && match(OpI, m_APInt(C)) && + canConstantBeExtended(C, ExtOpTypes[0], ExtKinds[0])) { ExtOpTypes[I] = ExtOpTypes[0]; ExtKinds[I] = ExtKinds[0]; continue; @@ -8240,14 +8240,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // the vector loop or when not folding the tail.
In the latter case, we know // that the canonical induction increment will not overflow as the vector trip // count is >= increment and a multiple of the increment. + VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; if (!HasNUW) { - auto *IVInc = Plan->getVectorLoopRegion() - ->getExitingBasicBlock() - ->getTerminator() - ->getOperand(0); - assert(match(IVInc, m_VPInstruction<Instruction::Add>( - m_Specific(Plan->getCanonicalIV()), m_VPValue())) && + auto *IVInc = + LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0); + assert(match(IVInc, + m_VPInstruction<Instruction::Add>( + m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) && "Did not find the canonical IV increment"); cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags(); } @@ -8293,7 +8293,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. - VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT( HeaderVPBB); @@ -8377,8 +8376,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( for (VPValue *Old : Old2New.keys()) Old->getDefiningRecipe()->eraseFromParent(); - assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && - !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && + assert(isa<VPRegionBlock>(LoopRegion) && + !LoopRegion->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); @@ -9326,8 +9325,9 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { if (ResumePhiIter == MainScalarPH->phis().end()) { VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); ResumePhi = ScalarPHBuilder.createScalarPhi( - {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {}, - "vec.epilog.resume.val"); + {VectorTC, + MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue()}, + {}, "vec.epilog.resume.val"); } else { ResumePhi = cast<VPPhi>(&*ResumePhiIter); if (MainScalarPH->begin() == MainScalarPH->end()) @@ -9354,7 +9354,7 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop( VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); - VPCanonicalIVPHIRecipe *IV = Plan.getCanonicalIV(); + VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV(); // When vectorizing the epilogue loop, the canonical induction needs to be // adjusted by the value after the main vector loop. Find the resume value // created during execution of the main VPlan. It must be the first phi in the diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 88af2cf..9cd52da 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2242,8 +2242,49 @@ public: /// may not be necessary.
bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const; bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, - Align Alignment, const int64_t Diff, Value *Ptr0, - Value *PtrN, StridedPtrInfo &SPtrInfo) const; + Align Alignment, const int64_t Diff, + const size_t Sz) const; + + /// Return true if an array of scalar loads can be replaced with a strided + /// load (with constant stride). + /// + /// TODO: + /// It is possible that the load gets "widened". Suppose that originally each + /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is + /// constant): + /// %b + 0 * %s + 0 + /// %b + 0 * %s + 1 + /// %b + 0 * %s + 2 + /// ... + /// %b + 0 * %s + (w - 1) + /// + /// %b + 1 * %s + 0 + /// %b + 1 * %s + 1 + /// %b + 1 * %s + 2 + /// ... + /// %b + 1 * %s + (w - 1) + /// ... + /// + /// %b + (n - 1) * %s + 0 + /// %b + (n - 1) * %s + 1 + /// %b + (n - 1) * %s + 2 + /// ... + /// %b + (n - 1) * %s + (w - 1) + /// + /// In this case we will generate a strided load of type `<n x (k * w)>`. + /// + /// \param PointerOps list of pointer arguments of loads. + /// \param ElemTy original scalar type of loads. + /// \param Alignment alignment of the first load. + /// \param SortedIndices is the order of PointerOps as returned by + /// `sortPtrAccesses` + /// \param Diff Pointer difference between the lowest and the highest pointer + /// in `PointerOps` as returned by `getPointersDiff`. + /// \param Ptr0 first pointer in `PointerOps`. + /// \param PtrN last pointer in `PointerOps`. + /// \param SPtrInfo If the function returns `true`, it also sets all the fields + /// of `SPtrInfo` necessary to generate the strided load later. + bool analyzeConstantStrideCandidate( + const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment, + const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, + Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const; /// Return true if an array of scalar loads can be replaced with a strided /// load (with run-time stride). @@ -6849,9 +6890,8 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, /// current graph (for masked gathers extra extractelement instructions /// might be required). bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, - Align Alignment, const int64_t Diff, Value *Ptr0, - Value *PtrN, StridedPtrInfo &SPtrInfo) const { - const size_t Sz = PointerOps.size(); + Align Alignment, const int64_t Diff, + const size_t Sz) const { if (Diff % (Sz - 1) != 0) return false; @@ -6875,27 +6915,40 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, return false; if (!TTI->isLegalStridedLoadStore(VecTy, Alignment)) return false; + return true; + } + return false; +} - // Iterate through all pointers and check if all distances are - // unique multiple of Dist. - SmallSet<int64_t, 4> Dists; - for (Value *Ptr : PointerOps) { - int64_t Dist = 0; - if (Ptr == PtrN) - Dist = Diff; - else if (Ptr != Ptr0) - Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); - // If the strides are not the same or repeated, we can't - // vectorize.
- if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) - break; - } - if (Dists.size() == Sz) { - Type *StrideTy = DL->getIndexType(Ptr0->getType()); - SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); - SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); - return true; - } +bool BoUpSLP::analyzeConstantStrideCandidate( + const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment, + const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, + Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const { + const size_t Sz = PointerOps.size(); + if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz)) + return false; + + int64_t Stride = Diff / static_cast<int64_t>(Sz - 1); + + // Iterate through all pointers and check if all distances are + // unique multiples of Stride. + SmallSet<int64_t, 4> Dists; + for (Value *Ptr : PointerOps) { + int64_t Dist = 0; + if (Ptr == PtrN) + Dist = Diff; + else if (Ptr != Ptr0) + Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); + // If the strides are not the same or repeated, we can't + // vectorize. + if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) + break; + } + if (Dists.size() == Sz) { + Type *StrideTy = DL->getIndexType(Ptr0->getType()); + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); + SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); + return true; } return false; } @@ -6995,8 +7048,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( Align Alignment = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]) ->getAlign(); - if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN, - SPtrInfo)) + if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order, + *Diff, Ptr0, PtrN, SPtrInfo)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || @@ -17632,7 +17685,9 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { } if (IsPHI || (!E->isGather() && E->State != TreeEntry::SplitVectorize && - E->doesNotNeedToSchedule()) || + (E->doesNotNeedToSchedule() || + (E->hasCopyableElements() && !E->isCopyableElement(LastInst) && + isUsedOutsideBlock(LastInst)))) || (GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && E->getOpcode() == Instruction::Load)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 0101942..d167009 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1753,14 +1753,14 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif -bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType, +bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind) { - APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits()); - unsigned WideSize = CI->getType()->getScalarSizeInBits(); + APInt TruncatedVal = C->trunc(NarrowType->getScalarSizeInBits()); + unsigned WideSize = C->getBitWidth(); APInt ExtendedVal = ExtKind == TTI::PR_SignExtend ?
TruncatedVal.sext(WideSize) : TruncatedVal.zext(WideSize); - return ExtendedVal == CI->getValue(); + return ExtendedVal == *C; } TargetTransformInfo::OperandValueInfo diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 23f5623..84d2ea6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -407,6 +407,10 @@ public: VPBasicBlock *getParent() { return Parent; } const VPBasicBlock *getParent() const { return Parent; } + /// \return the VPRegionBlock which the recipe belongs to. + VPRegionBlock *getRegion(); + const VPRegionBlock *getRegion() const; + /// The method which generates the output IR instructions that correspond to /// this VPRecipe, thereby "executing" the VPlan. virtual void execute(VPTransformState &State) = 0; @@ -1012,6 +1016,8 @@ public: // part if scalar. In the latter case, the recipe will be removed during // unrolling. ExtractLastElement, + // Extracts the last lane for each part from its operand. + ExtractLastLanePerPart, // Extracts the second-to-last lane from its operand or the second-to-last // part if it is scalar. In the latter case, the recipe will be removed // during unrolling. @@ -4058,8 +4064,29 @@ public: /// Remove the current region from its VPlan, connecting its predecessor to /// its entry, and its exiting block to its successor. void dissolveToCFGLoop(); + + /// Returns the canonical induction recipe of the region. + VPCanonicalIVPHIRecipe *getCanonicalIV() { + VPBasicBlock *EntryVPBB = getEntryBasicBlock(); + if (EntryVPBB->empty()) { + // VPlan native path. TODO: Unify both code paths. + EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor()); + } + return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); + } + const VPCanonicalIVPHIRecipe *getCanonicalIV() const { + return const_cast<VPRegionBlock *>(this)->getCanonicalIV(); + } }; +inline VPRegionBlock *VPRecipeBase::getRegion() { + return getParent()->getParent(); +} + +inline const VPRegionBlock *VPRecipeBase::getRegion() const { + return getParent()->getParent(); +} + /// VPlan models a candidate for vectorization, encoding various decisions taken /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a @@ -4252,12 +4279,14 @@ public: BackedgeTakenCount = new VPValue(); return BackedgeTakenCount; } + VPValue *getBackedgeTakenCount() const { return BackedgeTakenCount; } /// The vector trip count. VPValue &getVectorTripCount() { return VectorTripCount; } /// Returns the VF of the vector loop region. VPValue &getVF() { return VF; }; + const VPValue &getVF() const { return VF; }; /// Returns VF * UF of the vector loop region. VPValue &getVFxUF() { return VFxUF; } @@ -4369,16 +4398,6 @@ public: LLVM_DUMP_METHOD void dump() const; #endif - /// Returns the canonical induction recipe of the vector loop. - VPCanonicalIVPHIRecipe *getCanonicalIV() { - VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock(); - if (EntryVPBB->empty()) { - // VPlan native path.
- EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor()); - } - return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); - } - VPValue *getSCEVExpansion(const SCEV *S) const { return SCEVToExpansion.lookup(S); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 07bfe7a..7e074c1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -116,6 +116,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::FirstActiveLane: return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: + case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractPenultimateElement: { Type *BaseTy = inferScalarType(R->getOperand(0)); if (auto *VecTy = dyn_cast<VectorType>(BaseTy)) @@ -376,7 +377,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, #ifndef NDEBUG auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); + VPRegionBlock *Region = R->getRegion(); if (Region && Region->isReplicator()) { assert(Region->getNumSuccessors() == 1 && Region->getNumPredecessors() == 1 && "Expected SESE region!"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index c0147ce..332791a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -658,9 +658,11 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, } VPIRMetadata VPBranchWeights; - auto *Term = VPBuilder(CheckBlockVPBB) - .createNaryOp(VPInstruction::BranchOnCond, {CondVPV}, - Plan.getCanonicalIV()->getDebugLoc()); + auto *Term = + VPBuilder(CheckBlockVPBB) + .createNaryOp( + VPInstruction::BranchOnCond, {CondVPV}, + Plan.getVectorLoopRegion()->getCanonicalIV()->getDebugLoc()); if (AddBranchWeights) { MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = @@ -921,8 +923,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) { if (DerivedIV->getNumUsers() == 1 && DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) { - auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), - &Plan.getVectorTripCount()); + auto *NewSel = Builder.createSelect( + AnyNaN, LoopRegion->getCanonicalIV(), &Plan.getVectorTripCount()); DerivedIV->moveAfter(&*Builder.getInsertPoint()); DerivedIV->setOperand(1, NewSel); continue; @@ -935,7 +937,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { "FMaxNum/FMinNum reduction.\n"); return false; } - auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV); + auto *NewSel = + Builder.createSelect(AnyNaN, LoopRegion->getCanonicalIV(), VecV); ResumeR->setOperand(0, NewSel); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 1580a3b..2aaabd9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -474,7 +474,7 @@ public: /// Check if a constant \p CI can be safely treated as having been extended /// from a narrower type with the given extension kind. 
-bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType, +bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind); } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b42b049..d8203e2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -173,10 +173,10 @@ inline int_pred_ty<is_zero_int> m_ZeroInt() { /// For vectors, this includes constants with undefined elements. inline int_pred_ty<is_one> m_One() { return int_pred_ty<is_one>(); } -struct bind_const_int { - uint64_t &Res; +struct bind_apint { + const APInt *&Res; - bind_const_int(uint64_t &Res) : Res(Res) {} + bind_apint(const APInt *&Res) : Res(Res) {} bool match(VPValue *VPV) const { if (!VPV->isLiveIn()) @@ -188,7 +188,23 @@ struct bind_const_int { const auto *CI = dyn_cast<ConstantInt>(V); if (!CI) return false; - if (auto C = CI->getValue().tryZExtValue()) { + Res = &CI->getValue(); + return true; + } +}; + +inline bind_apint m_APInt(const APInt *&C) { return C; } + +struct bind_const_int { + uint64_t &Res; + + bind_const_int(uint64_t &Res) : Res(Res) {} + + bool match(VPValue *VPV) const { + const APInt *APConst; + if (!bind_apint(APConst).match(VPV)) + return false; + if (auto C = APConst->tryZExtValue()) { Res = *C; return true; } @@ -372,6 +388,12 @@ m_ExtractLastElement(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0); } +template <typename Op0_t> +inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t> +m_ExtractLastLanePerPart(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0); +} + template <typename Op0_t, typename Op1_t, typename Op2_t> inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t> m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { @@ -394,6 +416,12 @@ m_AnyOf(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::AnyOf>(Op0); } +template <typename Op0_t> +inline VPInstruction_match<VPInstruction::FirstActiveLane, Op0_t> +m_FirstActiveLane(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::FirstActiveLane>(Op0); +} + template <unsigned Opcode, typename Op0_t> inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) { return AllRecipe_match<Opcode, Op0_t>(Op0); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 0c27d53..fb17d5d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -168,7 +168,8 @@ void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) { // non-phi instructions. 
auto &Plan = *HeaderVPBB->getPlan(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + auto *IV = + new VPWidenCanonicalIVRecipe(HeaderVPBB->getParent()->getCanonicalIV()); Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); Builder.insert(IV); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2368d18..d1e67e6b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -341,12 +341,12 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, ExtAType = GetExtendKind(ExtAR); ExtBType = GetExtendKind(ExtBR); - if (!ExtBR && Widen->getOperand(1)->isLiveIn()) { - auto *CI = cast<ConstantInt>(Widen->getOperand(1)->getLiveInIRValue()); - if (canConstantBeExtended(CI, InputTypeA, ExtAType)) { - InputTypeB = InputTypeA; - ExtBType = ExtAType; - } + using namespace VPlanPatternMatch; + const APInt *C; + if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) && + canConstantBeExtended(C, InputTypeA, ExtAType)) { + InputTypeB = InputTypeA; + ExtBType = ExtAType; } }; @@ -511,6 +511,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExplicitVectorLength: case VPInstruction::ExtractLastElement: + case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: case VPInstruction::Not: @@ -878,9 +879,11 @@ Value *VPInstruction::generate(VPTransformState &State) { return ReducedPartRdx; } + case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: { - unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; + unsigned Offset = + getOpcode() == VPInstruction::ExtractPenultimateElement ? 
2 : 1; Value *Res; if (State.VF.isVector()) { assert(Offset <= State.VF.getKnownMinValue() && @@ -1166,6 +1169,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractLastElement || + getOpcode() == VPInstruction::ExtractLastLanePerPart || getOpcode() == VPInstruction::ExtractPenultimateElement || getOpcode() == Instruction::ExtractElement || getOpcode() == VPInstruction::ExtractLane || @@ -1229,6 +1233,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExtractLane: case VPInstruction::ExtractLastElement: + case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractPenultimateElement: case VPInstruction::ActiveLaneMask: case VPInstruction::FirstActiveLane: @@ -1376,6 +1381,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ExtractLastElement: O << "extract-last-element"; break; + case VPInstruction::ExtractLastLanePerPart: + O << "extract-last-lane-per-part"; + break; case VPInstruction::ExtractPenultimateElement: O << "extract-penultimate-element"; break; @@ -2344,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { return false; auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin()); + auto *CanIV = getRegion()->getCanonicalIV(); return StartC && StartC->isZero() && StepC && StepC->isOne() && getScalarType() == CanIV->getScalarType(); } @@ -3068,7 +3076,7 @@ static void scalarizeInstruction(const Instruction *Instr, State.AC->registerAssumption(II); assert( - (RepRecipe->getParent()->getParent() || + (RepRecipe->getRegion() || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || all_of(RepRecipe->operands(), [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && @@ -3260,7 +3268,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, to_vector(operands()), VF); // If the recipe is not predicated (i.e. not in a replicate region), return // the scalar cost. Otherwise handle predicated cost. - if (!getParent()->getParent()->isReplicator()) + if (!getRegion()->isReplicator()) return ScalarCost; // Account for the phi nodes that we will create. @@ -3276,7 +3284,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::Store: { // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - const VPRegionBlock *ParentRegion = getParent()->getParent(); + const VPRegionBlock *ParentRegion = getRegion(); if (ParentRegion && ParentRegion->isReplicator()) break; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 40b7e8d..f5f616f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -501,7 +501,8 @@ static void removeRedundantInductionCasts(VPlan &Plan) { /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV /// recipe, if it exists. 
static void removeRedundantCanonicalIVs(VPlan &Plan) { - VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV(); VPWidenCanonicalIVRecipe *WidenNewIV = nullptr; for (VPUser *U : CanonicalIV->users()) { WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U); @@ -512,7 +513,7 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) { if (!WidenNewIV) return; - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); for (VPRecipeBase &Phi : HeaderVPBB->phis()) { auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); @@ -582,8 +583,9 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder) { - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); + VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV(); VPSingleDefRecipe *BaseIV = Builder.createDerivedIV( Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx"); @@ -786,9 +788,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, ScalarEvolution &SE) { VPValue *Incoming, *Mask; if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>( - m_VPInstruction<VPInstruction::FirstActiveLane>( - m_VPValue(Mask)), - m_VPValue(Incoming)))) + m_FirstActiveLane(m_VPValue(Mask)), m_VPValue(Incoming)))) return nullptr; auto *WideIV = getOptimizableIVOf(Incoming, SE); @@ -800,8 +800,9 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, return nullptr; // Calculate the final index. - VPValue *EndValue = Plan.getCanonicalIV(); - auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + auto *CanonicalIV = LoopRegion->getCanonicalIV(); + Type *CanonicalIVType = CanonicalIV->getScalarType(); VPBuilder B(cast<VPBasicBlock>(PredVPBB)); DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc(); @@ -810,7 +811,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane); FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType, FirstActiveLaneType, DL); - EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL); + VPValue *EndValue = + B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL); // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it // changed it means the exit is using the incremented value, so we need to @@ -1205,7 +1207,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // Look through ExtractLastElement (BuildVector ....). 
- if (match(&R, m_ExtractLastElement(m_BuildVector()))) { + if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()), + m_ExtractLastLanePerPart(m_BuildVector())))) { auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 1)); @@ -1271,13 +1274,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - if (match(Def, m_ExtractLastElement(m_Broadcast(m_VPValue(A))))) { + if (match(Def, + m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))), + m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) { Def->replaceAllUsesWith(A); return; } - if (match(Def, - m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) && + if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)), + m_ExtractLastLanePerPart(m_VPValue(A)))) && ((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) || (isa<VPReplicateRecipe>(A) && cast<VPReplicateRecipe>(A)->isSingleScalar())) && @@ -1285,6 +1290,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) { return Def->replaceAllUsesWith(A); } + + if (Plan->getUF() == 1 && + match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) { + return Def->replaceAllUsesWith( + Builder.createNaryOp(VPInstruction::ExtractLastElement, {A})); + } } void VPlanTransforms::simplifyRecipes(VPlan &Plan) { @@ -1322,8 +1333,11 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(), true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/); Clone->insertBefore(RepOrWidenR); - auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement, - {Clone->getOperand(0)}); + unsigned ExtractOpc = + vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)) + ? VPInstruction::ExtractLastElement + : VPInstruction::ExtractLastLanePerPart; + auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)}); Ext->insertBefore(Clone); Clone->setOperand(0, Ext); RepR->eraseFromParent(); @@ -1337,7 +1351,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) { return U->usesScalars(RepOrWidenR) || match(cast<VPRecipeBase>(U), - m_ExtractLastElement(m_VPValue())); + m_CombineOr(m_ExtractLastElement(m_VPValue()), + m_ExtractLastLanePerPart(m_VPValue()))); })) continue; @@ -1530,7 +1545,7 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE); }); - auto *CanIV = Plan.getCanonicalIV(); + auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV(); if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ, m_Specific(CanIV->getBackedgeValue()), m_Specific(&Plan.getVectorTripCount())))) @@ -1843,8 +1858,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, return nullptr; VPRegionBlock *EnclosingLoopRegion = HoistCandidate->getParent()->getEnclosingLoopRegion(); - assert((!HoistCandidate->getParent()->getParent() || - HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) && + assert((!HoistCandidate->getRegion() || + HoistCandidate->getRegion() == EnclosingLoopRegion) && "CFG in VPlan should still be flat, without replicate regions"); // Hoist candidate was already visited, no need to hoist. 
if (!Visited.insert(HoistCandidate).second) @@ -2107,9 +2122,18 @@ static void licm(VPlan &Plan) { VPBasicBlock *Preheader = Plan.getVectorPreheader(); // Return true if we do not know how to (mechanically) hoist a given recipe - // out of a loop region. Does not address legality concerns such as aliasing - // or speculation safety. + // out of a loop region. auto CannotHoistRecipe = [](VPRecipeBase &R) { + // Assume intrinsics don't alias anything or throw; as long as they're + // guaranteed to execute, they're safe to hoist. + if (match(&R, m_Intrinsic<Intrinsic::assume>())) + return false; + + // TODO: Relax checks in the future, e.g. we could also hoist reads, if + // their memory location is not modified in the vector loop. + if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi()) + return true; + // Allocas cannot be hoisted. auto *RepR = dyn_cast<VPReplicateRecipe>(&R); return RepR && RepR->getOpcode() == Instruction::Alloca; @@ -2117,17 +2141,18 @@ static void licm(VPlan &Plan) { // Hoist any loop invariant recipes from the vector loop region to the // preheader. Perform a shallow traversal of the vector loop region, to - // exclude recipes in replicate regions. + // exclude recipes in replicate regions. Since the top-level blocks in the + // vector loop region are guaranteed to execute if the vector preheader is, + // we don't need to check speculation safety. VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + assert(Preheader->getSingleSuccessor() == LoopRegion && + "Expected vector preheader's successor to be the vector loop region"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_shallow(LoopRegion->getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { if (CannotHoistRecipe(R)) continue; - // TODO: Relax checks in the future, e.g. we could also hoist reads, if - // their memory location is not modified in the vector loop. - if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() || - any_of(R.operands(), [](VPValue *Op) { + if (any_of(R.operands(), [](VPValue *Op) { return !Op->isDefinedOutsideLoopRegions(); })) continue; @@ -2319,7 +2344,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - auto *CanonicalIVPHI = Plan.getCanonicalIV(); + auto *CanonicalIVPHI = TopRegion->getCanonicalIV(); VPValue *StartV = CanonicalIVPHI->getStartValue(); auto *CanonicalIVIncrement = @@ -2358,7 +2383,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // Create the active lane mask instruction in the VPlan preheader. VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1)); auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC, ALMMultiplier}, DL, "active.lane.mask.entry"); @@ -2394,13 +2419,15 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( /// TODO: Introduce explicit recipe for header-mask instead of searching /// for the header-mask pattern manually.
static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) { + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); SmallVector<VPValue *> WideCanonicalIVs; - auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), - IsaPred<VPWidenCanonicalIVRecipe>); - assert(count_if(Plan.getCanonicalIV()->users(), + auto *FoundWidenCanonicalIVUser = find_if( + LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>); + assert(count_if(LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>) <= 1 && "Must have at most one VPWideCanonicalIVRecipe"); - if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { + if (FoundWidenCanonicalIVUser != + LoopRegion->getCanonicalIV()->users().end()) { auto *WideCanonicalIV = cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); WideCanonicalIVs.push_back(WideCanonicalIV); @@ -2408,7 +2435,7 @@ static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) { // Also include VPWidenIntOrFpInductionRecipes that represent a widened // version of the canonical induction. - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); for (VPRecipeBase &Phi : HeaderVPBB->phis()) { auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); if (WidenOriginalIV && WidenOriginalIV->isCanonical()) @@ -2441,8 +2468,9 @@ void VPlanTransforms::addActiveLaneMask( "DataAndControlFlowWithoutRuntimeCheck implies " "UseActiveLaneMaskForControlFlow"); - auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(), - IsaPred<VPWidenCanonicalIVRecipe>); + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + auto *FoundWidenCanonicalIVUser = find_if( + LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>); assert(FoundWidenCanonicalIVUser && "Must have widened canonical IV when tail folding!"); VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan); @@ -2455,7 +2483,7 @@ void VPlanTransforms::addActiveLaneMask( } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1)); LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, @@ -2565,9 +2593,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { }); assert(all_of(Plan.getVFxUF().users(), - [&Plan](VPUser *U) { - return match(U, m_c_Add(m_Specific(Plan.getCanonicalIV()), - m_Specific(&Plan.getVFxUF()))) || + [&LoopRegion, &Plan](VPUser *U) { + return match(U, + m_c_Add(m_Specific(LoopRegion->getCanonicalIV()), + m_Specific(&Plan.getVFxUF()))) || isa<VPWidenPointerInductionRecipe>(U); }) && "Only users of VFxUF should be VPWidenPointerInductionRecipe and the " @@ -2722,9 +2751,10 @@ void VPlanTransforms::addExplicitVectorLength( VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) { if (Plan.hasScalarVFOnly()) return; - VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); - auto *CanonicalIVPHI = Plan.getCanonicalIV(); + auto *CanonicalIVPHI = LoopRegion->getCanonicalIV(); auto *CanIVTy = CanonicalIVPHI->getScalarType(); VPValue *StartV = CanonicalIVPHI->getStartValue(); @@ -2868,7 +2898,7 @@ void VPlanTransforms::replaceSymbolicStrides( // evolution. 
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { auto *R = cast<VPRecipeBase>(&U); - return R->getParent()->getParent() || + return R->getRegion() || R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor(); }; ValueToSCEVMapTy RewriteMap; @@ -3773,8 +3803,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { continue; auto *DefR = cast<VPRecipeWithIRFlags>(&R); auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) { - VPRegionBlock *ParentRegion = - cast<VPRecipeBase>(U)->getParent()->getParent(); + VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion(); return !U->usesScalars(DefR) || ParentRegion != LoopRegion; }; if ((isa<VPReplicateRecipe>(DefR) && @@ -4164,7 +4193,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Adjust induction to reflect that the transformed plan only processes one // original iteration. - auto *CanIV = Plan.getCanonicalIV(); + auto *CanIV = VectorLoop->getCanonicalIV(); auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue()); VPBuilder PHBuilder(Plan.getVectorPreheader()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 1c4adfc..5aeda3e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -69,7 +69,8 @@ class UnrollState { VPBasicBlock::iterator InsertPtForPhi); VPValue *getConstantVPV(unsigned Part) { - Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType(); + Type *CanIVIntTy = + Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType(); return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); } @@ -351,8 +352,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { // Compute*Result which combine all parts to compute the final value. 
VPValue *Op1; if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) || - match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>( - m_VPValue(Op1))) || + match(&R, m_FirstActiveLane(m_VPValue(Op1))) || match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>( m_VPValue(), m_VPValue(), m_VPValue(Op1))) || match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>( diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 66748c5..8b1b0e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -53,7 +53,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { return Expanded; } -bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { +bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { if (isa<VPActiveLaneMaskPHIRecipe>(V)) return true; @@ -67,12 +67,14 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One()))) return B == Plan.getTripCount() && - (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_One(), - m_Specific(&Plan.getVF()))) || + (match(A, + m_ScalarIVSteps( + m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()), + m_One(), m_Specific(&Plan.getVF()))) || IsWideCanonicalIV(A)); return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) && - B == Plan.getOrCreateBackedgeTakenCount(); + B == Plan.getBackedgeTakenCount(); } const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { @@ -102,7 +104,8 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { return all_of(R->operands(), isUniformAcrossVFsAndUFs); } - auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV(); + auto *CanonicalIV = + R->getParent()->getEnclosingLoopRegion()->getCanonicalIV(); // Canonical IV chain is uniform. if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue()) return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 0222b0a..9a2497e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) { return true; if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) { - const VPRegionBlock *RegionOfR = Rep->getParent()->getParent(); + const VPRegionBlock *RegionOfR = Rep->getRegion(); // Don't consider recipes in replicate regions as uniform yet; their first // lane cannot be accessed when executing the replicate region for other // lanes. @@ -90,7 +90,7 @@ inline bool isSingleScalar(const VPValue *VPV) { } /// Return true if \p V is a header mask in \p Plan. -bool isHeaderMask(const VPValue *V, VPlan &Plan); +bool isHeaderMask(const VPValue *V, const VPlan &Plan); /// Checks if \p V is uniform across all VF lanes and UF parts. 
It is considered
/// as such if it is either loop invariant (defined outside the vector region)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 5262af6..91734a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -298,11 +298,16 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
 return false;
 }
 }
- if (const auto *EVL = dyn_cast<VPInstruction>(&R)) {
- if (EVL->getOpcode() == VPInstruction::ExplicitVectorLength &&
- !verifyEVLRecipe(*EVL)) {
- errs() << "EVL VPValue is not used correctly\n";
- return false;
+ if (const auto *VPI = dyn_cast<VPInstruction>(&R)) {
+ switch (VPI->getOpcode()) {
+ case VPInstruction::ExplicitVectorLength:
+ if (!verifyEVLRecipe(*VPI)) {
+ errs() << "EVL VPValue is not used correctly\n";
+ return false;
+ }
+ break;
+ default:
+ break;
 }
 }
 }
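
The change that recurs across the VPlanTransforms.cpp, VPlanUnroll.cpp, and VPlanUtils.cpp hunks above is that the canonical induction variable is now looked up through the vector loop region (VPRegionBlock::getCanonicalIV) rather than through VPlan::getCanonicalIV. A minimal sketch of the new access path, assuming the internal VPlan.h header and a plan that has a vector loop region; getCanIVType is a hypothetical helper, not part of the patch:

  // Hypothetical helper mirroring the call sites updated above: reach the
  // canonical IV through the vector loop region, then take its scalar type.
  static Type *getCanIVType(VPlan &Plan) {
    VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
    // The canonical IV phi recipe now hangs off the region, not the plan.
    VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
    return CanIV->getScalarType();
  }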
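The same hunks also shorten the two-step parent walk R->getParent()->getParent() to R->getRegion(). Assuming getRegion() is a plain convenience accessor on VPRecipeBase, its behavior amounts to the sketch below (the free function is illustrative only, not the accessor's actual definition):

  // Illustrative equivalent of the getRegion() accessor used above: the
  // region enclosing the recipe's parent block, or null at the top level.
  static VPRegionBlock *enclosingRegion(VPRecipeBase &R) {
    VPBasicBlock *ParentVPBB = R.getParent();
    return ParentVPBB ? ParentVPBB->getParent() : nullptr;
  }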
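Finally, the VPlanUtils.cpp and VPlanUtils.h hunks const-qualify isHeaderMask, which is possible because its body now reads the existing backedge-taken count (getBackedgeTakenCount) instead of creating one on demand (getOrCreateBackedgeTakenCount), so the query no longer mutates the plan. A hypothetical const call site this enables; inspectMask is illustrative only:

  // With the const-qualified signature, header-mask queries can be issued
  // from code that holds only a const VPlan.
  static bool inspectMask(const VPValue *V, const VPlan &Plan) {
    return vputils::isHeaderMask(V, Plan);
  }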