Diffstat (limited to 'llvm/lib')
135 files changed, 2284 insertions, 1474 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 88533f2..031d675 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -45,8 +45,10 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -1687,6 +1689,58 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::x86_avx512_cvttsd2usi64: return !Call->isStrictFP(); + // NVVM float/double to int32/uint32 conversion intrinsics + case Intrinsic::nvvm_f2i_rm: + case Intrinsic::nvvm_f2i_rn: + case Intrinsic::nvvm_f2i_rp: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_f2i_rm_ftz: + case Intrinsic::nvvm_f2i_rn_ftz: + case Intrinsic::nvvm_f2i_rp_ftz: + case Intrinsic::nvvm_f2i_rz_ftz: + case Intrinsic::nvvm_f2ui_rm: + case Intrinsic::nvvm_f2ui_rn: + case Intrinsic::nvvm_f2ui_rp: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_f2ui_rm_ftz: + case Intrinsic::nvvm_f2ui_rn_ftz: + case Intrinsic::nvvm_f2ui_rp_ftz: + case Intrinsic::nvvm_f2ui_rz_ftz: + case Intrinsic::nvvm_d2i_rm: + case Intrinsic::nvvm_d2i_rn: + case Intrinsic::nvvm_d2i_rp: + case Intrinsic::nvvm_d2i_rz: + case Intrinsic::nvvm_d2ui_rm: + case Intrinsic::nvvm_d2ui_rn: + case Intrinsic::nvvm_d2ui_rp: + case Intrinsic::nvvm_d2ui_rz: + + // NVVM float/double to int64/uint64 conversion intrinsics + case Intrinsic::nvvm_f2ll_rm: + case Intrinsic::nvvm_f2ll_rn: + case Intrinsic::nvvm_f2ll_rp: + case Intrinsic::nvvm_f2ll_rz: + case Intrinsic::nvvm_f2ll_rm_ftz: + case Intrinsic::nvvm_f2ll_rn_ftz: + case Intrinsic::nvvm_f2ll_rp_ftz: + case Intrinsic::nvvm_f2ll_rz_ftz: + case Intrinsic::nvvm_f2ull_rm: + case Intrinsic::nvvm_f2ull_rn: + case Intrinsic::nvvm_f2ull_rp: + case Intrinsic::nvvm_f2ull_rz: + case Intrinsic::nvvm_f2ull_rm_ftz: + case Intrinsic::nvvm_f2ull_rn_ftz: + case Intrinsic::nvvm_f2ull_rp_ftz: + case Intrinsic::nvvm_f2ull_rz_ftz: + case Intrinsic::nvvm_d2ll_rm: + case Intrinsic::nvvm_d2ll_rn: + case Intrinsic::nvvm_d2ll_rp: + case Intrinsic::nvvm_d2ll_rz: + case Intrinsic::nvvm_d2ull_rm: + case Intrinsic::nvvm_d2ull_rn: + case Intrinsic::nvvm_d2ull_rp: + case Intrinsic::nvvm_d2ull_rz: + // Sign operations are actually bitwise operations, they do not raise // exceptions even for SNANs. 
case Intrinsic::fabs: @@ -1849,6 +1903,12 @@ inline bool llvm_fenv_testexcept() { return false; } +static const APFloat FTZPreserveSign(const APFloat &V) { + if (V.isDenormal()) + return APFloat::getZero(V.getSemantics(), V.isNegative()); + return V; +} + Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty) { llvm_fenv_clearexcept(); @@ -2309,6 +2369,85 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return ConstantFP::get(Ty->getContext(), U); } + // NVVM float/double to signed/unsigned int32/int64 conversions: + switch (IntrinsicID) { + // f2i + case Intrinsic::nvvm_f2i_rm: + case Intrinsic::nvvm_f2i_rn: + case Intrinsic::nvvm_f2i_rp: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_f2i_rm_ftz: + case Intrinsic::nvvm_f2i_rn_ftz: + case Intrinsic::nvvm_f2i_rp_ftz: + case Intrinsic::nvvm_f2i_rz_ftz: + // f2ui + case Intrinsic::nvvm_f2ui_rm: + case Intrinsic::nvvm_f2ui_rn: + case Intrinsic::nvvm_f2ui_rp: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_f2ui_rm_ftz: + case Intrinsic::nvvm_f2ui_rn_ftz: + case Intrinsic::nvvm_f2ui_rp_ftz: + case Intrinsic::nvvm_f2ui_rz_ftz: + // d2i + case Intrinsic::nvvm_d2i_rm: + case Intrinsic::nvvm_d2i_rn: + case Intrinsic::nvvm_d2i_rp: + case Intrinsic::nvvm_d2i_rz: + // d2ui + case Intrinsic::nvvm_d2ui_rm: + case Intrinsic::nvvm_d2ui_rn: + case Intrinsic::nvvm_d2ui_rp: + case Intrinsic::nvvm_d2ui_rz: + // f2ll + case Intrinsic::nvvm_f2ll_rm: + case Intrinsic::nvvm_f2ll_rn: + case Intrinsic::nvvm_f2ll_rp: + case Intrinsic::nvvm_f2ll_rz: + case Intrinsic::nvvm_f2ll_rm_ftz: + case Intrinsic::nvvm_f2ll_rn_ftz: + case Intrinsic::nvvm_f2ll_rp_ftz: + case Intrinsic::nvvm_f2ll_rz_ftz: + // f2ull + case Intrinsic::nvvm_f2ull_rm: + case Intrinsic::nvvm_f2ull_rn: + case Intrinsic::nvvm_f2ull_rp: + case Intrinsic::nvvm_f2ull_rz: + case Intrinsic::nvvm_f2ull_rm_ftz: + case Intrinsic::nvvm_f2ull_rn_ftz: + case Intrinsic::nvvm_f2ull_rp_ftz: + case Intrinsic::nvvm_f2ull_rz_ftz: + // d2ll + case Intrinsic::nvvm_d2ll_rm: + case Intrinsic::nvvm_d2ll_rn: + case Intrinsic::nvvm_d2ll_rp: + case Intrinsic::nvvm_d2ll_rz: + // d2ull + case Intrinsic::nvvm_d2ull_rm: + case Intrinsic::nvvm_d2ull_rn: + case Intrinsic::nvvm_d2ull_rp: + case Intrinsic::nvvm_d2ull_rz: { + // In float-to-integer conversion, NaN inputs are converted to 0. + if (U.isNaN()) + return ConstantInt::get(Ty, 0); + + APFloat::roundingMode RMode = nvvm::IntrinsicGetRoundingMode(IntrinsicID); + bool IsFTZ = nvvm::IntrinsicShouldFTZ(IntrinsicID); + bool IsSigned = nvvm::IntrinsicConvertsToSignedInteger(IntrinsicID); + + APSInt ResInt(Ty->getIntegerBitWidth(), !IsSigned); + auto FloatToRound = IsFTZ ? FTZPreserveSign(U) : U; + + bool IsExact = false; + APFloat::opStatus Status = + FloatToRound.convertToInteger(ResInt, RMode, &IsExact); + + if (Status != APFloat::opInvalidOp) + return ConstantInt::get(Ty, ResInt); + return nullptr; + } + } + /// We only fold functions with finite arguments. Folding NaN and inf is /// likely to be aborted with an exception anyway, and some host libms /// have known errors raising exceptions. 
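The new ConstantFolding.cpp cases above come down to three steps: a NaN input folds to 0, the *_ftz variants flush a denormal input to signed zero before converting, and the value is then rounded to the target integer width with the intrinsic's rounding mode (no fold is emitted if the conversion is invalid, e.g. out of range). A minimal standalone sketch of that logic follows; the helper name and its free-function form are illustrative assumptions, not part of the patch:

// Illustrative sketch only (not the patch itself): the conversion semantics
// the new nvvm_f2i/f2ui/d2ll/... folding cases rely on, written against the
// same APFloat/APSInt APIs. foldNVVMFloatToInt is a hypothetical helper.
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APSInt.h"
using namespace llvm;

static APSInt foldNVVMFloatToInt(APFloat V, bool IsFTZ, bool IsSigned,
                                 APFloat::roundingMode RM, unsigned BitWidth) {
  APSInt Res(BitWidth, /*isUnsigned=*/!IsSigned); // Zero-initialized result.
  if (V.isNaN())
    return Res;                                   // NaN inputs convert to 0.
  if (IsFTZ && V.isDenormal())                    // *_ftz flushes denormal
    V = APFloat::getZero(V.getSemantics(), V.isNegative()); // inputs to +/-0.
  bool IsExact = false;
  // Round with the intrinsic's rounding mode; the real fold bails out
  // (returns nullptr) when this reports opInvalidOp.
  V.convertToInteger(Res, RM, &IsExact);
  return Res;
}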
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 5158064..999386c 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4275,25 +4275,27 @@ Value *llvm::simplifyFCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, return ::simplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } -static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const SimplifyQuery &Q, - bool AllowRefinement, - SmallVectorImpl<Instruction *> *DropFlags, - unsigned MaxRecurse) { +static Value *simplifyWithOpsReplaced(Value *V, + ArrayRef<std::pair<Value *, Value *>> Ops, + const SimplifyQuery &Q, + bool AllowRefinement, + SmallVectorImpl<Instruction *> *DropFlags, + unsigned MaxRecurse) { assert((AllowRefinement || !Q.CanUseUndef) && "If AllowRefinement=false then CanUseUndef=false"); + for (const auto &OpAndRepOp : Ops) { + // We cannot replace a constant, and shouldn't even try. + if (isa<Constant>(OpAndRepOp.first)) + return nullptr; - // Trivial replacement. - if (V == Op) - return RepOp; + // Trivial replacement. + if (V == OpAndRepOp.first) + return OpAndRepOp.second; + } if (!MaxRecurse--) return nullptr; - // We cannot replace a constant, and shouldn't even try. - if (isa<Constant>(Op)) - return nullptr; - auto *I = dyn_cast<Instruction>(V); if (!I) return nullptr; @@ -4303,11 +4305,6 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (isa<PHINode>(I)) return nullptr; - // For vector types, the simplification must hold per-lane, so forbid - // potentially cross-lane operations like shufflevector. - if (Op->getType()->isVectorTy() && !isNotCrossLaneOperation(I)) - return nullptr; - // Don't fold away llvm.is.constant checks based on assumptions. if (match(I, m_Intrinsic<Intrinsic::is_constant>())) return nullptr; @@ -4316,12 +4313,20 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (isa<FreezeInst>(I)) return nullptr; + for (const auto &OpAndRepOp : Ops) { + // For vector types, the simplification must hold per-lane, so forbid + // potentially cross-lane operations like shufflevector. + if (OpAndRepOp.first->getType()->isVectorTy() && + !isNotCrossLaneOperation(I)) + return nullptr; + } + // Replace Op with RepOp in instruction operands. SmallVector<Value *, 8> NewOps; bool AnyReplaced = false; for (Value *InstOp : I->operands()) { - if (Value *NewInstOp = simplifyWithOpReplaced( - InstOp, Op, RepOp, Q, AllowRefinement, DropFlags, MaxRecurse)) { + if (Value *NewInstOp = simplifyWithOpsReplaced( + InstOp, Ops, Q, AllowRefinement, DropFlags, MaxRecurse)) { NewOps.push_back(NewInstOp); AnyReplaced = InstOp != NewInstOp; } else { @@ -4372,7 +4377,8 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // by assumption and this case never wraps, so nowrap flags can be // ignored. if ((Opcode == Instruction::Sub || Opcode == Instruction::Xor) && - NewOps[0] == RepOp && NewOps[1] == RepOp) + NewOps[0] == NewOps[1] && + any_of(Ops, [=](const auto &Rep) { return NewOps[0] == Rep.second; })) return Constant::getNullValue(I->getType()); // If we are substituting an absorber constant into a binop and extra @@ -4382,10 +4388,10 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // (Op == 0) ? 0 : (Op & -Op) --> Op & -Op // (Op == 0) ? 0 : (Op * (binop Op, C)) --> Op * (binop Op, C) // (Op == -1) ? 
-1 : (Op | (binop C, Op) --> Op | (binop C, Op) - Constant *Absorber = - ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); + Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); if ((NewOps[0] == Absorber || NewOps[1] == Absorber) && - impliesPoison(BO, Op)) + any_of(Ops, + [=](const auto &Rep) { return impliesPoison(BO, Rep.first); })) return Absorber; } @@ -4453,6 +4459,15 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, /*AllowNonDeterministic=*/false); } +static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, + bool AllowRefinement, + SmallVectorImpl<Instruction *> *DropFlags, + unsigned MaxRecurse) { + return simplifyWithOpsReplaced(V, {{Op, RepOp}}, Q, AllowRefinement, + DropFlags, MaxRecurse); +} + Value *llvm::simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const SimplifyQuery &Q, bool AllowRefinement, @@ -4595,21 +4610,20 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS, /// Try to simplify a select instruction when its condition operand is an /// integer equality or floating-point equivalence comparison. -static Value *simplifySelectWithEquivalence(Value *CmpLHS, Value *CmpRHS, - Value *TrueVal, Value *FalseVal, - const SimplifyQuery &Q, - unsigned MaxRecurse) { +static Value *simplifySelectWithEquivalence( + ArrayRef<std::pair<Value *, Value *>> Replacements, Value *TrueVal, + Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { Value *SimplifiedFalseVal = - simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q.getWithoutUndef(), - /* AllowRefinement */ false, - /* DropFlags */ nullptr, MaxRecurse); + simplifyWithOpsReplaced(FalseVal, Replacements, Q.getWithoutUndef(), + /* AllowRefinement */ false, + /* DropFlags */ nullptr, MaxRecurse); if (!SimplifiedFalseVal) SimplifiedFalseVal = FalseVal; Value *SimplifiedTrueVal = - simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, - /* AllowRefinement */ true, - /* DropFlags */ nullptr, MaxRecurse); + simplifyWithOpsReplaced(TrueVal, Replacements, Q, + /* AllowRefinement */ true, + /* DropFlags */ nullptr, MaxRecurse); if (!SimplifiedTrueVal) SimplifiedTrueVal = TrueVal; @@ -4707,10 +4721,10 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, // the arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. if (Pred == ICmpInst::ICMP_EQ) { - if (Value *V = simplifySelectWithEquivalence(CmpLHS, CmpRHS, TrueVal, + if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse)) return V; - if (Value *V = simplifySelectWithEquivalence(CmpRHS, CmpLHS, TrueVal, + if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, TrueVal, FalseVal, Q, MaxRecurse)) return V; @@ -4720,11 +4734,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, if (match(CmpLHS, m_Or(m_Value(X), m_Value(Y))) && match(CmpRHS, m_Zero())) { // (X | Y) == 0 implies X == 0 and Y == 0. 
- if (Value *V = simplifySelectWithEquivalence(X, CmpRHS, TrueVal, FalseVal, - Q, MaxRecurse)) - return V; - if (Value *V = simplifySelectWithEquivalence(Y, CmpRHS, TrueVal, FalseVal, - Q, MaxRecurse)) + if (Value *V = simplifySelectWithEquivalence( + {{X, CmpRHS}, {Y, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse)) return V; } @@ -4732,11 +4743,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, if (match(CmpLHS, m_And(m_Value(X), m_Value(Y))) && match(CmpRHS, m_AllOnes())) { // (X & Y) == -1 implies X == -1 and Y == -1. - if (Value *V = simplifySelectWithEquivalence(X, CmpRHS, TrueVal, FalseVal, - Q, MaxRecurse)) - return V; - if (Value *V = simplifySelectWithEquivalence(Y, CmpRHS, TrueVal, FalseVal, - Q, MaxRecurse)) + if (Value *V = simplifySelectWithEquivalence( + {{X, CmpRHS}, {Y, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse)) return V; } } @@ -4765,11 +4773,11 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, // This transforms is safe if at least one operand is known to not be zero. // Otherwise, the select can change the sign of a zero operand. if (IsEquiv) { - if (Value *V = - simplifySelectWithEquivalence(CmpLHS, CmpRHS, T, F, Q, MaxRecurse)) + if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, T, F, Q, + MaxRecurse)) return V; - if (Value *V = - simplifySelectWithEquivalence(CmpRHS, CmpLHS, T, F, Q, MaxRecurse)) + if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, T, F, Q, + MaxRecurse)) return V; } diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 4689451..e9d96a0c 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -266,6 +266,30 @@ void Lint::visitCallBase(CallBase &I) { visitMemoryReference(I, Loc, DL->getABITypeAlign(Ty), Ty, MemRef::Read | MemRef::Write); } + + // Check that ABI attributes for the function and call-site match. 
+ unsigned ArgNo = AI->getOperandNo(); + Attribute::AttrKind ABIAttributes[] = { + Attribute::ZExt, Attribute::SExt, Attribute::InReg, + Attribute::ByVal, Attribute::ByRef, Attribute::InAlloca, + Attribute::Preallocated, Attribute::StructRet}; + AttributeList CallAttrs = I.getAttributes(); + for (Attribute::AttrKind Attr : ABIAttributes) { + Attribute CallAttr = CallAttrs.getParamAttr(ArgNo, Attr); + Attribute FnAttr = F->getParamAttribute(ArgNo, Attr); + Check(CallAttr.isValid() == FnAttr.isValid(), + Twine("Undefined behavior: ABI attribute ") + + Attribute::getNameFromAttrKind(Attr) + + " not present on both function and call-site", + &I); + if (CallAttr.isValid() && FnAttr.isValid()) { + Check(CallAttr == FnAttr, + Twine("Undefined behavior: ABI attribute ") + + Attribute::getNameFromAttrKind(Attr) + + " does not have same argument for function and call-site", + &I); + } + } } } } diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 54b9521..bc03e40 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -25,10 +25,9 @@ using namespace llvm; -static bool isAligned(const Value *Base, const APInt &Offset, Align Alignment, +static bool isAligned(const Value *Base, Align Alignment, const DataLayout &DL) { - Align BA = Base->getPointerAlignment(DL); - return BA >= Alignment && Offset.isAligned(BA); + return Base->getPointerAlignment(DL) >= Alignment; } /// Test if V is always a pointer to allocated and suitably aligned memory for @@ -118,8 +117,7 @@ static bool isDereferenceableAndAlignedPointer( // As we recursed through GEPs to get here, we've incrementally checked // that each step advanced by a multiple of the alignment. If our base is // properly aligned, then the original offset accessed must also be. - APInt Offset(DL.getTypeStoreSizeInBits(V->getType()), 0); - return isAligned(V, Offset, Alignment, DL); + return isAligned(V, Alignment, DL); } /// TODO refactor this function to be able to search independently for @@ -154,8 +152,7 @@ static bool isDereferenceableAndAlignedPointer( // checked that each step advanced by a multiple of the alignment. If // our base is properly aligned, then the original offset accessed // must also be. 
- APInt Offset(DL.getTypeStoreSizeInBits(V->getType()), 0); - return isAligned(V, Offset, Alignment, DL); + return isAligned(V, Alignment, DL); } } } diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index b5668a1..7e18f7c 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -226,7 +226,7 @@ static cl::opt<unsigned> RangeIterThreshold( static cl::opt<unsigned> MaxLoopGuardCollectionDepth( "scalar-evolution-max-loop-guard-collection-depth", cl::Hidden, - cl::desc("Maximum depth for recrusive loop guard collection"), cl::init(1)); + cl::desc("Maximum depth for recursive loop guard collection"), cl::init(1)); static cl::opt<bool> ClassifyExpressions("scalar-evolution-classify-expressions", diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 2f6e869..0eb43dd 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -8641,6 +8641,80 @@ SelectPatternResult llvm::getSelectPattern(CmpInst::Predicate Pred, } } +std::optional<std::pair<CmpPredicate, Constant *>> +llvm::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, Constant *C) { + assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && + "Only for relational integer predicates."); + if (isa<UndefValue>(C)) + return std::nullopt; + + Type *Type = C->getType(); + bool IsSigned = ICmpInst::isSigned(Pred); + + CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred); + bool WillIncrement = + UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT; + + // Check if the constant operand can be safely incremented/decremented + // without overflowing/underflowing. + auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) { + return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); + }; + + Constant *SafeReplacementConstant = nullptr; + if (auto *CI = dyn_cast<ConstantInt>(C)) { + // Bail out if the constant can't be safely incremented/decremented. + if (!ConstantIsOk(CI)) + return std::nullopt; + } else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) { + unsigned NumElts = FVTy->getNumElements(); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (!Elt) + return std::nullopt; + + if (isa<UndefValue>(Elt)) + continue; + + // Bail out if we can't determine if this constant is min/max or if we + // know that this constant is min/max. + auto *CI = dyn_cast<ConstantInt>(Elt); + if (!CI || !ConstantIsOk(CI)) + return std::nullopt; + + if (!SafeReplacementConstant) + SafeReplacementConstant = CI; + } + } else if (isa<VectorType>(C->getType())) { + // Handle scalable splat + Value *SplatC = C->getSplatValue(); + auto *CI = dyn_cast_or_null<ConstantInt>(SplatC); + // Bail out if the constant can't be safely incremented/decremented. + if (!CI || !ConstantIsOk(CI)) + return std::nullopt; + } else { + // ConstantExpr? + return std::nullopt; + } + + // It may not be safe to change a compare predicate in the presence of + // undefined elements, so replace those elements with the first safe constant + // that we found. + // TODO: in case of poison, it is safe; let's replace undefs only. + if (C->containsUndefOrPoisonElement()) { + assert(SafeReplacementConstant && "Replacement constant not set"); + C = Constant::replaceUndefsWith(C, SafeReplacementConstant); + } + + CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); + + // Increment or decrement the constant. 
+ Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true); + Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne); + + return std::make_pair(NewPred, NewC); +} + static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred, FastMathFlags FMF, Value *CmpLHS, Value *CmpRHS, diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 7bd3fb3..3ba4590 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -3914,21 +3914,22 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, if (isa<ConstantAggregateZero>(CV)) { StructType *structType; if (AliasList && (structType = llvm::dyn_cast<StructType>(CV->getType()))) { - // Handle cases of aliases to direct struct elements - const StructLayout *Layout = DL.getStructLayout(structType); - uint64_t SizeSoFar = 0; - for (unsigned int i = 0, n = structType->getNumElements(); i < n - 1; - ++i) { - uint64_t GapToNext = Layout->getElementOffset(i + 1) - SizeSoFar; - AP.OutStreamer->emitZeros(GapToNext); - SizeSoFar += GapToNext; - emitGlobalAliasInline(AP, Offset + SizeSoFar, AliasList); + unsigned numElements = {structType->getNumElements()}; + if (numElements != 0) { + // Handle cases of aliases to direct struct elements + const StructLayout *Layout = DL.getStructLayout(structType); + uint64_t SizeSoFar = 0; + for (unsigned int i = 0; i < numElements - 1; ++i) { + uint64_t GapToNext = Layout->getElementOffset(i + 1) - SizeSoFar; + AP.OutStreamer->emitZeros(GapToNext); + SizeSoFar += GapToNext; + emitGlobalAliasInline(AP, Offset + SizeSoFar, AliasList); + } + AP.OutStreamer->emitZeros(Size - SizeSoFar); + return; } - AP.OutStreamer->emitZeros(Size - SizeSoFar); - return; - } else { - return AP.OutStreamer->emitZeros(Size); } + return AP.OutStreamer->emitZeros(Size); } if (isa<UndefValue>(CV)) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index e1291e2..11de4b6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -3789,6 +3789,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, // they depend on addresses, throwing them out and rebuilding them. setCurrentDWARF5AccelTable(DWARF5AccelTableKind::CU); CU.constructTypeDIE(RefDie, cast<DICompositeType>(CTy)); + CU.updateAcceleratorTables(CTy->getScope(), CTy, RefDie); return; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 0225654..1632053 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -315,6 +315,11 @@ public: /// Get context owner's DIE. DIE *createTypeDIE(const DICompositeType *Ty); + /// If this is a named finished type then include it in the list of types for + /// the accelerator tables. + void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, + const DIE &TyDIE); + protected: ~DwarfUnit(); @@ -357,11 +362,6 @@ private: virtual void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) = 0; - /// If this is a named finished type then include it in the list of types for - /// the accelerator tables. 
- void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, - const DIE &TyDIE); - virtual bool isDwoUnit() const = 0; const MCSymbol *getCrossSectionRelativeBaseAddress() const override; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 5c712e4..ba1b10e 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -152,7 +152,7 @@ static cl::opt<bool> static cl::opt<bool> EnableAndCmpSinking("enable-andcmp-sinking", cl::Hidden, cl::init(true), - cl::desc("Enable sinkinig and/cmp into branches.")); + cl::desc("Enable sinking and/cmp into branches.")); static cl::opt<bool> DisableStoreExtract( "disable-cgp-store-extract", cl::Hidden, cl::init(false), diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp index 23db09b..9bba50e8 100644 --- a/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -46,8 +46,9 @@ static cl::opt<bool> ShowFSBranchProb( cl::desc("Print setting flow sensitive branch probabilities")); static cl::opt<unsigned> FSProfileDebugProbDiffThreshold( "fs-profile-debug-prob-diff-threshold", cl::init(10), - cl::desc("Only show debug message if the branch probility is greater than " - "this value (in percentage).")); + cl::desc( + "Only show debug message if the branch probability is greater than " + "this value (in percentage).")); static cl::opt<unsigned> FSProfileDebugBWThreshold( "fs-profile-debug-bw-threshold", cl::init(10000), diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 0f68313..05bc4cf 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -149,7 +149,7 @@ static cl::opt<unsigned> JumpInstCost("jump-inst-cost", static cl::opt<bool> TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. 
" - "Creates more fallthrough opportunites in " + "Creates more fallthrough opportunities in " "outline branches."), cl::init(true), cl::Hidden); diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 56fffff..2e92dd8 100644 --- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -29,7 +29,7 @@ namespace llvm { cl::opt<unsigned> StaticLikelyProb("static-likely-prob", cl::desc("branch probability threshold in percentage" - "to be considered very likely"), + " to be considered very likely"), cl::init(80), cl::Hidden); cl::opt<unsigned> ProfileLikelyProb( diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 79b0fa6..3ab6315 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -30,22 +30,22 @@ static bool isValidRegUse(const MachineOperand &MO) { return isValidReg(MO) && MO.isUse(); } -static bool isValidRegUseOf(const MachineOperand &MO, MCRegister PhysReg, +static bool isValidRegUseOf(const MachineOperand &MO, MCRegister Reg, const TargetRegisterInfo *TRI) { if (!isValidRegUse(MO)) return false; - return TRI->regsOverlap(MO.getReg(), PhysReg); + return TRI->regsOverlap(MO.getReg(), Reg); } static bool isValidRegDef(const MachineOperand &MO) { return isValidReg(MO) && MO.isDef(); } -static bool isValidRegDefOf(const MachineOperand &MO, MCRegister PhysReg, +static bool isValidRegDefOf(const MachineOperand &MO, MCRegister Reg, const TargetRegisterInfo *TRI) { if (!isValidRegDef(MO)) return false; - return TRI->regsOverlap(MO.getReg(), PhysReg); + return TRI->regsOverlap(MO.getReg(), Reg); } void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) { @@ -261,7 +261,7 @@ void ReachingDefAnalysis::traverse() { } int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, - MCRegister PhysReg) const { + MCRegister Reg) const { assert(InstIds.count(MI) && "Unexpected machine instuction."); int InstId = InstIds.lookup(MI); int DefRes = ReachingDefDefaultVal; @@ -269,7 +269,7 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, assert(MBBNumber < MBBReachingDefs.numBlockIDs() && "Unexpected basic block number."); int LatestDef = ReachingDefDefaultVal; - for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + for (MCRegUnit Unit : TRI->regunits(Reg)) { for (int Def : MBBReachingDefs.defs(MBBNumber, Unit)) { if (Def >= InstId) break; @@ -280,22 +280,21 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, return LatestDef; } -MachineInstr * -ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI, - MCRegister PhysReg) const { - return hasLocalDefBefore(MI, PhysReg) - ? getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg)) - : nullptr; +MachineInstr *ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI, + MCRegister Reg) const { + return hasLocalDefBefore(MI, Reg) + ? 
getInstFromId(MI->getParent(), getReachingDef(MI, Reg)) + : nullptr; } bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B, - MCRegister PhysReg) const { + MCRegister Reg) const { MachineBasicBlock *ParentA = A->getParent(); MachineBasicBlock *ParentB = B->getParent(); if (ParentA != ParentB) return false; - return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg); + return getReachingDef(A, Reg) == getReachingDef(B, Reg); } MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB, @@ -318,19 +317,18 @@ MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB, return nullptr; } -int ReachingDefAnalysis::getClearance(MachineInstr *MI, - MCRegister PhysReg) const { +int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCRegister Reg) const { assert(InstIds.count(MI) && "Unexpected machine instuction."); - return InstIds.lookup(MI) - getReachingDef(MI, PhysReg); + return InstIds.lookup(MI) - getReachingDef(MI, Reg); } bool ReachingDefAnalysis::hasLocalDefBefore(MachineInstr *MI, - MCRegister PhysReg) const { - return getReachingDef(MI, PhysReg) >= 0; + MCRegister Reg) const { + return getReachingDef(MI, Reg) >= 0; } void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, - MCRegister PhysReg, + MCRegister Reg, InstSet &Uses) const { MachineBasicBlock *MBB = Def->getParent(); MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def); @@ -340,11 +338,11 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, // If/when we find a new reaching def, we know that there's no more uses // of 'Def'. - if (getReachingLocalMIDef(&*MI, PhysReg) != Def) + if (getReachingLocalMIDef(&*MI, Reg) != Def) return; for (auto &MO : MI->operands()) { - if (!isValidRegUseOf(MO, PhysReg, TRI)) + if (!isValidRegUseOf(MO, Reg, TRI)) continue; Uses.insert(&*MI); @@ -354,15 +352,14 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, } } -bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, - MCRegister PhysReg, +bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, MCRegister Reg, InstSet &Uses) const { for (MachineInstr &MI : instructionsWithoutDebug(MBB->instr_begin(), MBB->instr_end())) { for (auto &MO : MI.operands()) { - if (!isValidRegUseOf(MO, PhysReg, TRI)) + if (!isValidRegUseOf(MO, Reg, TRI)) continue; - if (getReachingDef(&MI, PhysReg) >= 0) + if (getReachingDef(&MI, Reg) >= 0) return false; Uses.insert(&MI); } @@ -370,18 +367,18 @@ bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, auto Last = MBB->getLastNonDebugInstr(); if (Last == MBB->end()) return true; - return isReachingDefLiveOut(&*Last, PhysReg); + return isReachingDefLiveOut(&*Last, Reg); } -void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg, +void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister Reg, InstSet &Uses) const { MachineBasicBlock *MBB = MI->getParent(); // Collect the uses that each def touches within the block. - getReachingLocalUses(MI, PhysReg, Uses); + getReachingLocalUses(MI, Reg, Uses); // Handle live-out values. 
- if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), PhysReg)) { + if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), Reg)) { if (LiveOut != MI) return; @@ -389,9 +386,9 @@ void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg, SmallPtrSet<MachineBasicBlock*, 4>Visited; while (!ToVisit.empty()) { MachineBasicBlock *MBB = ToVisit.pop_back_val(); - if (Visited.count(MBB) || !MBB->isLiveIn(PhysReg)) + if (Visited.count(MBB) || !MBB->isLiveIn(Reg)) continue; - if (getLiveInUses(MBB, PhysReg, Uses)) + if (getLiveInUses(MBB, Reg, Uses)) llvm::append_range(ToVisit, MBB->successors()); Visited.insert(MBB); } @@ -399,25 +396,25 @@ void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg, } void ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI, - MCRegister PhysReg, + MCRegister Reg, InstSet &Defs) const { - if (auto *Def = getUniqueReachingMIDef(MI, PhysReg)) { + if (auto *Def = getUniqueReachingMIDef(MI, Reg)) { Defs.insert(Def); return; } for (auto *MBB : MI->getParent()->predecessors()) - getLiveOuts(MBB, PhysReg, Defs); + getLiveOuts(MBB, Reg, Defs); } -void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, - MCRegister PhysReg, InstSet &Defs) const { +void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, + InstSet &Defs) const { SmallPtrSet<MachineBasicBlock*, 2> VisitedBBs; - getLiveOuts(MBB, PhysReg, Defs, VisitedBBs); + getLiveOuts(MBB, Reg, Defs, VisitedBBs); } -void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, - MCRegister PhysReg, InstSet &Defs, +void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, + InstSet &Defs, BlockSet &VisitedBBs) const { if (VisitedBBs.count(MBB)) return; @@ -425,28 +422,28 @@ void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, VisitedBBs.insert(MBB); LiveRegUnits LiveRegs(*TRI); LiveRegs.addLiveOuts(*MBB); - if (LiveRegs.available(PhysReg)) + if (LiveRegs.available(Reg)) return; - if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg)) + if (auto *Def = getLocalLiveOutMIDef(MBB, Reg)) Defs.insert(Def); else for (auto *Pred : MBB->predecessors()) - getLiveOuts(Pred, PhysReg, Defs, VisitedBBs); + getLiveOuts(Pred, Reg, Defs, VisitedBBs); } MachineInstr * ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI, - MCRegister PhysReg) const { + MCRegister Reg) const { // If there's a local def before MI, return it. - MachineInstr *LocalDef = getReachingLocalMIDef(MI, PhysReg); + MachineInstr *LocalDef = getReachingLocalMIDef(MI, Reg); if (LocalDef && InstIds.lookup(LocalDef) < InstIds.lookup(MI)) return LocalDef; SmallPtrSet<MachineInstr*, 2> Incoming; MachineBasicBlock *Parent = MI->getParent(); for (auto *Pred : Parent->predecessors()) - getLiveOuts(Pred, PhysReg, Incoming); + getLiveOuts(Pred, Reg, Incoming); // Check that we have a single incoming value and that it does not // come from the same block as MI - since it would mean that the def @@ -469,13 +466,13 @@ MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI, } bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, - MCRegister PhysReg) const { + MCRegister Reg) const { MachineBasicBlock *MBB = MI->getParent(); LiveRegUnits LiveRegs(*TRI); LiveRegs.addLiveOuts(*MBB); // Yes if the register is live out of the basic block. 
- if (!LiveRegs.available(PhysReg)) + if (!LiveRegs.available(Reg)) return true; // Walk backwards through the block to see if the register is live at some @@ -483,62 +480,61 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, for (MachineInstr &Last : instructionsWithoutDebug(MBB->instr_rbegin(), MBB->instr_rend())) { LiveRegs.stepBackward(Last); - if (!LiveRegs.available(PhysReg)) + if (!LiveRegs.available(Reg)) return InstIds.lookup(&Last) > InstIds.lookup(MI); } return false; } bool ReachingDefAnalysis::isRegDefinedAfter(MachineInstr *MI, - MCRegister PhysReg) const { + MCRegister Reg) const { MachineBasicBlock *MBB = MI->getParent(); auto Last = MBB->getLastNonDebugInstr(); if (Last != MBB->end() && - getReachingDef(MI, PhysReg) != getReachingDef(&*Last, PhysReg)) + getReachingDef(MI, Reg) != getReachingDef(&*Last, Reg)) return true; - if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg)) - return Def == getReachingLocalMIDef(MI, PhysReg); + if (auto *Def = getLocalLiveOutMIDef(MBB, Reg)) + return Def == getReachingLocalMIDef(MI, Reg); return false; } bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI, - MCRegister PhysReg) const { + MCRegister Reg) const { MachineBasicBlock *MBB = MI->getParent(); LiveRegUnits LiveRegs(*TRI); LiveRegs.addLiveOuts(*MBB); - if (LiveRegs.available(PhysReg)) + if (LiveRegs.available(Reg)) return false; auto Last = MBB->getLastNonDebugInstr(); - int Def = getReachingDef(MI, PhysReg); - if (Last != MBB->end() && getReachingDef(&*Last, PhysReg) != Def) + int Def = getReachingDef(MI, Reg); + if (Last != MBB->end() && getReachingDef(&*Last, Reg) != Def) return false; // Finally check that the last instruction doesn't redefine the register. for (auto &MO : Last->operands()) - if (isValidRegDefOf(MO, PhysReg, TRI)) + if (isValidRegDefOf(MO, Reg, TRI)) return false; return true; } -MachineInstr * -ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB, - MCRegister PhysReg) const { +MachineInstr *ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB, + MCRegister Reg) const { LiveRegUnits LiveRegs(*TRI); LiveRegs.addLiveOuts(*MBB); - if (LiveRegs.available(PhysReg)) + if (LiveRegs.available(Reg)) return nullptr; auto Last = MBB->getLastNonDebugInstr(); if (Last == MBB->end()) return nullptr; - int Def = getReachingDef(&*Last, PhysReg); + int Def = getReachingDef(&*Last, Reg); for (auto &MO : Last->operands()) - if (isValidRegDefOf(MO, PhysReg, TRI)) + if (isValidRegDefOf(MO, Reg, TRI)) return &*Last; return Def < 0 ? 
nullptr : getInstFromId(MBB, Def); @@ -650,7 +646,7 @@ ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &Visited, void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI, InstSet &Dead) const { Dead.insert(MI); - auto IsDead = [this, &Dead](MachineInstr *Def, MCRegister PhysReg) { + auto IsDead = [this, &Dead](MachineInstr *Def, MCRegister Reg) { if (mayHaveSideEffects(*Def)) return false; @@ -666,7 +662,7 @@ void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI, return false; SmallPtrSet<MachineInstr*, 4> Uses; - getGlobalUses(Def, PhysReg, Uses); + getGlobalUses(Def, Reg, Uses); return llvm::set_is_subset(Uses, Dead); }; @@ -680,18 +676,18 @@ void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI, } bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, - MCRegister PhysReg) const { + MCRegister Reg) const { SmallPtrSet<MachineInstr*, 1> Ignore; - return isSafeToDefRegAt(MI, PhysReg, Ignore); + return isSafeToDefRegAt(MI, Reg, Ignore); } -bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg, +bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg, InstSet &Ignore) const { // Check for any uses of the register after MI. - if (isRegUsedAfter(MI, PhysReg)) { - if (auto *Def = getReachingLocalMIDef(MI, PhysReg)) { + if (isRegUsedAfter(MI, Reg)) { + if (auto *Def = getReachingLocalMIDef(MI, Reg)) { SmallPtrSet<MachineInstr*, 2> Uses; - getGlobalUses(Def, PhysReg, Uses); + getGlobalUses(Def, Reg, Uses); if (!llvm::set_is_subset(Uses, Ignore)) return false; } else @@ -700,13 +696,13 @@ bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg, MachineBasicBlock *MBB = MI->getParent(); // Check for any defs after MI. - if (isRegDefinedAfter(MI, PhysReg)) { + if (isRegDefinedAfter(MI, Reg)) { auto I = MachineBasicBlock::iterator(MI); for (auto E = MBB->end(); I != E; ++I) { if (Ignore.count(&*I)) continue; for (auto &MO : I->operands()) - if (isValidRegDefOf(MO, PhysReg, TRI)) + if (isValidRegDefOf(MO, Reg, TRI)) return false; } } diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index e61dad5..b94992c 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -140,7 +140,7 @@ static cl::opt<bool> GreedyReverseLocalAssignment( static cl::opt<unsigned> SplitThresholdForRegWithHint( "split-threshold-for-reg-with-hint", cl::desc("The threshold for splitting a virtual register with a hint, in " - "percentate"), + "percentage"), cl::init(75), cl::Hidden); static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator", diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 20ad644..8313927 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -113,7 +113,7 @@ static cl::opt<unsigned> LargeIntervalSizeThreshold( static cl::opt<unsigned> LargeIntervalFreqThreshold( "large-interval-freq-threshold", cl::Hidden, - cl::desc("For a large interval, if it is coalesed with other live " + cl::desc("For a large interval, if it is coalesced with other live " "intervals many times more than the threshold, stop its " "coalescing to control the compile time. "), cl::init(256)); @@ -1325,11 +1325,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, const MCInstrDesc &MCID = DefMI->getDesc(); if (MCID.getNumDefs() != 1) return false; - // Only support subregister destinations when the def is read-undef. 
- MachineOperand &DstOperand = CopyMI->getOperand(0); - Register CopyDstReg = DstOperand.getReg(); - if (DstOperand.getSubReg() && !DstOperand.isUndef()) - return false; // If both SrcIdx and DstIdx are set, correct rematerialization would widen // the register substantially (beyond both source and dest size). This is bad @@ -1339,6 +1334,32 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (SrcIdx && DstIdx) return false; + // Only support subregister destinations when the def is read-undef. + MachineOperand &DstOperand = CopyMI->getOperand(0); + Register CopyDstReg = DstOperand.getReg(); + if (DstOperand.getSubReg() && !DstOperand.isUndef()) + return false; + + // In the physical register case, checking that the def is read-undef is not + // enough. We're widening the def and need to avoid clobbering other live + // values in the unused register pieces. + // + // TODO: Targets may support rewriting the rematerialized instruction to only + // touch relevant lanes, in which case we don't need any liveness check. + if (CopyDstReg.isPhysical() && CP.isPartial()) { + for (MCRegUnit Unit : TRI->regunits(DstReg)) { + // Ignore the register units we are writing anyway. + if (is_contained(TRI->regunits(CopyDstReg), Unit)) + continue; + + // Check if the other lanes we are defining are live at the + // rematerialization point. + LiveRange &LR = LIS->getRegUnit(Unit); + if (LR.liveAt(CopyIdx)) + return false; + } + } + const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF); if (!DefMI->isImplicitDef()) { @@ -1375,27 +1396,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, NewMI.setDebugLoc(DL); // In a situation like the following: - // - // undef %2.subreg:reg = INST %1:reg ; DefMI (rematerializable), - // ; DefSubIdx = subreg - // %3:reg = COPY %2 ; SrcIdx = DstIdx = 0 - // .... = SOMEINSTR %3:reg - // - // there are no subranges for %3 so after rematerialization we need - // to explicitly create them. Undefined subranges are removed later on. - if (DstReg.isVirtual() && DefSubIdx && !CP.getSrcIdx() && !CP.getDstIdx() && - MRI->shouldTrackSubRegLiveness(DstReg)) { - LiveInterval &DstInt = LIS->getInterval(DstReg); - if (!DstInt.hasSubRanges()) { - LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstReg); - LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(DefSubIdx); - LaneBitmask UnusedLanes = FullMask & ~UsedLanes; - DstInt.createSubRangeFrom(LIS->getVNInfoAllocator(), UsedLanes, DstInt); - DstInt.createSubRangeFrom(LIS->getVNInfoAllocator(), UnusedLanes, DstInt); - } - } - - // In a situation like the following: // %0:subreg = instr ; DefMI, subreg = DstIdx // %1 = copy %0:subreg ; CopyMI, SrcIdx = 0 // instead of widening %1 to the register class of %0 simply do: @@ -1523,6 +1523,27 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, // sure that "undef" is not set. if (NewIdx == 0) NewMI.getOperand(0).setIsUndef(false); + + // In a situation like the following: + // + // undef %2.subreg:reg = INST %1:reg ; DefMI (rematerializable), + // ; Defines only some of lanes, + // ; so DefSubIdx = NewIdx = subreg + // %3:reg = COPY %2 ; Copy full reg + // .... = SOMEINSTR %3:reg ; Use full reg + // + // there are no subranges for %3 so after rematerialization we need + // to explicitly create them. Undefined subranges are removed later on. 
+ if (NewIdx && !DstInt.hasSubRanges() && + MRI->shouldTrackSubRegLiveness(DstReg)) { + LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstReg); + LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(NewIdx); + LaneBitmask UnusedLanes = FullMask & ~UsedLanes; + VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); + DstInt.createSubRangeFrom(Alloc, UsedLanes, DstInt); + DstInt.createSubRangeFrom(Alloc, UnusedLanes, DstInt); + } + // Add dead subregister definitions if we are defining the whole register // but only part of it is live. // This could happen if the rematerialization instruction is rematerializing diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e89e7ef..da3c834 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -141,7 +141,7 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth( static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable( "combiner-reduce-load-op-store-width-force-narrowing-profitable", cl::Hidden, cl::init(false), - cl::desc("DAG combiner force override the narrowing profitable check when" + cl::desc("DAG combiner force override the narrowing profitable check when " "reducing the width of load/op/store sequences")); static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore( @@ -20455,10 +20455,8 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { Value.hasOneUse()) { LoadSDNode *LD = cast<LoadSDNode>(Value); EVT VT = LD->getMemoryVT(); - if (!VT.isFloatingPoint() || - VT != ST->getMemoryVT() || - LD->isNonTemporal() || - ST->isNonTemporal() || + if (!VT.isSimple() || !VT.isFloatingPoint() || VT != ST->getMemoryVT() || + LD->isNonTemporal() || ST->isNonTemporal() || LD->getPointerInfo().getAddrSpace() != 0 || ST->getPointerInfo().getAddrSpace() != 0) return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index e8404a1..89a00c5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1777,6 +1777,31 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node, assert((BW == 64 || BW == 32) && "Elements in vector-UINT_TO_FP must be 32 or 64 bits wide"); + // If STRICT_/FMUL is not supported by the target (in case of f16) replace the + // UINT_TO_FP with a larger float and round to the smaller type + if ((!IsStrict && !TLI.isOperationLegalOrCustom(ISD::FMUL, DstVT)) || + (IsStrict && !TLI.isOperationLegalOrCustom(ISD::STRICT_FMUL, DstVT))) { + EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64; + SDValue UIToFP; + SDValue Result; + SDValue TargetZero = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true); + EVT FloatVecVT = SrcVT.changeVectorElementType(FPVT); + if (IsStrict) { + UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {FloatVecVT, MVT::Other}, + {Node->getOperand(0), Src}); + Result = DAG.getNode(ISD::STRICT_FP_ROUND, DL, {DstVT, MVT::Other}, + {Node->getOperand(0), UIToFP, TargetZero}); + Results.push_back(Result); + Results.push_back(Result.getValue(1)); + } else { + UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVecVT, Src); + Result = DAG.getNode(ISD::FP_ROUND, DL, DstVT, UIToFP, TargetZero); + Results.push_back(Result); + } + + return; + } + SDValue HalfWord = DAG.getConstant(BW / 2, DL, SrcVT); // Constants to clear the upper part of the word. 
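The LegalizeVectorOps.cpp change above covers targets where the FMUL (or STRICT_FMUL) needed by the usual two-halfword UINT_TO_FP expansion is not legal for the destination FP type (for example f16 vectors): it instead converts through a wider float type (f32 for 32-bit elements, f64 for 64-bit) and performs a single FP_ROUND to the destination type. A scalar sketch of the same idea, using APFloat as a stand-in for the DAG nodes; the function name is an assumption for illustration:

// Illustrative scalar analogue of the new ExpandUINT_TO_FLOAT path:
// uint32 -> f16 computed as uint32 -> f32 followed by one f32 -> f16 round.
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include <cstdint>
using namespace llvm;

static APFloat u32ToHalfViaFloat(uint32_t X) {
  APFloat Wide(APFloat::IEEEsingle());
  // UINT_TO_FP into the wider type.
  Wide.convertFromAPInt(APInt(32, X), /*IsSigned=*/false,
                        APFloat::rmNearestTiesToEven);
  bool LosesInfo = false;
  // FP_ROUND to the destination type (f16 here).
  Wide.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  return Wide;
}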
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 9e5867c..51ee3cc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -125,9 +125,9 @@ static cl::opt<int> MaxReorderWindow( cl::desc("Number of instructions to allow ahead of the critical path " "in sched=list-ilp")); -static cl::opt<unsigned> AvgIPC( - "sched-avg-ipc", cl::Hidden, cl::init(1), - cl::desc("Average inst/cycle whan no target itinerary exists.")); +static cl::opt<unsigned> + AvgIPC("sched-avg-ipc", cl::Hidden, cl::init(1), + cl::desc("Average inst/cycle when no target itinerary exists.")); namespace { diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 26fc75c..dff7243 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -43,9 +43,9 @@ STATISTIC(LoadsClustered, "Number of loads clustered together"); // without a target itinerary. The choice of number here has more to do with // balancing scheduler heuristics than with the actual machine latency. static cl::opt<int> HighLatencyCycles( - "sched-high-latency-cycles", cl::Hidden, cl::init(10), - cl::desc("Roughly estimate the number of cycles that 'long latency'" - "instructions take for targets with no itinerary")); + "sched-high-latency-cycles", cl::Hidden, cl::init(10), + cl::desc("Roughly estimate the number of cycles that 'long latency' " + "instructions take for targets with no itinerary")); ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf) : ScheduleDAG(mf), InstrItins(mf.getSubtarget().getInstrItineraryData()) {} diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index 6a9ebb4..d47eb44 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -1576,12 +1576,22 @@ void Platform::lookupInitSymbolsAsync( } } +MaterializationTask::~MaterializationTask() { + // If this task wasn't run then fail materialization. + if (MR) + MR->failMaterialization(); +} + void MaterializationTask::printDescription(raw_ostream &OS) { OS << "Materialization task: " << MU->getName() << " in " << MR->getTargetJITDylib().getName(); } -void MaterializationTask::run() { MU->materialize(std::move(MR)); } +void MaterializationTask::run() { + assert(MU && "MU should not be null"); + assert(MR && "MR should not be null"); + MU->materialize(std::move(MR)); +} void LookupTask::printDescription(raw_ostream &OS) { OS << "Lookup task"; } @@ -1821,17 +1831,10 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder, RegisterDependenciesFunction RegisterDependencies) { #if LLVM_ENABLE_THREADS // In the threaded case we use promises to return the results. - std::promise<SymbolMap> PromisedResult; - Error ResolutionError = Error::success(); + std::promise<MSVCPExpected<SymbolMap>> PromisedResult; auto NotifyComplete = [&](Expected<SymbolMap> R) { - if (R) - PromisedResult.set_value(std::move(*R)); - else { - ErrorAsOutParameter _(ResolutionError); - ResolutionError = R.takeError(); - PromisedResult.set_value(SymbolMap()); - } + PromisedResult.set_value(std::move(R)); }; #else @@ -1848,18 +1851,11 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder, #endif // Perform the asynchronous lookup. 
- lookup(K, SearchOrder, std::move(Symbols), RequiredState, NotifyComplete, - RegisterDependencies); + lookup(K, SearchOrder, std::move(Symbols), RequiredState, + std::move(NotifyComplete), RegisterDependencies); #if LLVM_ENABLE_THREADS - auto ResultFuture = PromisedResult.get_future(); - auto Result = ResultFuture.get(); - - if (ResolutionError) - return std::move(ResolutionError); - - return std::move(Result); - + return PromisedResult.get_future().get(); #else if (ResolutionError) return std::move(ResolutionError); diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 6688b09..9bc0aa8 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -16,8 +16,6 @@ namespace llvm::orc { char ObjectLinkingLayer::ID; -using BaseObjectLayer = RTTIExtends<ObjectLinkingLayer, ObjectLayer>; - void ObjectLinkingLayer::emit(std::unique_ptr<MaterializationResponsibility> R, std::unique_ptr<MemoryBuffer> O) { assert(O && "Object must not be null"); diff --git a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp index fbe4b09..1af17e8 100644 --- a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp @@ -31,6 +31,10 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) { { std::lock_guard<std::mutex> Lock(DispatchMutex); + // Reject new tasks if they're dispatched after a call to shutdown. + if (Shutdown) + return; + if (IsMaterializationTask) { // If this is a materialization task and there are too many running @@ -54,6 +58,14 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) { // Run the task. T->run(); + // Reset the task to free any resources. We need this to happen *before* + // we notify anyone (via Outstanding) that this thread is done to ensure + // that we don't proceed with JIT shutdown while still holding resources. + // (E.g. this was causing "Dangling SymbolStringPtr" assertions). + T.reset(); + + // Check the work queue state and either proceed with the next task or + // end this thread. std::lock_guard<std::mutex> Lock(DispatchMutex); if (!MaterializationTaskQueue.empty()) { // If there are any materialization tasks running then steal that work. @@ -64,7 +76,6 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) { IsMaterializationTask = true; } } else { - // Otherwise decrement work counters. if (IsMaterializationTask) --NumMaterializationThreads; --Outstanding; @@ -78,7 +89,7 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) { void DynamicThreadPoolTaskDispatcher::shutdown() { std::unique_lock<std::mutex> Lock(DispatchMutex); - Running = false; + Shutdown = true; OutstandingCV.wait(Lock, [this]() { return Outstanding == 0; }); } #endif diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp index 93bc663..d4eb6a9b 100644 --- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp @@ -116,6 +116,11 @@ uint64_t MachOLayoutBuilder::layoutSegments() { const bool IsObjectFile = O.Header.FileType == MachO::HeaderFileType::MH_OBJECT; uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0; + if (O.EncryptionInfoCommandIndex) { + // If we are emitting an encryptable binary, our load commands must have a + // separate (non-encrypted) page to themselves. 
+ Offset = alignToPowerOf2(HeaderSize + O.Header.SizeOfCmds, PageSize); + } for (LoadCommand &LC : O.LoadCommands) { auto &MLC = LC.MachOLoadCommand; StringRef Segname; diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp index 8d2c02d..e0819d8 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp @@ -98,6 +98,10 @@ void Object::updateLoadCommandIndexes() { case MachO::LC_DYLD_EXPORTS_TRIE: ExportsTrieCommandIndex = Index; break; + case MachO::LC_ENCRYPTION_INFO: + case MachO::LC_ENCRYPTION_INFO_64: + EncryptionInfoCommandIndex = Index; + break; } } } diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h index a454c4f..79eb0133 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.h +++ b/llvm/lib/ObjCopy/MachO/MachOObject.h @@ -341,6 +341,9 @@ struct Object { /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command /// corresponding to the __TEXT segment. std::optional<size_t> TextSegmentCommandIndex; + /// The index of the LC_ENCRYPTION_INFO or LC_ENCRYPTION_INFO_64 load command + /// if present. + std::optional<size_t> EncryptionInfoCommandIndex; BumpPtrAllocator Alloc; StringSaver NewSectionsContents; diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp index 2b344f3..ef0e026 100644 --- a/llvm/lib/ObjCopy/MachO/MachOReader.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp @@ -184,6 +184,10 @@ Error MachOReader::readLoadCommands(Object &O) const { case MachO::LC_DYLD_CHAINED_FIXUPS: O.ChainedFixupsCommandIndex = O.LoadCommands.size(); break; + case MachO::LC_ENCRYPTION_INFO: + case MachO::LC_ENCRYPTION_INFO_64: + O.EncryptionInfoCommandIndex = O.LoadCommands.size(); + break; } #define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ case MachO::LCName: \ diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index d737ea5..4ec0fb8 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -189,9 +189,9 @@ static cl::opt<bool> EnableGlobalAnalyses( "enable-global-analyses", cl::init(true), cl::Hidden, cl::desc("Enable inter-procedural analyses")); -static cl::opt<bool> - RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden, - cl::desc("Run Partial inlinining pass")); +static cl::opt<bool> RunPartialInlining("enable-partial-inlining", + cl::init(false), cl::Hidden, + cl::desc("Run Partial inlining pass")); static cl::opt<bool> ExtraVectorizerPasses( "extra-vectorizer-passes", cl::init(false), cl::Hidden, @@ -264,7 +264,7 @@ static cl::opt<bool> static cl::opt<bool> FlattenedProfileUsed( "flattened-profile-used", cl::init(false), cl::Hidden, cl::desc("Indicate the sample profile being used is flattened, i.e., " - "no inline hierachy exists in the profile")); + "no inline hierarchy exists in the profile")); static cl::opt<bool> EnableOrderFileInstrumentation( "enable-order-file-instrumentation", cl::init(false), cl::Hidden, diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 9f0b092..13e192f 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -156,7 +156,7 @@ MODULE_PASS("strip-nonlinetable-debuginfo", StripNonLineTableDebugInfoPass()) MODULE_PASS("trigger-crash-module", TriggerCrashModulePass()) MODULE_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) MODULE_PASS("tsan-module", ModuleThreadSanitizerPass()) -MODULE_PASS("tysan-module", 
ModuleTypeSanitizerPass()) +MODULE_PASS("tysan", TypeSanitizerPass()) MODULE_PASS("verify", VerifierPass()) MODULE_PASS("view-callgraph", CallGraphViewerPass()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass()) @@ -481,7 +481,6 @@ FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) FUNCTION_PASS("trigger-crash-function", TriggerCrashFunctionPass()) FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) FUNCTION_PASS("tsan", ThreadSanitizerPass()) -FUNCTION_PASS("tysan", TypeSanitizerPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index a4539fa..e6a789c 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -135,6 +135,31 @@ Counter CounterExpressionBuilder::subtract(Counter LHS, Counter RHS, return Simplify ? simplify(Cnt) : Cnt; } +Counter CounterExpressionBuilder::subst(Counter C, const SubstMap &Map) { + // Replace C with the value found in Map even if C is Expression. + if (auto I = Map.find(C); I != Map.end()) + return I->second; + + if (!C.isExpression()) + return C; + + auto CE = Expressions[C.getExpressionID()]; + auto NewLHS = subst(CE.LHS, Map); + auto NewRHS = subst(CE.RHS, Map); + + // Reconstruct Expression with induced subexpressions. + switch (CE.Kind) { + case CounterExpression::Add: + C = add(NewLHS, NewRHS); + break; + case CounterExpression::Subtract: + C = subtract(NewLHS, NewRHS); + break; + } + + return C; +} + void CounterMappingContext::dump(const Counter &C, raw_ostream &OS) const { switch (C.getKind()) { case Counter::Zero: diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td index fc7a94a..e0f56fd 100644 --- a/llvm/lib/Target/AArch64/AArch64FMV.td +++ b/llvm/lib/Target/AArch64/AArch64FMV.td @@ -22,64 +22,65 @@ // Something you can add to target_version or target_clones. -class FMVExtension<string n, string b, int p> { +class FMVExtension<string name, string enumeration> { // Name, as spelled in target_version or target_clones. e.g. "memtag". - string Name = n; + string Name = name; // A C++ expression giving the number of the bit in the FMV ABI. // Currently this is given as a value from the enum "CPUFeatures". - string Bit = b; + string FeatureBit = "FEAT_" # enumeration; // SubtargetFeature enabled for codegen when this FMV feature is present. - string BackendFeature = n; + string BackendFeature = name; - // The FMV priority. - int Priority = p; + // A C++ expression giving the number of the priority bit. + // Currently this is given as a value from the enum "FeatPriorities". 
+ string PriorityBit = "PRIOR_" # enumeration; } -def : FMVExtension<"aes", "FEAT_PMULL", 150>; -def : FMVExtension<"bf16", "FEAT_BF16", 280>; -def : FMVExtension<"bti", "FEAT_BTI", 510>; -def : FMVExtension<"crc", "FEAT_CRC", 110>; -def : FMVExtension<"dit", "FEAT_DIT", 180>; -def : FMVExtension<"dotprod", "FEAT_DOTPROD", 104>; -let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "FEAT_DPB", 190>; -let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "FEAT_DPB2", 200>; -def : FMVExtension<"f32mm", "FEAT_SVE_F32MM", 350>; -def : FMVExtension<"f64mm", "FEAT_SVE_F64MM", 360>; -def : FMVExtension<"fcma", "FEAT_FCMA", 220>; -def : FMVExtension<"flagm", "FEAT_FLAGM", 20>; -let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FEAT_FLAGM2", 30>; -def : FMVExtension<"fp", "FEAT_FP", 90>; -def : FMVExtension<"fp16", "FEAT_FP16", 170>; -def : FMVExtension<"fp16fml", "FEAT_FP16FML", 175>; -let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FEAT_FRINTTS", 250>; -def : FMVExtension<"i8mm", "FEAT_I8MM", 270>; -def : FMVExtension<"jscvt", "FEAT_JSCVT", 210>; -def : FMVExtension<"ls64", "FEAT_LS64_ACCDATA", 520>; -def : FMVExtension<"lse", "FEAT_LSE", 80>; -def : FMVExtension<"memtag", "FEAT_MEMTAG2", 440>; -def : FMVExtension<"mops", "FEAT_MOPS", 650>; -def : FMVExtension<"predres", "FEAT_PREDRES", 480>; -def : FMVExtension<"rcpc", "FEAT_RCPC", 230>; -let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "FEAT_RCPC2", 240>; -def : FMVExtension<"rcpc3", "FEAT_RCPC3", 241>; -def : FMVExtension<"rdm", "FEAT_RDM", 108>; -def : FMVExtension<"rng", "FEAT_RNG", 10>; -def : FMVExtension<"sb", "FEAT_SB", 470>; -def : FMVExtension<"sha2", "FEAT_SHA2", 130>; -def : FMVExtension<"sha3", "FEAT_SHA3", 140>; -def : FMVExtension<"simd", "FEAT_SIMD", 100>; -def : FMVExtension<"sm4", "FEAT_SM4", 106>; -def : FMVExtension<"sme", "FEAT_SME", 430>; -def : FMVExtension<"sme-f64f64", "FEAT_SME_F64", 560>; -def : FMVExtension<"sme-i16i64", "FEAT_SME_I64", 570>; -def : FMVExtension<"sme2", "FEAT_SME2", 580>; -def : FMVExtension<"ssbs", "FEAT_SSBS2", 490>; -def : FMVExtension<"sve", "FEAT_SVE", 310>; -def : FMVExtension<"sve2", "FEAT_SVE2", 370>; -def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", 380>; -def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", 400>; -def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", 410>; -def : FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", 420>; -def : FMVExtension<"wfxt", "FEAT_WFXT", 550>; +def : FMVExtension<"aes", "PMULL">; +def : FMVExtension<"bf16", "BF16">; +def : FMVExtension<"bti", "BTI">; +def : FMVExtension<"crc", "CRC">; +def : FMVExtension<"dit", "DIT">; +def : FMVExtension<"dotprod", "DOTPROD">; +let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "DPB">; +let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "DPB2">; +def : FMVExtension<"f32mm", "SVE_F32MM">; +def : FMVExtension<"f64mm", "SVE_F64MM">; +def : FMVExtension<"fcma", "FCMA">; +def : FMVExtension<"flagm", "FLAGM">; +let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FLAGM2">; +def : FMVExtension<"fp", "FP">; +def : FMVExtension<"fp16", "FP16">; +def : FMVExtension<"fp16fml", "FP16FML">; +let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FRINTTS">; +def : FMVExtension<"i8mm", "I8MM">; +def : FMVExtension<"jscvt", "JSCVT">; +def : FMVExtension<"ls64", "LS64_ACCDATA">; +def : FMVExtension<"lse", "LSE">; +def : FMVExtension<"memtag", "MEMTAG2">; +def : FMVExtension<"mops", "MOPS">; +def : FMVExtension<"predres", "PREDRES">; +def : 
FMVExtension<"rcpc", "RCPC">; +let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "RCPC2">; +def : FMVExtension<"rcpc3", "RCPC3">; +def : FMVExtension<"rdm", "RDM">; +def : FMVExtension<"rng", "RNG">; +def : FMVExtension<"sb", "SB">; +def : FMVExtension<"sha2", "SHA2">; +def : FMVExtension<"sha3", "SHA3">; +def : FMVExtension<"simd", "SIMD">; +def : FMVExtension<"sm4", "SM4">; +def : FMVExtension<"sme", "SME">; +def : FMVExtension<"sme-f64f64", "SME_F64">; +def : FMVExtension<"sme-i16i64", "SME_I64">; +def : FMVExtension<"sme2", "SME2">; +def : FMVExtension<"ssbs", "SSBS2">; +def : FMVExtension<"sve", "SVE">; +def : FMVExtension<"sve2", "SVE2">; +def : FMVExtension<"sve2-aes", "SVE_PMULL128">; +def : FMVExtension<"sve2-bitperm", "SVE_BITPERM">; +def : FMVExtension<"sve2-sha3", "SVE_SHA3">; +def : FMVExtension<"sve2-sm4", "SVE_SM4">; +def : FMVExtension<"wfxt", "WFXT">; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ef00b09..3ad2905 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -753,6 +753,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::v8bf16, Expand); } + // For bf16, fpextend is custom lowered to be optionally expanded into shifts. + setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom); + auto LegalizeNarrowFP = [this](MVT ScalarVT) { for (auto Op : { ISD::SETCC, @@ -893,10 +901,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::f16, Legal); } - // Strict conversion to a larger type is legal - for (auto VT : {MVT::f32, MVT::f64}) - setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); - setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); @@ -1183,6 +1187,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMaxDivRemBitWidthSupported(128); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget->hasSME()) + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); if (Subtarget->isNeonAvailable()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to @@ -4496,6 +4502,54 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPExtendToSVE(Op, DAG); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); + EVT Op0VT = Op0.getValueType(); + if (VT == MVT::f64) { + // FP16->FP32 extends are legal for v32 and v4f32. + if (Op0VT == MVT::f32 || Op0VT == MVT::f16) + return Op; + // Split bf16->f64 extends into two fpextends. 
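    // Illustrative sketch (an assumption based on the code below): there is no
    // single-step bf16 -> f64 conversion here, so the extend is emitted as two
    // nodes going through f32, roughly equivalent to the IR
    //   %t = fpext bfloat %x to float
    //   %r = fpext float %t to double
    // with the strict variant threading the chain through both FP_EXTEND nodes.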
+ if (Op0VT == MVT::bf16 && IsStrict) { + SDValue Ext1 = + DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other}, + {Op0, Op.getOperand(0)}); + return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other}, + {Ext1, Ext1.getValue(1)}); + } + if (Op0VT == MVT::bf16) + return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0)); + return SDValue(); + } + + if (VT.getScalarType() == MVT::f32) { + // FP16->FP32 extends are legal for v32 and v4f32. + if (Op0VT.getScalarType() == MVT::f16) + return Op; + if (Op0VT.getScalarType() == MVT::bf16) { + SDLoc DL(Op); + EVT IVT = VT.changeTypeToInteger(); + if (!Op0VT.isVector()) { + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0); + IVT = MVT::v4i32; + } + + EVT Op0IVT = Op0.getValueType().changeTypeToInteger(); + SDValue Ext = + DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0)); + SDValue Shift = + DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT)); + if (!Op0VT.isVector()) + Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift, + DAG.getConstant(0, DL, MVT::i64)); + Shift = DAG.getBitcast(VT, Shift); + return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL) + : Shift; + } + return SDValue(); + } + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); return SDValue(); } @@ -7343,6 +7397,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); @@ -27429,6 +27484,15 @@ void AArch64TargetLowering::ReplaceNodeResults( Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); return; } + case Intrinsic::aarch64_sme_in_streaming_mode: { + SDLoc DL(N); + SDValue Chain = DAG.getEntryNode(); + SDValue RuntimePStateSM = + getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0)); + Results.push_back( + DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM)); + return; + } case Intrinsic::experimental_vector_match: case Intrinsic::get_active_lane_mask: { if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1) diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 47c4c6c..f527f7e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1804,7 +1804,9 @@ class TMSystemException<bits<3> op1, string asm, list<dag> pattern> } class APASI : SimpleSystemI<0, (ins GPR64:$Xt), "apas", "\t$Xt">, Sched<[]> { + bits<5> Xt; let Inst{20-5} = 0b0111001110000000; + let Inst{4-0} = Xt; let DecoderNamespace = "APAS"; } @@ -2768,6 +2770,8 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode> let Inst{23-21} = opc; let Inst{20-16} = Rm; let Inst{15} = 0; + let Inst{14-10} = 0b11111; + let Unpredictable{14-10} = 0b11111; let Inst{9-5} = Rn; let Inst{4-0} = Rd; @@ -4920,6 +4924,8 @@ class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0, bits<5> Rt; bits<5> Rt2; bits<5> Rn; + let Inst{20-16} = 0b11111; + let Unpredictable{20-16} = 0b11111; let Inst{14-10} = Rt2; let Inst{9-5} = Rn; let Inst{4-0} = Rt; @@ -4935,6 +4941,7 @@ class BaseLoadStoreExclusiveLSUI<bits<2> sz, bit L, bit o0, let Inst{31-30} = sz; let Inst{29-23} = 0b0010010; let Inst{22} = L; + let Inst{21} = 0b0; let Inst{15} = o0; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td 
index ec891ea4..c6f5cdc 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5123,22 +5123,6 @@ let Predicates = [HasFullFP16] in { //===----------------------------------------------------------------------===// defm FCVT : FPConversion<"fcvt">; -// Helper to get bf16 into fp32. -def cvt_bf16_to_fp32 : - OutPatFrag<(ops node:$Rn), - (f32 (COPY_TO_REGCLASS - (i32 (UBFMWri - (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - node:$Rn, hsub), GPR32)), - (i64 (i32shift_a (i64 16))), - (i64 (i32shift_b (i64 16))))), - FPR32))>; -// Pattern for bf16 -> fp32. -def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))), - (cvt_bf16_to_fp32 FPR16:$Rn)>; -// Pattern for bf16 -> fp64. -def : Pat<(f64 (any_fpextend (bf16 FPR16:$Rn))), - (FCVTDSr (f32 (cvt_bf16_to_fp32 FPR16:$Rn)))>; //===----------------------------------------------------------------------===// // Floating point single operand instructions. @@ -8333,8 +8317,6 @@ def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))> def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -// Vector bf16 -> fp32 is implemented morally as a zext + shift. -def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))), (SHLLv4i16 V64:$Rn)>; // Also match an extend from the upper half of a 128 bit source register. def : Pat<(v8i16 (anyext (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn)) ))), (USHLLv16i8_shift V128:$Rn, (i32 0))>; diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index c76fc8a..355a9d2 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -630,7 +630,7 @@ def ExactFPImmValues : GenericEnum { def ExactFPImmsList : GenericTable { let FilterClass = "ExactFPImm"; - let Fields = ["Name", "Enum", "Repr"]; + let Fields = ["Enum", "Repr"]; } def lookupExactFPImmByEnum : SearchIndex { @@ -638,11 +638,6 @@ def lookupExactFPImmByEnum : SearchIndex { let Key = ["Enum"]; } -def lookupExactFPImmByRepr : SearchIndex { - let Table = ExactFPImmsList; - let Key = ["Repr"]; -} - def : ExactFPImm<"zero", "0.0", 0x0>; def : ExactFPImm<"half", "0.5", 0x1>; def : ExactFPImm<"one", "1.0", 0x2>; @@ -998,7 +993,6 @@ defm : TLBI<"VMALLWS2E1OS", 0b100, 0b1000, 0b0101, 0b010, 0>; class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> { string Name = name; - string AltName = name; bits<16> Encoding; let Encoding{15-14} = op0; let Encoding{13-11} = op1; @@ -1018,8 +1012,11 @@ def SysRegValues : GenericEnum { def SysRegsList : GenericTable { let FilterClass = "SysReg"; - let Fields = ["Name", "AltName", "Encoding", "Readable", "Writeable", - "Requires"]; + let Fields = ["Name", "Encoding", "Readable", "Writeable", "Requires"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupSysRegByEncoding"; + let PrimaryKeyReturnRange = true; } def lookupSysRegByName : SearchIndex { @@ -1027,11 +1024,6 @@ def lookupSysRegByName : SearchIndex { let Key = ["Name"]; } -def lookupSysRegByEncoding : SearchIndex { - let Table = SysRegsList; - let Key = ["Encoding"]; -} - class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> : SysReg<name, op0, op1, crn, crm, op2> { @@ -1317,9 +1309,7 @@ def : RWSysReg<"TTBR0_EL1", 0b11, 
0b000, 0b0010, 0b0000, 0b000>; def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>; let Requires = [{ {AArch64::FeatureEL2VMSA} }] in { -def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000> { - let AltName = "VSCTLR_EL2"; -} +def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>; def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>; } @@ -1706,9 +1696,7 @@ def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>; let Requires = [{ {AArch64::HasV8_0rOps} }] in { //Virtualization System Control Register // Op0 Op1 CRn CRm Op2 -def : RWSysReg<"VSCTLR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000> { - let AltName = "TTBR0_EL2"; -} +def : RWSysReg<"VSCTLR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>; //MPU Type Register // Op0 Op1 CRn CRm Op2 @@ -2376,7 +2364,6 @@ def : RWSysReg<"ACTLRALIAS_EL1", 0b11, 0b000, 0b0001, 0b0100, 0b101>; class PHint<bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, string name> { string Name = name; - string AltName = name; bits<16> Encoding; let Encoding{15-14} = op0; let Encoding{13-11} = op1; @@ -2394,7 +2381,7 @@ def PHintValues : GenericEnum { def PHintsList : GenericTable { let FilterClass = "PHint"; - let Fields = ["Name", "AltName", "Encoding", "Requires"]; + let Fields = ["Name", "Encoding", "Requires"]; } def lookupPHintByName : SearchIndex { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 5abe69e..25b6731 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2761,6 +2761,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, return AdjustCost( BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + static const TypeConversionCostTblEntry BF16Tbl[] = { + {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt + {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt + {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn + {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2 + {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn + {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn + {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn + }; + + if (ST->hasBF16()) + if (const auto *Entry = ConvertCostTableLookup( + BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return AdjustCost(Entry->Cost); + static const TypeConversionCostTblEntry ConversionTbl[] = { {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn @@ -2848,6 +2863,14 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl + // BF16 (uses shift) + {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl + {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt + {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll + {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2 + {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl + {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2 + {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2 // FP Ext and trunc {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn @@ -2860,6 +2883,15 @@ InstructionCost 
AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn + // BF16 (more complex; the case with +bf16 is handled above) + {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns + {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above + {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8}, + {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8}, + {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15}, + {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9}, + {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10}, + {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19}, // LowerVectorINT_TO_FP: {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, @@ -4706,10 +4738,21 @@ InstructionCost AArch64TTIImpl::getShuffleCost( } Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); - // Treat extractsubvector as single op permutation. bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; - if (IsExtractSubvector && LT.second.isFixedLengthVector()) + // A subvector extract can be implemented with an ext (or trivial extract, if + // from lane 0). This currently only handles low or high extracts to prevent + // SLP vectorizer regressions. + if (IsExtractSubvector && LT.second.isFixedLengthVector()) { + if (LT.second.is128BitVector() && + cast<FixedVectorType>(SubTp)->getNumElements() == + LT.second.getVectorNumElements() / 2) { + if (Index == 0) + return 0; + if (Index == (int)LT.second.getVectorNumElements() / 2) + return 1; + } Kind = TTI::SK_PermuteSingleSrc; + } // Check for broadcast loads, which are supported by the LD1R instruction. // In terms of code-size, the shuffle vector is free when a load + dup get diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index ae84bc9..875b505 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1874,26 +1874,25 @@ void AArch64InstPrinter::printBarriernXSOption(const MCInst *MI, unsigned OpNo, markup(O, Markup::Immediate) << "#" << Val; } -static bool isValidSysReg(const AArch64SysReg::SysReg *Reg, bool Read, +static bool isValidSysReg(const AArch64SysReg::SysReg &Reg, bool Read, const MCSubtargetInfo &STI) { - return (Reg && (Read ? Reg->Readable : Reg->Writeable) && - Reg->haveFeatures(STI.getFeatureBits())); + return (Read ? Reg.Readable : Reg.Writeable) && + Reg.haveFeatures(STI.getFeatureBits()); } -// Looks up a system register either by encoding or by name. Some system +// Looks up a system register by encoding. Some system // registers share the same encoding between different architectures, -// therefore a tablegen lookup by encoding will return an entry regardless -// of the register's predication on a specific subtarget feature. To work -// around this problem we keep an alternative name for such registers and -// look them up by that name if the first lookup was unsuccessful. +// so a tablegen lookup by encoding returns a range of registers with the same +// encoding. We need to check each register in the range to see if it is valid. 
static const AArch64SysReg::SysReg *lookupSysReg(unsigned Val, bool Read, const MCSubtargetInfo &STI) { - const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); - - if (Reg && !isValidSysReg(Reg, Read, STI)) - Reg = AArch64SysReg::lookupSysRegByName(Reg->AltName); + auto Range = AArch64SysReg::lookupSysRegByEncoding(Val); + for (auto &Reg : Range) { + if (isValidSysReg(Reg, Read, STI)) + return &Reg; + } - return Reg; + return nullptr; } void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, @@ -1917,7 +1916,7 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, const AArch64SysReg::SysReg *Reg = lookupSysReg(Val, true /*Read*/, STI); - if (isValidSysReg(Reg, true /*Read*/, STI)) + if (Reg) O << Reg->Name; else O << AArch64SysReg::genericRegisterString(Val); @@ -1944,7 +1943,7 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, const AArch64SysReg::SysReg *Reg = lookupSysReg(Val, false /*Read*/, STI); - if (isValidSysReg(Reg, false /*Read*/, STI)) + if (Reg) O << Reg->Name; else O << AArch64SysReg::genericRegisterString(Val); diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 94bba4e..b8d3236 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -564,11 +564,10 @@ LLVM_DECLARE_ENUM_AS_BITMASK(TailFoldingOpts, /* LargestValue */ (long)TailFoldingOpts::Reverse); namespace AArch64ExactFPImm { - struct ExactFPImm { - const char *Name; - int Enum; - const char *Repr; - }; +struct ExactFPImm { + int Enum; + const char *Repr; +}; #define GET_ExactFPImmValues_DECL #define GET_ExactFPImmsList_DECL #include "AArch64GenSystemOperands.inc" @@ -602,7 +601,6 @@ namespace AArch64PSBHint { namespace AArch64PHint { struct PHint { const char *Name; - const char *AltName; unsigned Encoding; FeatureBitset FeaturesRequired; @@ -720,7 +718,6 @@ AArch64StringToVectorLayout(StringRef LayoutStr) { namespace AArch64SysReg { struct SysReg { const char Name[32]; - const char AltName[32]; unsigned Encoding; bool Readable; bool Writeable; @@ -736,9 +733,6 @@ namespace AArch64SysReg { #define GET_SysRegValues_DECL #include "AArch64GenSystemOperands.inc" - const SysReg *lookupSysRegByName(StringRef); - const SysReg *lookupSysRegByEncoding(uint16_t); - uint32_t parseGenericRegister(StringRef Name); std::string genericRegisterString(uint32_t Bits); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index e844904..0f97988 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1523,7 +1523,8 @@ Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder, bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv; bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem; - int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned); + unsigned BitWidth = Num->getType()->getScalarSizeInBits(); + int NumDivBits = getDivNumBits(I, Num, Den, BitWidth - 32, IsSigned); if (NumDivBits == -1) return nullptr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index d9eaf82..27e9018 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1997,7 +1997,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if 
(checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 88205ea..f2686bd 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -680,7 +680,7 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> { class MUBUF_Pseudo_Store_Lds<string opName> : MUBUF_Pseudo<opName, (outs), - (ins SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset, CPol:$cpol, i1imm:$swz), + (ins SReg_128_XNULL:$srsrc, SCSrc_b32:$soffset, Offset:$offset, CPol:$cpol, i1imm:$swz), " $srsrc, $soffset$offset lds$cpol"> { let LGKM_CNT = 1; let mayLoad = 1; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 3c7627f..1b94d6c 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1524,7 +1524,7 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> { class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC> : MIMG_gfx10<op.GFX10M, (outs VReg_128:$vdata), "GFX10"> { - let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16); + let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16); let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16"; let nsa = 0; @@ -1532,13 +1532,13 @@ class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC> class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs> : MIMG_nsa_gfx10<op.GFX10M, (outs VReg_128:$vdata), num_addrs, "GFX10"> { - let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16)); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16)); let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16"; } class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC> : MIMG_gfx11<op.GFX11, (outs VReg_128:$vdata), "GFX11"> { - let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16); + let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16); let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16"; let nsa = 0; @@ -1548,7 +1548,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, list<RegisterClass> addr_types> : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11", addr_types> { - let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16)); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16)); let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16"; } @@ -1556,7 +1556,7 @@ class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs, list<RegisterClass> addr_types> : VIMAGE_gfx12<op.GFX12, (outs VReg_128:$vdata), num_addrs, "GFX12", addr_types> { - let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$rsrc, A16:$a16)); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc, A16:$a16)); let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16"; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c2199fd..2bc1913 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1096,21 +1096,8 @@ void SIFoldOperandsImpl::foldOperand( 
B.addImm(Defs[I].second); } LLVM_DEBUG(dbgs() << "Folded " << *UseMI); - return; } - if (Size != 4) - return; - - Register Reg0 = UseMI->getOperand(0).getReg(); - Register Reg1 = UseMI->getOperand(1).getReg(); - if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64)); - else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64)); - else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) && - TRI->isAGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32)); return; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b3cfa39..0ac84f4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13985,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, return Accum; } +SDValue +SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue RHS = N->getOperand(1); + auto *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (!CRHS) + return SDValue(); + + // TODO: Worth using computeKnownBits? Maybe expensive since it's so + // common. + uint64_t Val = CRHS->getZExtValue(); + if (countr_zero(Val) >= 32) { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + + // Avoid carry machinery if we know the low half of the add does not + // contribute to the final result. + // + // add i64:x, K if computeTrailingZeros(K) >= 32 + // => build_pair (add x.hi, K.hi), x.lo + + // Breaking the 64-bit add here with this strange constant is unlikely + // to interfere with addressing mode patterns. + + SDValue Hi = getHiHalf64(LHS, DAG); + SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + SDValue AddHi = + DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + + SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); + } + + return SDValue(); +} + // Collect the ultimate src of each of the mul node's operands, and confirm // each operand is 8 bytes. 
static std::optional<ByteProvider<SDValue>> @@ -14261,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return V; } + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { SDValue TempNode(N, 0); @@ -14446,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + if (VT != MVT::i32) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index f4641e7..299c8f5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -212,6 +212,9 @@ private: unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 692e286..e6f333f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6890,9 +6890,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); if (RsrcIdx != -1) { MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); - if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) { + if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg())) isRsrcLegal = false; - } } // The operands are legal. diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e388efe..ee83dff 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3055,7 +3055,7 @@ def : GCNPat< (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>; // If fcanonicalize's operand is implicitly canonicalized, we only need a copy. 
-let AddedComplexity = 1000 in { +let AddedComplexity = 8 in { foreach vt = [f16, v2f16, f32, v2f32, f64] in { def : GCNPat< (fcanonicalize (vt is_canonicalized:$src)), @@ -3710,12 +3710,15 @@ def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>; def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>; def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; -def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; -def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; -def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; -def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +} + +let True16Predicate = UseFakeTrue16Insts in { +def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +def : FPMinCanonMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinCanonMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; } let SubtargetPredicate = isGFX9Plus in { @@ -3723,6 +3726,10 @@ let True16Predicate = NotHasTrue16BitInsts in { defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, VSrc_b16>; defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, VSrc_b16>; } +let True16Predicate = UseRealTrue16Insts in { + defm : Int16Med3Pat<V_MED3_I16_t16_e64, smin, smax, VSrcT_b16>; + defm : Int16Med3Pat<V_MED3_U16_t16_e64, umin, umax, VSrcT_b16>; +} let True16Predicate = UseFakeTrue16Insts in { defm : Int16Med3Pat<V_MED3_I16_fake16_e64, smin, smax, VSrc_b16>; defm : Int16Med3Pat<V_MED3_U16_fake16_e64, umin, umax, VSrc_b16>; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 60e4ce9..37dcc100 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -341,10 +341,10 @@ let SubtargetPredicate = HasScalarDwordx3Loads in defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_128>; defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_256>; defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_512>; -defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>; } let SubtargetPredicate = HasScalarStores in { @@ -375,7 +375,7 @@ def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb defm S_ATC_PROBE : SM_Pseudo_Probe <SReg_64>; let is_buffer = 1 in { -defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128>; +defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128_XNULL>; } } // SubtargetPredicate = isGFX8Plus @@ -470,7 +470,7 @@ def S_PREFETCH_INST : SM_Prefetch_Pseudo <"s_prefetch_inst", SReg_64, 1>; def S_PREFETCH_INST_PC_REL 
: SM_Prefetch_Pseudo <"s_prefetch_inst_pc_rel", SReg_64, 0>; def S_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_prefetch_data", SReg_64, 1>; def S_PREFETCH_DATA_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_data_pc_rel", SReg_64, 0>; -def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128, 1> { +def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128_XNULL, 1> { let is_buffer = 1; } } // end let SubtargetPredicate = isGFX12Plus diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index cef1f20..24a2eed 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1374,8 +1374,8 @@ class VOP3_DOT_Profile_fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR let SubtargetPredicate = isGFX11Plus in { defm V_MAXMIN_F32 : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; defm V_MINMAX_F32 : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; - defm V_MAXMIN_F16 : VOP3Inst<"v_maxmin_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; - defm V_MINMAX_F16 : VOP3Inst<"v_minmax_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; + defm V_MAXMIN_F16 : VOP3Inst_t16<"v_maxmin_f16", VOP_F16_F16_F16_F16>; + defm V_MINMAX_F16 : VOP3Inst_t16<"v_minmax_f16", VOP_F16_F16_F16_F16>; defm V_MAXMIN_U32 : VOP3Inst<"v_maxmin_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; defm V_MINMAX_U32 : VOP3Inst<"v_minmax_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; defm V_MAXMIN_I32 : VOP3Inst<"v_maxmin_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>; @@ -1588,8 +1588,8 @@ defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">; defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">; defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">; -defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">; -defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">; +defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">; +defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">; defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>; defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>; defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>; @@ -1730,8 +1730,8 @@ defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25b>; defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25c>; defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>; defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>; -defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>; -defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>; +defm V_MAXMIN_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x260, "v_maxmin_f16">; +defm V_MINMAX_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x261, "v_minmax_f16">; defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11_gfx12<0x262>; defm V_MINMAX_U32 : VOP3_Realtriple_gfx11_gfx12<0x263>; defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11_gfx12<0x264>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index d236907..930ed9a 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1909,8 +1909,8 @@ multiclass 
VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName multiclass VOP3_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> { - defm _t16: VOP3_Realtriple_t16_gfx11<op, opName#"_t16", asmName, pseudo_mnemonic, isSingle>; - defm _fake16: VOP3_Realtriple_t16_gfx11<op, opName#"_fake16", asmName, pseudo_mnemonic, isSingle>; + defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>; + defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>; } multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName, diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index c67177c..009b60c 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -3320,7 +3320,7 @@ def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), } - +let mayStore = 1, hasSideEffects = 0 in { def STRH_PRE : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre, StMiscFrm, IIC_iStore_bh_ru, @@ -3352,6 +3352,7 @@ def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb), let Inst{3-0} = offset{3-0}; // imm3_0/Rm let DecoderMethod = "DecodeAddrMode3Instruction"; } +} // mayStore = 1, hasSideEffects = 0 let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb), diff --git a/llvm/lib/Target/ARM/ARMSystemRegister.td b/llvm/lib/Target/ARM/ARMSystemRegister.td index c03db15..3afc410 100644 --- a/llvm/lib/Target/ARM/ARMSystemRegister.td +++ b/llvm/lib/Target/ARM/ARMSystemRegister.td @@ -19,17 +19,13 @@ class MClassSysReg<bits<1> UniqMask1, bits<1> UniqMask2, bits<1> UniqMask3, bits<12> Enc12, - string name> : SearchableTable { - let SearchableFields = ["Name", "M1Encoding12", "M2M3Encoding8", "Encoding"]; + string name> { string Name; bits<13> M1Encoding12; bits<10> M2M3Encoding8; bits<12> Encoding; let Name = name; - let EnumValueField = "M1Encoding12"; - let EnumValueField = "M2M3Encoding8"; - let EnumValueField = "Encoding"; let M1Encoding12{12} = UniqMask1; let M1Encoding12{11-00} = Enc12; @@ -41,6 +37,27 @@ class MClassSysReg<bits<1> UniqMask1, code Requires = [{ {} }]; } +def MClassSysRegsList : GenericTable { + let FilterClass = "MClassSysReg"; + let Fields = ["Name", "M1Encoding12", "M2M3Encoding8", "Encoding", + "Requires"]; +} + +def lookupMClassSysRegByName : SearchIndex { + let Table = MClassSysRegsList; + let Key = ["Name"]; +} + +def lookupMClassSysRegByM1Encoding12 : SearchIndex { + let Table = MClassSysRegsList; + let Key = ["M1Encoding12"]; +} + +def lookupMClassSysRegByM2M3Encoding8 : SearchIndex { + let Table = MClassSysRegsList; + let Key = ["M2M3Encoding8"]; +} + // [|i|e|x]apsr_nzcvq has alias [|i|e|x]apsr. 
// Mask1 Mask2 Mask3 Enc12, Name let Requires = [{ {ARM::FeatureDSP} }] in { @@ -127,15 +144,29 @@ def : MClassSysReg<0, 0, 1, 0x8a7, "pac_key_u_3_ns">; // Banked Registers // -class BankedReg<string name, bits<8> enc> - : SearchableTable { +class BankedReg<string name, bits<8> enc> { string Name; bits<8> Encoding; let Name = name; let Encoding = enc; - let SearchableFields = ["Name", "Encoding"]; } +def BankedRegsList : GenericTable { + let FilterClass = "BankedReg"; + let Fields = ["Name", "Encoding"]; +} + +def lookupBankedRegByName : SearchIndex { + let Table = BankedRegsList; + let Key = ["Name"]; +} + +def lookupBankedRegByEncoding : SearchIndex { + let Table = BankedRegsList; + let Key = ["Encoding"]; +} + + // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM // and bit 5 is R. def : BankedReg<"r8_usr", 0x00>; diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp index 494c67d..e76a70b 100644 --- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp +++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp @@ -62,13 +62,13 @@ const MClassSysReg *lookupMClassSysRegBy8bitSYSmValue(unsigned SYSm) { return ARMSysReg::lookupMClassSysRegByM2M3Encoding8((1<<8)|(SYSm & 0xFF)); } -#define GET_MCLASSSYSREG_IMPL +#define GET_MClassSysRegsList_IMPL #include "ARMGenSystemRegister.inc" } // end namespace ARMSysReg namespace ARMBankedReg { -#define GET_BANKEDREG_IMPL +#define GET_BankedRegsList_IMPL #include "ARMGenSystemRegister.inc" } // end namespce ARMSysReg } // end namespace llvm diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h index 5562572..dc4f811 100644 --- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h +++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -206,8 +206,8 @@ namespace ARMSysReg { } }; - #define GET_MCLASSSYSREG_DECL - #include "ARMGenSystemRegister.inc" +#define GET_MClassSysRegsList_DECL +#include "ARMGenSystemRegister.inc" // lookup system register using 12-bit SYSm value. 
// Note: the search is uniqued using M1 mask @@ -228,8 +228,8 @@ namespace ARMBankedReg { const char *Name; uint16_t Encoding; }; - #define GET_BANKEDREG_DECL - #include "ARMGenSystemRegister.inc" +#define GET_BankedRegsList_DECL +#include "ARMGenSystemRegister.inc" } // end namespace ARMBankedReg } // end namespace llvm diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 5d865a3..62b5b70 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -42,8 +42,10 @@ def FloatTy : DXILOpParamType; def DoubleTy : DXILOpParamType; def ResRetHalfTy : DXILOpParamType; def ResRetFloatTy : DXILOpParamType; +def ResRetDoubleTy : DXILOpParamType; def ResRetInt16Ty : DXILOpParamType; def ResRetInt32Ty : DXILOpParamType; +def ResRetInt64Ty : DXILOpParamType; def HandleTy : DXILOpParamType; def ResBindTy : DXILOpParamType; def ResPropsTy : DXILOpParamType; @@ -890,6 +892,23 @@ def SplitDouble : DXILOp<102, splitDouble> { let attributes = [Attributes<DXIL1_0, [ReadNone]>]; } +def RawBufferLoad : DXILOp<139, rawBufferLoad> { + let Doc = "reads from a raw buffer and structured buffer"; + // Handle, Coord0, Coord1, Mask, Alignment + let arguments = [HandleTy, Int32Ty, Int32Ty, Int8Ty, Int32Ty]; + let result = OverloadTy; + let overloads = [ + Overloads<DXIL1_2, + [ResRetHalfTy, ResRetFloatTy, ResRetInt16Ty, ResRetInt32Ty]>, + Overloads<DXIL1_3, + [ + ResRetHalfTy, ResRetFloatTy, ResRetDoubleTy, ResRetInt16Ty, + ResRetInt32Ty, ResRetInt64Ty + ]> + ]; + let stages = [Stages<DXIL1_2, [all_stages]>]; +} + def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { let Doc = "signed dot product of 4 x i8 vectors packed into i32, with " "accumulate to i32"; diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp index 5d5bb3e..9f88ccd 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp @@ -263,10 +263,14 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx, return getResRetType(Type::getHalfTy(Ctx)); case OpParamType::ResRetFloatTy: return getResRetType(Type::getFloatTy(Ctx)); + case OpParamType::ResRetDoubleTy: + return getResRetType(Type::getDoubleTy(Ctx)); case OpParamType::ResRetInt16Ty: return getResRetType(Type::getInt16Ty(Ctx)); case OpParamType::ResRetInt32Ty: return getResRetType(Type::getInt32Ty(Ctx)); + case OpParamType::ResRetInt64Ty: + return getResRetType(Type::getInt64Ty(Ctx)); case OpParamType::HandleTy: return getHandleType(Ctx); case OpParamType::ResBindTy: diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 4e01dd1..f43815b 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -415,8 +415,16 @@ public: } } - OldResult = cast<Instruction>( - IRB.CreateExtractValue(Op, 0, OldResult->getName())); + if (OldResult->use_empty()) { + // Only the check bit was used, so we're done here. 
+ OldResult->eraseFromParent(); + return Error::success(); + } + + assert(OldResult->hasOneUse() && + isa<ExtractValueInst>(*OldResult->user_begin()) && + "Expected only use to be extract of first element"); + OldResult = cast<Instruction>(*OldResult->user_begin()); OldTy = ST->getElementType(0); } @@ -534,6 +542,48 @@ public: }); } + [[nodiscard]] bool lowerRawBufferLoad(Function &F) { + Triple TT(Triple(M.getTargetTriple())); + VersionTuple DXILVersion = TT.getDXILVersion(); + const DataLayout &DL = F.getDataLayout(); + IRBuilder<> &IRB = OpBuilder.getIRB(); + Type *Int8Ty = IRB.getInt8Ty(); + Type *Int32Ty = IRB.getInt32Ty(); + + return replaceFunction(F, [&](CallInst *CI) -> Error { + IRB.SetInsertPoint(CI); + + Type *OldTy = cast<StructType>(CI->getType())->getElementType(0); + Type *ScalarTy = OldTy->getScalarType(); + Type *NewRetTy = OpBuilder.getResRetType(ScalarTy); + + Value *Handle = + createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType()); + Value *Index0 = CI->getArgOperand(1); + Value *Index1 = CI->getArgOperand(2); + uint64_t NumElements = + DL.getTypeSizeInBits(OldTy) / DL.getTypeSizeInBits(ScalarTy); + Value *Mask = ConstantInt::get(Int8Ty, ~(~0U << NumElements)); + Value *Align = + ConstantInt::get(Int32Ty, DL.getPrefTypeAlign(ScalarTy).value()); + + Expected<CallInst *> OpCall = + DXILVersion >= VersionTuple(1, 2) + ? OpBuilder.tryCreateOp(OpCode::RawBufferLoad, + {Handle, Index0, Index1, Mask, Align}, + CI->getName(), NewRetTy) + : OpBuilder.tryCreateOp(OpCode::BufferLoad, + {Handle, Index0, Index1}, CI->getName(), + NewRetTy); + if (Error E = OpCall.takeError()) + return E; + if (Error E = replaceResRetUses(CI, *OpCall, /*HasCheckBit=*/true)) + return E; + + return Error::success(); + }); + } + [[nodiscard]] bool lowerUpdateCounter(Function &F) { IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int32Ty = IRB.getInt32Ty(); @@ -723,14 +773,14 @@ public: HasErrors |= lowerGetPointer(F); break; case Intrinsic::dx_resource_load_typedbuffer: - HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/false); - break; - case Intrinsic::dx_resource_loadchecked_typedbuffer: HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true); break; case Intrinsic::dx_resource_store_typedbuffer: HasErrors |= lowerTypedBufferStore(F); break; + case Intrinsic::dx_resource_load_rawbuffer: + HasErrors |= lowerRawBufferLoad(F); + break; case Intrinsic::dx_resource_updatecounter: HasErrors |= lowerUpdateCounter(F); break; diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 1ff8f09..8376249 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -30,6 +30,9 @@ static void replaceTypedBufferAccess(IntrinsicInst *II, "Unexpected typed buffer type"); Type *ContainedType = HandleType->getTypeParameter(0); + Type *LoadType = + StructType::get(ContainedType, Type::getInt1Ty(II->getContext())); + // We need the size of an element in bytes so that we can calculate the offset // in elements given a total offset in bytes later. Type *ScalarType = ContainedType->getScalarType(); @@ -81,13 +84,15 @@ static void replaceTypedBufferAccess(IntrinsicInst *II, // We're storing a scalar, so we need to load the current value and only // replace the relevant part. 
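      // Illustrative IR shape, an assumption for a <4 x float> resource (the
      // exact overloaded intrinsic name may differ): the typed-buffer load now
      // returns a struct carrying a check bit, so a scalar store becomes
      // roughly:
      //   %l = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(...)
      //   %v = extractvalue {<4 x float>, i1} %l, 0
      //   %n = insertelement <4 x float> %v, float %val, i32 %idx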
auto *Load = Builder.CreateIntrinsic( - ContainedType, Intrinsic::dx_resource_load_typedbuffer, + LoadType, Intrinsic::dx_resource_load_typedbuffer, {II->getOperand(0), II->getOperand(1)}); + auto *Struct = Builder.CreateExtractValue(Load, {0}); + // If we have an offset from seeing a GEP earlier, use it. Value *IndexOp = Current.Index ? Current.Index : ConstantInt::get(Builder.getInt32Ty(), 0); - V = Builder.CreateInsertElement(Load, V, IndexOp); + V = Builder.CreateInsertElement(Struct, V, IndexOp); } else { llvm_unreachable("Store to typed resource has invalid type"); } @@ -101,8 +106,10 @@ static void replaceTypedBufferAccess(IntrinsicInst *II, } else if (auto *LI = dyn_cast<LoadInst>(Current.Access)) { IRBuilder<> Builder(LI); Value *V = Builder.CreateIntrinsic( - ContainedType, Intrinsic::dx_resource_load_typedbuffer, + LoadType, Intrinsic::dx_resource_load_typedbuffer, {II->getOperand(0), II->getOperand(1)}); + V = Builder.CreateExtractValue(V, {0}); + if (Current.Index) V = Builder.CreateExtractElement(V, Current.Index); diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index ad079f4..5afe6b2 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -15,14 +15,12 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/Analysis/DXILResource.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" @@ -302,39 +300,6 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD, return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx); } -// TODO: We might need to refactor this to be more generic, -// in case we need more metadata to be replaced. 
-static void translateBranchMetadata(Module &M) { - for (Function &F : M) { - for (BasicBlock &BB : F) { - Instruction *BBTerminatorInst = BB.getTerminator(); - - MDNode *HlslControlFlowMD = - BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); - - if (!HlslControlFlowMD) - continue; - - assert(HlslControlFlowMD->getNumOperands() == 2 && - "invalid operands for hlsl.controlflow.hint"); - - MDBuilder MDHelper(M.getContext()); - ConstantInt *Op1 = - mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1)); - - SmallVector<llvm::Metadata *, 2> Vals( - ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"), - MDHelper.createConstant(Op1)}); - - MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals); - - BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); - BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); - } - F.clearMetadata(); - } -} - static void translateMetadata(Module &M, DXILBindingMap &DBM, DXILResourceTypeMap &DRTM, const Resources &MDResources, @@ -407,7 +372,6 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M, const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M); translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI); - translateBranchMetadata(M); return PreservedAnalyses::all(); } @@ -445,7 +409,6 @@ public: getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI); - translateBranchMetadata(M); return true; } }; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 45aadac..be68d46 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -749,8 +749,8 @@ uint64_t DXILBitcodeWriter::getOptimizationFlags(const Value *V) { if (PEO->isExact()) Flags |= 1 << bitc::PEO_EXACT; } else if (const auto *FPMO = dyn_cast<FPMathOperator>(V)) { - if (FPMO->hasAllowReassoc()) - Flags |= bitc::AllowReassoc; + if (FPMO->hasAllowReassoc() || FPMO->hasAllowContract()) + Flags |= bitc::UnsafeAlgebra; if (FPMO->hasNoNaNs()) Flags |= bitc::NoNaNs; if (FPMO->hasNoInfs()) @@ -759,10 +759,6 @@ uint64_t DXILBitcodeWriter::getOptimizationFlags(const Value *V) { Flags |= bitc::NoSignedZeros; if (FPMO->hasAllowReciprocal()) Flags |= bitc::AllowReciprocal; - if (FPMO->hasAllowContract()) - Flags |= bitc::AllowContract; - if (FPMO->hasApproxFunc()) - Flags |= bitc::ApproxFunc; } return Flags; diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 46a8ab3..991ee5b 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1796,6 +1796,8 @@ bool PolynomialMultiplyRecognize::recognize() { IterCount = CV->getValue()->getZExtValue() + 1; Value *CIV = getCountIV(LoopB); + if (CIV == nullptr) + return false; ParsedValues PV; Simplifier PreSimp; PV.IterCount = IterCount; diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 54aeda2..32bc8bb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -154,6 +154,9 @@ void LoongArchInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register VReg) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); + DebugLoc DL; + if (I != MBB.end()) 
+ DL = I->getDebugLoc(); unsigned Opcode; if (LoongArch::GPRRegClass.hasSubClassEq(RC)) @@ -177,7 +180,7 @@ void LoongArchInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); - BuildMI(MBB, I, DebugLoc(), get(Opcode), DstReg) + BuildMI(MBB, I, DL, get(Opcode), DstReg) .addFrameIndex(FI) .addImm(0) .addMemOperand(MMO); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 65e1893..d34f45f 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -14,7 +14,7 @@ #include "NVPTX.h" #include "NVPTXUtilities.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/NVVMIntrinsicFlags.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp index f940dc0..c03ef8d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" @@ -49,39 +50,34 @@ static std::string getHash(StringRef Str) { return llvm::utohexstr(Hash.low(), /*LowerCase=*/true); } -static void addKernelMetadata(Module &M, GlobalValue *GV) { +static void addKernelMetadata(Module &M, Function *F) { llvm::LLVMContext &Ctx = M.getContext(); // Get "nvvm.annotations" metadata node. llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - llvm::Metadata *KernelMDVals[] = { - llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "kernel"), - llvm::ConstantAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; - // This kernel is only to be called single-threaded. llvm::Metadata *ThreadXMDVals[] = { - llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidx"), + llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"), llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; llvm::Metadata *ThreadYMDVals[] = { - llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidy"), + llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"), llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; llvm::Metadata *ThreadZMDVals[] = { - llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidz"), + llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"), llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; llvm::Metadata *BlockMDVals[] = { - llvm::ConstantAsMetadata::get(GV), + llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxclusterrank"), llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; // Append metadata to nvvm.annotations. 
- MD->addOperand(llvm::MDNode::get(Ctx, KernelMDVals)); + F->setCallingConv(CallingConv::PTX_Kernel); MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals)); MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals)); MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals)); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c51729e..ef97844 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -14,10 +14,11 @@ #include "NVPTXUtilities.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsNVPTX.h" -#include "llvm/IR/NVVMIntrinsicFlags.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -2449,6 +2450,11 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { return true; } +static inline bool isAddLike(const SDValue V) { + return V.getOpcode() == ISD::ADD || + (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint()); +} + // SelectDirectAddr - Match a direct address for DAG. // A direct address could be a globaladdress or externalsymbol. bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { @@ -2475,7 +2481,7 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { // symbol+offset bool NVPTXDAGToDAGISel::SelectADDRsi_imp( SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { - if (Addr.getOpcode() == ISD::ADD) { + if (isAddLike(Addr)) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { SDValue base = Addr.getOperand(0); if (SelectDirectAddr(base, Base)) { @@ -2512,7 +2518,7 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp( Addr.getOpcode() == ISD::TargetGlobalAddress) return false; // direct calls. - if (Addr.getOpcode() == ISD::ADD) { + if (isAddLike(Addr)) { if (SelectDirectAddr(Addr.getOperand(0), Addr)) { return false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 4a98fe2..c9b7e87 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -261,6 +261,9 @@ public: return true; } + bool isFAbsFree(EVT VT) const override { return true; } + bool isFNegFree(EVT VT) const override { return true; } + private: const NVPTXSubtarget &STI; // cache the subtarget here SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 42043ad..74ce6a9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -34,19 +34,18 @@ void NVPTXSubtarget::anchor() {} NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - // Provide the default CPU if we don't have one. - TargetName = std::string(CPU.empty() ? "sm_30" : CPU); + TargetName = std::string(CPU); - ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS); + ParseSubtargetFeatures(getTargetName(), /*TuneCPU=*/getTargetName(), FS); - // Re-map SM version numbers, SmVersion carries the regular SMs which do - // have relative order, while FullSmVersion allows distinguishing sm_90 from - // sm_90a, which would *not* be a subset of sm_91. 
- SmVersion = getSmVersion(); + // Re-map SM version numbers, SmVersion carries the regular SMs which do + // have relative order, while FullSmVersion allows distinguishing sm_90 from + // sm_90a, which would *not* be a subset of sm_91. + SmVersion = getSmVersion(); - // Set default to PTX 6.0 (CUDA 9.0) - if (PTXVersion == 0) { - PTXVersion = 60; + // Set default to PTX 6.0 (CUDA 9.0) + if (PTXVersion == 0) { + PTXVersion = 60; } return *this; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 7555a23..bbc1cca 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -111,7 +111,12 @@ public: // - 0 represents base GPU model, // - non-zero value identifies particular architecture-accelerated variant. bool hasAAFeatures() const { return getFullSmVersion() % 10; } - std::string getTargetName() const { return TargetName; } + + // If the user did not provide a target we default to the `sm_30` target. + std::string getTargetName() const { + return TargetName.empty() ? "sm_30" : TargetName; + } + bool hasTargetName() const { return !TargetName.empty(); } // Get maximum value of required alignments among the supported data types. // From the PTX ISA doc, section 8.2.3: diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index b3b2880..6d4b82a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -255,7 +255,10 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerPipelineStartEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); + // We do not want to fold out calls to nvvm.reflect early if the user + // has not provided a target architecture just yet. + if (Subtarget.hasTargetName()) + FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); // Note: NVVMIntrRangePass was causing numerical discrepancies at one // point, if issues crop up, consider disabling. FPM.addPass(NVVMIntrRangePass()); diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 98bffd9..0f2bec7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -311,11 +311,13 @@ std::optional<unsigned> getMaxNReg(const Function &F) { } bool isKernelFunction(const Function &F) { + if (F.getCallingConv() == CallingConv::PTX_Kernel) + return true; + if (const auto X = findOneNVVMAnnotation(&F, "kernel")) return (*X == 1); - // There is no NVVM metadata, check the calling convention - return F.getCallingConv() == CallingConv::PTX_Kernel; + return false; } MaybeAlign getAlign(const Function &F, unsigned Index) { diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 56525a1..0cd584c 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -21,6 +21,7 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/CodeGen/CommandFlags.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -219,7 +220,13 @@ bool NVVMReflect::runOnFunction(Function &F) { return runNVVMReflect(F, SmVersion); } -NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {} +NVVMReflectPass::NVVMReflectPass() { + // Get the CPU string from the command line if not provided. 
+ std::string MCPU = codegen::getMCPU(); + StringRef SM = MCPU; + if (!SM.consume_front("sm_") || SM.consumeInteger(10, SmVersion)) + SmVersion = 0; +} PreservedAnalyses NVVMReflectPass::run(Function &F, FunctionAnalysisManager &AM) { diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 4466164..98d3615 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(LLVM RISCVGenRegisterBank.inc -gen-register-bank) tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info) tablegen(LLVM RISCVGenSearchableTables.inc -gen-searchable-tables) tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM RISCVGenExegesis.inc -gen-exegesis) set(LLVM_TARGET_DEFINITIONS RISCVGISel.td) tablegen(LLVM RISCVGenGlobalISel.inc -gen-global-isel) diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 3012283..a490910 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -698,6 +698,8 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, TRY_TO_DECODE_FEATURE( RISCV::FeatureVendorXqcicli, DecoderTableXqcicli32, "Qualcomm uC Conditional Load Immediate custom opcode table"); + TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicm, DecoderTableXqcicm32, + "Qualcomm uC Conditional Move custom opcode table"); TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table"); return MCDisassembler::Fail; @@ -727,6 +729,9 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size, TRY_TO_DECODE_FEATURE( RISCV::FeatureVendorXqciac, DecoderTableXqciac16, "Qualcomm uC Load-Store Address Calculation custom 16bit opcode table"); + TRY_TO_DECODE_FEATURE( + RISCV::FeatureVendorXqcicm, DecoderTableXqcicm16, + "Qualcomm uC Conditional Move custom 16bit opcode table"); TRY_TO_DECODE_AND_ADD_SP(STI.hasFeature(RISCV::FeatureVendorXwchc), DecoderTableXwchc16, "WCH QingKe XW custom opcode table"); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index ef85057..3f1539d 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -80,7 +80,6 @@ private: bool selectFPCompare(MachineInstr &MI, MachineIRBuilder &MIB) const; void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, MachineIRBuilder &MIB) const; - bool selectMergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; ComplexRendererFns selectShiftMask(MachineOperand &Root, @@ -732,8 +731,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { } case TargetOpcode::G_IMPLICIT_DEF: return selectImplicitDef(MI, MIB); - case TargetOpcode::G_MERGE_VALUES: - return selectMergeValues(MI, MIB); case TargetOpcode::G_UNMERGE_VALUES: return selectUnmergeValues(MI, MIB); default: @@ -741,26 +738,13 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { } } -bool RISCVInstructionSelector::selectMergeValues(MachineInstr &MI, - MachineIRBuilder &MIB) const { - assert(MI.getOpcode() == TargetOpcode::G_MERGE_VALUES); - - // Build a F64 Pair from operands - if (MI.getNumOperands() != 3) - return false; - Register Dst = MI.getOperand(0).getReg(); - Register Lo = MI.getOperand(1).getReg(); - Register Hi = MI.getOperand(2).getReg(); - if 
(!isRegInFprb(Dst) || !isRegInGprb(Lo) || !isRegInGprb(Hi)) - return false; - MI.setDesc(TII.get(RISCV::BuildPairF64Pseudo)); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); -} - bool RISCVInstructionSelector::selectUnmergeValues( MachineInstr &MI, MachineIRBuilder &MIB) const { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); + if (!Subtarget->hasStdExtZfa()) + return false; + // Split F64 Src into two s32 parts if (MI.getNumOperands() != 3) return false; @@ -769,8 +753,17 @@ bool RISCVInstructionSelector::selectUnmergeValues( Register Hi = MI.getOperand(1).getReg(); if (!isRegInFprb(Src) || !isRegInGprb(Lo) || !isRegInGprb(Hi)) return false; - MI.setDesc(TII.get(RISCV::SplitF64Pseudo)); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + + MachineInstr *ExtractLo = MIB.buildInstr(RISCV::FMV_X_W_FPR64, {Lo}, {Src}); + if (!constrainSelectedInstRegOperands(*ExtractLo, TII, TRI, RBI)) + return false; + + MachineInstr *ExtractHi = MIB.buildInstr(RISCV::FMVH_X_D, {Hi}, {Src}); + if (!constrainSelectedInstRegOperands(*ExtractHi, TII, TRI, RBI)) + return false; + + MI.eraseFromParent(); + return true; } bool RISCVInstructionSelector::replacePtrWithInt(MachineOperand &Op, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 8284737..6f06459 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" @@ -132,7 +133,14 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) auto PtrVecTys = {nxv1p0, nxv2p0, nxv4p0, nxv8p0, nxv16p0}; - getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) + .legalFor({sXLen}) + .legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST)) + .customFor(ST.is64Bit(), {s32}) + .widenScalarToNextPow2(0) + .clampScalar(0, sXLen, sXLen); + + getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) .legalFor({sXLen}) .legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST)) .widenScalarToNextPow2(0) @@ -1330,6 +1338,24 @@ bool RISCVLegalizerInfo::legalizeCustom( return true; return Helper.lowerConstant(MI); } + case TargetOpcode::G_SUB: + case TargetOpcode::G_ADD: { + Helper.Observer.changingInstr(MI); + Helper.widenScalarSrc(MI, sXLen, 1, TargetOpcode::G_ANYEXT); + Helper.widenScalarSrc(MI, sXLen, 2, TargetOpcode::G_ANYEXT); + + Register DstALU = MRI.createGenericVirtualRegister(sXLen); + + MachineOperand &MO = MI.getOperand(0); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); + auto DstSext = MIRBuilder.buildSExtInReg(sXLen, DstALU, 32); + + MIRBuilder.buildInstr(TargetOpcode::G_TRUNC, {MO}, {DstSext}); + MO.setReg(DstALU); + + Helper.Observer.changedInstr(MI); + return true; + } case TargetOpcode::G_SEXT_INREG: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); int64_t SizeInBits = MI.getOperand(2).getImm(); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index eab4a5e..0cb1ef0 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -38,9 +38,12 @@ 
std::optional<MCFixupKind> RISCVAsmBackend::getFixupKind(StringRef Name) const { if (STI.getTargetTriple().isOSBinFormatELF()) { unsigned Type; Type = llvm::StringSwitch<unsigned>(Name) -#define ELF_RELOC(X, Y) .Case(#X, Y) +#define ELF_RELOC(NAME, ID) .Case(#NAME, ID) #include "llvm/BinaryFormat/ELFRelocs/RISCV.def" #undef ELF_RELOC +#define ELF_RISCV_NONSTANDARD_RELOC(_VENDOR, NAME, ID) .Case(#NAME, ID) +#include "llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def" +#undef ELF_RISCV_NONSTANDARD_RELOC .Case("BFD_RELOC_NONE", ELF::R_RISCV_NONE) .Case("BFD_RELOC_32", ELF::R_RISCV_32) .Case("BFD_RELOC_64", ELF::R_RISCV_64) diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 9631241..4e0c64a 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -64,6 +64,12 @@ include "RISCVSchedXiangShanNanHu.td" include "RISCVProcessors.td" //===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "RISCVPfmCounters.td" + +//===----------------------------------------------------------------------===// // Define the RISC-V target. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 0074be3..01bc5387 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1294,6 +1294,14 @@ def HasVendorXqcicli AssemblerPredicate<(all_of FeatureVendorXqcicli), "'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)">; +def FeatureVendorXqcicm + : RISCVExperimentalExtension<0, 2, "Qualcomm uC Conditional Move Extension", + [FeatureStdExtZca]>; +def HasVendorXqcicm + : Predicate<"Subtarget->hasVendorXqcicm()">, + AssemblerPredicate<(all_of FeatureVendorXqcicm), + "'Xqcicm' (Qualcomm uC Conditional Move Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4a0304f..6c58989 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -18385,7 +18385,7 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift( auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); // Bail if we might break a sh{1,2,3}add pattern. - if (Subtarget.hasStdExtZba() && C2->getZExtValue() >= 1 && + if (Subtarget.hasStdExtZba() && C2 && C2->getZExtValue() >= 1 && C2->getZExtValue() <= 3 && N->hasOneUse() && N->user_begin()->getOpcode() == ISD::ADD && !isUsedByLdSt(*N->user_begin(), nullptr) && @@ -20273,13 +20273,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, for (auto &Reg : RegsToPass) Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); - if (!IsTailCall) { - // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); - } + // Add a register mask operand representing the call-preserved registers. 
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); // Glue the call to the argument copies, if any. if (Glue.getNode()) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index ae969bff8..349bc36 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -23,7 +23,9 @@ def SDT_RISCVSplitF64 : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisVT<2, f64>]>; def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>; +def : GINodeEquiv<G_MERGE_VALUES, RISCVBuildPairF64>; def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>; +def : GINodeEquiv<G_UNMERGE_VALUES, RISCVSplitF64>; def AddrRegImmINX : ComplexPattern<iPTR, 2, "SelectAddrRegImmRV32Zdinx">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 5e6722c..6f15646 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -150,6 +150,22 @@ class QCILICC<bits<3> funct3, bits<2> funct2, DAGOperand InTyRs2, string opcodes let Inst{31-25} = {simm, funct2}; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class QCIMVCC<bits<3> funct3, string opcodestr> + : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd), + (ins GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs3), + opcodestr, "$rd, $rs1, $rs2, $rs3">; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class QCIMVCCI<bits<3> funct3, string opcodestr, DAGOperand immType> + : RVInstR4<0b10, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd), + (ins GPRNoX0:$rs1, immType:$imm, GPRNoX0:$rs3), + opcodestr, "$rd, $rs1, $imm, $rs3"> { + bits<5> imm; + + let rs2 = imm; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -270,6 +286,32 @@ let Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" in { def QC_LIGEUI : QCILICC<0b111, 0b11, uimm5, "qc.ligeui">; } // Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" +let Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm" in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in + def QC_C_MVEQZ : RVInst16CL<0b101, 0b10, (outs GPRC:$rd_wb), + (ins GPRC:$rd, GPRC:$rs1), + "qc.c.mveqz", "$rd, $rs1"> { + let Constraints = "$rd = $rd_wb"; + + let Inst{12-10} = 0b011; + let Inst{6-5} = 0b00; + } + + def QC_MVEQ : QCIMVCC<0b000, "qc.mveq">; + def QC_MVNE : QCIMVCC<0b001, "qc.mvne">; + def QC_MVLT : QCIMVCC<0b100, "qc.mvlt">; + def QC_MVGE : QCIMVCC<0b101, "qc.mvge">; + def QC_MVLTU : QCIMVCC<0b110, "qc.mvltu">; + def QC_MVGEU : QCIMVCC<0b111, "qc.mvgeu">; + + def QC_MVEQI : QCIMVCCI<0b000, "qc.mveqi", simm5>; + def QC_MVNEI : QCIMVCCI<0b001, "qc.mvnei", simm5>; + def QC_MVLTI : QCIMVCCI<0b100, "qc.mvlti", simm5>; + def QC_MVGEI : QCIMVCCI<0b101, "qc.mvgei", simm5>; + def QC_MVLTUI : QCIMVCCI<0b110, "qc.mvltui", uimm5>; + def QC_MVGEUI : QCIMVCCI<0b111, "qc.mvgeui", uimm5>; +} // Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm" + //===----------------------------------------------------------------------===// // Aliases //===----------------------------------------------------------------------===// diff --git 
a/llvm/lib/Target/RISCV/RISCVPfmCounters.td b/llvm/lib/Target/RISCV/RISCVPfmCounters.td
new file mode 100644
index 0000000..013e789
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVPfmCounters.td
@@ -0,0 +1,18 @@
+//===---- RISCVPfmCounters.td - RISC-V Hardware Counters ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the available hardware counters for RISC-V.
+//
+//===----------------------------------------------------------------------===//
+
+def CpuCyclesPfmCounter : PfmCounter<"CYCLES">;
+
+def DefaultPfmCounters : ProcPfmCounters {
+  let CycleCounter = CpuCyclesPfmCounter;
+}
+def : PfmCountersDefaultBinding<DefaultPfmCounters>;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 61c7c21..6dfed7dd 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -321,6 +321,25 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model,
                                       [TuneNoSinkSplatOperands, TuneVXRMPipelineFlush])>;
+defvar SiFiveP500TuneFeatures = [TuneNoDefaultUnroll,
+                                 TuneConditionalCompressedMoveFusion,
+                                 TuneLUIADDIFusion,
+                                 TuneAUIPCADDIFusion,
+                                 TunePostRAScheduler];
+
+def SIFIVE_P550 : RISCVProcessorModel<"sifive-p550", NoSchedModel,
+                                      [Feature64Bit,
+                                       FeatureStdExtI,
+                                       FeatureStdExtZifencei,
+                                       FeatureStdExtM,
+                                       FeatureStdExtA,
+                                       FeatureStdExtF,
+                                       FeatureStdExtD,
+                                       FeatureStdExtC,
+                                       FeatureStdExtZba,
+                                       FeatureStdExtZbb],
+                                      SiFiveP500TuneFeatures>;
+
def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
                                      !listconcat(RVA22U64Features,
                                                  [FeatureStdExtV,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index a86c255..396cbe2c 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -182,7 +182,7 @@ def P400WriteCMOV : SchedWriteRes<[SiFiveP400Branch, SiFiveP400IEXQ1]> {
}
def : InstRW<[P400WriteCMOV], (instrs PseudoCCMOVGPRNoX0)>;
-let Latency = 3 in {
+let Latency = 2 in {
// Integer multiplication
def : WriteRes<WriteIMul, [SiFiveP400MulDiv]>;
def : WriteRes<WriteIMul32, [SiFiveP400MulDiv]>;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 32d5526..ad61a77 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -50,7 +50,10 @@ public:
  StringRef getPassName() const override { return PASS_NAME; }
private:
-  bool checkUsers(const MachineOperand *&CommonVL, MachineInstr &MI);
+  std::optional<MachineOperand> getMinimumVLForUser(MachineOperand &UserOp);
+  /// Returns the largest common VL MachineOperand that may be used to optimize
+  /// MI. Returns std::nullopt if it failed to find a suitable VL.
+  std::optional<MachineOperand> checkUsers(MachineInstr &MI);
  bool tryReduceVL(MachineInstr &MI);
  bool isCandidate(const MachineInstr &MI) const;
};
@@ -76,11 +79,6 @@ static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) {
/// Represents the EMUL and EEW of a MachineOperand.
struct OperandInfo {
-  enum class State {
-    Unknown,
-    Known,
-  } S;
-
  // Represent as 1,2,4,8, ... and fractional indicator.
This is because // EMUL can take on values that don't map to RISCVII::VLMUL values exactly. // For example, a mask operand can have an EMUL less than MF8. @@ -89,34 +87,32 @@ struct OperandInfo { unsigned Log2EEW; OperandInfo(RISCVII::VLMUL EMUL, unsigned Log2EEW) - : S(State::Known), EMUL(RISCVVType::decodeVLMUL(EMUL)), Log2EEW(Log2EEW) { - } + : EMUL(RISCVVType::decodeVLMUL(EMUL)), Log2EEW(Log2EEW) {} OperandInfo(std::pair<unsigned, bool> EMUL, unsigned Log2EEW) - : S(State::Known), EMUL(EMUL), Log2EEW(Log2EEW) {} + : EMUL(EMUL), Log2EEW(Log2EEW) {} - OperandInfo() : S(State::Unknown) {} + OperandInfo(unsigned Log2EEW) : Log2EEW(Log2EEW) {} - bool isUnknown() const { return S == State::Unknown; } - bool isKnown() const { return S == State::Known; } + OperandInfo() = delete; static bool EMULAndEEWAreEqual(const OperandInfo &A, const OperandInfo &B) { - assert(A.isKnown() && B.isKnown() && "Both operands must be known"); - return A.Log2EEW == B.Log2EEW && A.EMUL->first == B.EMUL->first && A.EMUL->second == B.EMUL->second; } + static bool EEWAreEqual(const OperandInfo &A, const OperandInfo &B) { + return A.Log2EEW == B.Log2EEW; + } + void print(raw_ostream &OS) const { - if (isUnknown()) { - OS << "Unknown"; - return; - } - assert(EMUL && "Expected EMUL to have value"); - OS << "EMUL: m"; - if (EMUL->second) - OS << "f"; - OS << EMUL->first; + if (EMUL) { + OS << "EMUL: m"; + if (EMUL->second) + OS << "f"; + OS << EMUL->first; + } else + OS << "EMUL: unknown\n"; OS << ", EEW: " << (1 << Log2EEW); } }; @@ -127,30 +123,18 @@ static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) { return OS; } -namespace llvm { -namespace RISCVVType { -/// Return the RISCVII::VLMUL that is two times VLMul. -/// Precondition: VLMul is not LMUL_RESERVED or LMUL_8. -static RISCVII::VLMUL twoTimesVLMUL(RISCVII::VLMUL VLMul) { - switch (VLMul) { - case RISCVII::VLMUL::LMUL_F8: - return RISCVII::VLMUL::LMUL_F4; - case RISCVII::VLMUL::LMUL_F4: - return RISCVII::VLMUL::LMUL_F2; - case RISCVII::VLMUL::LMUL_F2: - return RISCVII::VLMUL::LMUL_1; - case RISCVII::VLMUL::LMUL_1: - return RISCVII::VLMUL::LMUL_2; - case RISCVII::VLMUL::LMUL_2: - return RISCVII::VLMUL::LMUL_4; - case RISCVII::VLMUL::LMUL_4: - return RISCVII::VLMUL::LMUL_8; - case RISCVII::VLMUL::LMUL_8: - default: - llvm_unreachable("Could not multiply VLMul by 2"); - } +LLVM_ATTRIBUTE_UNUSED +static raw_ostream &operator<<(raw_ostream &OS, + const std::optional<OperandInfo> &OI) { + if (OI) + OI->print(OS); + else + OS << "nullopt"; + return OS; } +namespace llvm { +namespace RISCVVType { /// Return EMUL = (EEW / SEW) * LMUL where EEW comes from Log2EEW and LMUL and /// SEW are from the TSFlags of MI. static std::pair<unsigned, bool> @@ -180,24 +164,22 @@ getEMULEqualsEEWDivSEWTimesLMUL(unsigned Log2EEW, const MachineInstr &MI) { } // end namespace RISCVVType } // end namespace llvm -/// Dest has EEW=SEW and EMUL=LMUL. Source EEW=SEW/Factor (i.e. F2 => EEW/2). -/// Source has EMUL=(EEW/SEW)*LMUL. LMUL and SEW comes from TSFlags of MI. -static OperandInfo getIntegerExtensionOperandInfo(unsigned Factor, - const MachineInstr &MI, - const MachineOperand &MO) { - RISCVII::VLMUL MIVLMul = RISCVII::getLMul(MI.getDesc().TSFlags); +/// Dest has EEW=SEW. Source EEW=SEW/Factor (i.e. F2 => EEW/2). +/// SEW comes from TSFlags of MI. 
+static unsigned getIntegerExtensionOperandEEW(unsigned Factor, + const MachineInstr &MI, + const MachineOperand &MO) { unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); if (MO.getOperandNo() == 0) - return OperandInfo(MIVLMul, MILog2SEW); + return MILog2SEW; unsigned MISEW = 1 << MILog2SEW; unsigned EEW = MISEW / Factor; unsigned Log2EEW = Log2_32(EEW); - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(Log2EEW, MI), - Log2EEW); + return Log2EEW; } /// Check whether MO is a mask operand of MI. @@ -211,18 +193,15 @@ static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO, return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID; } -/// Return the OperandInfo for MO. -static OperandInfo getOperandInfo(const MachineOperand &MO, - const MachineRegisterInfo *MRI) { +static std::optional<unsigned> +getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { const MachineInstr &MI = *MO.getParent(); const RISCVVPseudosTable::PseudoInfo *RVV = RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); assert(RVV && "Could not find MI in PseudoTable"); - // MI has a VLMUL and SEW associated with it. The RVV specification defines - // the LMUL and SEW of each operand and definition in relation to MI.VLMUL and - // MI.SEW. - RISCVII::VLMUL MIVLMul = RISCVII::getLMul(MI.getDesc().TSFlags); + // MI has a SEW associated with it. The RVV specification defines + // the EEW of each operand and definition in relation to MI.SEW. unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); @@ -233,13 +212,13 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, // since they must preserve the entire register content. if (HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs() && (MO.getReg() != RISCV::NoRegister)) - return {}; + return std::nullopt; bool IsMODef = MO.getOperandNo() == 0; - // All mask operands have EEW=1, EMUL=(EEW/SEW)*LMUL + // All mask operands have EEW=1 if (isMaskOperand(MI, MO, MRI)) - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0); + return 0; // switch against BaseInstr to reduce number of cases that need to be // considered. @@ -256,55 +235,65 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, // Vector Loads and Stores // Vector Unit-Stride Instructions // Vector Strided Instructions - /// Dest EEW encoded in the instruction and EMUL=(EEW/SEW)*LMUL + /// Dest EEW encoded in the instruction + case RISCV::VLM_V: + case RISCV::VSM_V: + return 0; + case RISCV::VLE8_V: case RISCV::VSE8_V: + case RISCV::VLSE8_V: case RISCV::VSSE8_V: - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(3, MI), 3); + return 3; + case RISCV::VLE16_V: case RISCV::VSE16_V: + case RISCV::VLSE16_V: case RISCV::VSSE16_V: - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(4, MI), 4); + return 4; + case RISCV::VLE32_V: case RISCV::VSE32_V: + case RISCV::VLSE32_V: case RISCV::VSSE32_V: - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(5, MI), 5); + return 5; + case RISCV::VLE64_V: case RISCV::VSE64_V: + case RISCV::VLSE64_V: case RISCV::VSSE64_V: - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(6, MI), 6); + return 6; // Vector Indexed Instructions // vs(o|u)xei<eew>.v - // Dest/Data (operand 0) EEW=SEW, EMUL=LMUL. Source EEW=<eew> and - // EMUL=(EEW/SEW)*LMUL. + // Dest/Data (operand 0) EEW=SEW. Source EEW=<eew>. 
case RISCV::VLUXEI8_V: case RISCV::VLOXEI8_V: case RISCV::VSUXEI8_V: case RISCV::VSOXEI8_V: { if (MO.getOperandNo() == 0) - return OperandInfo(MIVLMul, MILog2SEW); - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(3, MI), 3); + return MILog2SEW; + return 3; } case RISCV::VLUXEI16_V: case RISCV::VLOXEI16_V: case RISCV::VSUXEI16_V: case RISCV::VSOXEI16_V: { if (MO.getOperandNo() == 0) - return OperandInfo(MIVLMul, MILog2SEW); - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(4, MI), 4); + return MILog2SEW; + return 4; } case RISCV::VLUXEI32_V: case RISCV::VLOXEI32_V: case RISCV::VSUXEI32_V: case RISCV::VSOXEI32_V: { if (MO.getOperandNo() == 0) - return OperandInfo(MIVLMul, MILog2SEW); - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(5, MI), 5); + return MILog2SEW; + return 5; } case RISCV::VLUXEI64_V: case RISCV::VLOXEI64_V: case RISCV::VSUXEI64_V: case RISCV::VSOXEI64_V: { if (MO.getOperandNo() == 0) - return OperandInfo(MIVLMul, MILog2SEW); - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(6, MI), 6); + return MILog2SEW; + return 6; } // Vector Integer Arithmetic Instructions @@ -318,7 +307,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VRSUB_VX: // Vector Bitwise Logical Instructions // Vector Single-Width Shift Instructions - // EEW=SEW. EMUL=LMUL. + // EEW=SEW. case RISCV::VAND_VI: case RISCV::VAND_VV: case RISCV::VAND_VX: @@ -338,7 +327,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VSRA_VV: case RISCV::VSRA_VX: // Vector Integer Min/Max Instructions - // EEW=SEW. EMUL=LMUL. + // EEW=SEW. case RISCV::VMINU_VV: case RISCV::VMINU_VX: case RISCV::VMIN_VV: @@ -348,7 +337,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VMAX_VV: case RISCV::VMAX_VX: // Vector Single-Width Integer Multiply Instructions - // Source and Dest EEW=SEW and EMUL=LMUL. + // Source and Dest EEW=SEW. case RISCV::VMUL_VV: case RISCV::VMUL_VX: case RISCV::VMULH_VV: @@ -358,7 +347,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VMULHSU_VV: case RISCV::VMULHSU_VX: // Vector Integer Divide Instructions - // EEW=SEW. EMUL=LMUL. + // EEW=SEW. case RISCV::VDIVU_VV: case RISCV::VDIVU_VX: case RISCV::VDIV_VV: @@ -368,7 +357,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VREM_VV: case RISCV::VREM_VX: // Vector Single-Width Integer Multiply-Add Instructions - // EEW=SEW. EMUL=LMUL. + // EEW=SEW. case RISCV::VMACC_VV: case RISCV::VMACC_VX: case RISCV::VNMSAC_VV: @@ -379,8 +368,8 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VNMSUB_VX: // Vector Integer Merge Instructions // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions - // EEW=SEW and EMUL=LMUL, except the mask operand has EEW=1 and EMUL= - // (EEW/SEW)*LMUL. Mask operand is handled before this switch. + // EEW=SEW, except the mask operand has EEW=1. Mask operand is handled + // before this switch. case RISCV::VMERGE_VIM: case RISCV::VMERGE_VVM: case RISCV::VMERGE_VXM: @@ -393,7 +382,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, // Vector Fixed-Point Arithmetic Instructions // Vector Single-Width Saturating Add and Subtract // Vector Single-Width Averaging Add and Subtract - // EEW=SEW. EMUL=LMUL. + // EEW=SEW. 
case RISCV::VMV_V_I: case RISCV::VMV_V_V: case RISCV::VMV_V_X: @@ -416,12 +405,12 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VASUB_VV: case RISCV::VASUB_VX: // Vector Single-Width Fractional Multiply with Rounding and Saturation - // EEW=SEW. EMUL=LMUL. The instruction produces 2*SEW product internally but + // EEW=SEW. The instruction produces 2*SEW product internally but // saturates to fit into SEW bits. case RISCV::VSMUL_VV: case RISCV::VSMUL_VX: // Vector Single-Width Scaling Shift Instructions - // EEW=SEW. EMUL=LMUL. + // EEW=SEW. case RISCV::VSSRL_VI: case RISCV::VSSRL_VV: case RISCV::VSSRL_VX: @@ -431,13 +420,13 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, // Vector Permutation Instructions // Integer Scalar Move Instructions // Floating-Point Scalar Move Instructions - // EMUL=LMUL. EEW=SEW. + // EEW=SEW. case RISCV::VMV_X_S: case RISCV::VMV_S_X: case RISCV::VFMV_F_S: case RISCV::VFMV_S_F: // Vector Slide Instructions - // EMUL=LMUL. EEW=SEW. + // EEW=SEW. case RISCV::VSLIDEUP_VI: case RISCV::VSLIDEUP_VX: case RISCV::VSLIDEDOWN_VI: @@ -447,12 +436,12 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VSLIDE1DOWN_VX: case RISCV::VFSLIDE1DOWN_VF: // Vector Register Gather Instructions - // EMUL=LMUL. EEW=SEW. For mask operand, EMUL=1 and EEW=1. + // EEW=SEW. For mask operand, EEW=1. case RISCV::VRGATHER_VI: case RISCV::VRGATHER_VV: case RISCV::VRGATHER_VX: // Vector Compress Instruction - // EMUL=LMUL. EEW=SEW. + // EEW=SEW. case RISCV::VCOMPRESS_VM: // Vector Element Index Instruction case RISCV::VID_V: @@ -499,10 +488,10 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VFCVT_F_X_V: // Vector Floating-Point Merge Instruction case RISCV::VFMERGE_VFM: - return OperandInfo(MIVLMul, MILog2SEW); + return MILog2SEW; // Vector Widening Integer Add/Subtract - // Def uses EEW=2*SEW and EMUL=2*LMUL. Operands use EEW=SEW and EMUL=LMUL. + // Def uses EEW=2*SEW . Operands use EEW=SEW. case RISCV::VWADDU_VV: case RISCV::VWADDU_VX: case RISCV::VWSUBU_VV: @@ -513,7 +502,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VWSUB_VX: case RISCV::VWSLL_VI: // Vector Widening Integer Multiply Instructions - // Source and Destination EMUL=LMUL. Destination EEW=2*SEW. Source EEW=SEW. + // Destination EEW=2*SEW. Source EEW=SEW. case RISCV::VWMUL_VV: case RISCV::VWMUL_VX: case RISCV::VWMULSU_VV: @@ -521,7 +510,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VWMULU_VV: case RISCV::VWMULU_VX: // Vector Widening Integer Multiply-Add Instructions - // Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL. + // Destination EEW=2*SEW. Source EEW=SEW. // A SEW-bit*SEW-bit multiply of the sources forms a 2*SEW-bit value, which // is then added to the 2*SEW-bit Dest. These instructions never have a // passthru operand. @@ -542,7 +531,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VFWNMSAC_VF: case RISCV::VFWNMSAC_VV: // Vector Widening Floating-Point Add/Subtract Instructions - // Dest EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL. + // Dest EEW=2*SEW. Source EEW=SEW. case RISCV::VFWADD_VV: case RISCV::VFWADD_VF: case RISCV::VFWSUB_VV: @@ -559,12 +548,10 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VFWCVT_F_X_V: case RISCV::VFWCVT_F_F_V: { unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW; - RISCVII::VLMUL EMUL = - IsMODef ? 
RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul; - return OperandInfo(EMUL, Log2EEW); + return Log2EEW; } - // Def and Op1 uses EEW=2*SEW and EMUL=2*LMUL. Op2 uses EEW=SEW and EMUL=LMUL + // Def and Op1 uses EEW=2*SEW. Op2 uses EEW=SEW. case RISCV::VWADDU_WV: case RISCV::VWADDU_WX: case RISCV::VWSUBU_WV: @@ -581,25 +568,22 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; bool TwoTimes = IsMODef || IsOp1; unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; - RISCVII::VLMUL EMUL = - TwoTimes ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul; - return OperandInfo(EMUL, Log2EEW); + return Log2EEW; } // Vector Integer Extension case RISCV::VZEXT_VF2: case RISCV::VSEXT_VF2: - return getIntegerExtensionOperandInfo(2, MI, MO); + return getIntegerExtensionOperandEEW(2, MI, MO); case RISCV::VZEXT_VF4: case RISCV::VSEXT_VF4: - return getIntegerExtensionOperandInfo(4, MI, MO); + return getIntegerExtensionOperandEEW(4, MI, MO); case RISCV::VZEXT_VF8: case RISCV::VSEXT_VF8: - return getIntegerExtensionOperandInfo(8, MI, MO); + return getIntegerExtensionOperandEEW(8, MI, MO); // Vector Narrowing Integer Right Shift Instructions - // Destination EEW=SEW and EMUL=LMUL, Op 1 has EEW=2*SEW EMUL=2*LMUL. Op2 has - // EEW=SEW EMUL=LMUL. + // Destination EEW=SEW, Op 1 has EEW=2*SEW. Op2 has EEW=SEW case RISCV::VNSRL_WX: case RISCV::VNSRL_WI: case RISCV::VNSRL_WV: @@ -607,7 +591,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VNSRA_WV: case RISCV::VNSRA_WX: // Vector Narrowing Fixed-Point Clip Instructions - // Destination and Op1 EEW=SEW and EMUL=LMUL. Op2 EEW=2*SEW and EMUL=2*LMUL + // Destination and Op1 EEW=SEW. Op2 EEW=2*SEW. case RISCV::VNCLIPU_WI: case RISCV::VNCLIPU_WV: case RISCV::VNCLIPU_WX: @@ -626,9 +610,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1; bool TwoTimes = IsOp1; unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; - RISCVII::VLMUL EMUL = - TwoTimes ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul; - return OperandInfo(EMUL, Log2EEW); + return Log2EEW; } // Vector Mask Instructions @@ -636,7 +618,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, // vmsbf.m set-before-first mask bit // vmsif.m set-including-first mask bit // vmsof.m set-only-first mask bit - // EEW=1 and EMUL=(EEW/SEW)*LMUL + // EEW=1 // We handle the cases when operand is a v0 mask operand above the switch, // but these instructions may use non-v0 mask operands and need to be handled // specifically. @@ -651,20 +633,20 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VMSBF_M: case RISCV::VMSIF_M: case RISCV::VMSOF_M: { - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0); + return 0; } // Vector Iota Instruction - // EEW=SEW and EMUL=LMUL, except the mask operand has EEW=1 and EMUL= - // (EEW/SEW)*LMUL. Mask operand is not handled before this switch. + // EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled + // before this switch. case RISCV::VIOTA_M: { if (IsMODef || MO.getOperandNo() == 1) - return OperandInfo(MIVLMul, MILog2SEW); - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0); + return MILog2SEW; + return 0; } // Vector Integer Compare Instructions - // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL. + // Dest EEW=1. Source EEW=SEW. 
case RISCV::VMSEQ_VI: case RISCV::VMSEQ_VV: case RISCV::VMSEQ_VX: @@ -686,21 +668,20 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VMSGT_VI: case RISCV::VMSGT_VX: // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions - // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL. Mask - // source operand handled above this switch. + // Dest EEW=1. Source EEW=SEW. Mask source operand handled above this switch. case RISCV::VMADC_VIM: case RISCV::VMADC_VVM: case RISCV::VMADC_VXM: case RISCV::VMSBC_VVM: case RISCV::VMSBC_VXM: - // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL. + // Dest EEW=1. Source EEW=SEW. case RISCV::VMADC_VV: case RISCV::VMADC_VI: case RISCV::VMADC_VX: case RISCV::VMSBC_VV: case RISCV::VMSBC_VX: // 13.13. Vector Floating-Point Compare Instructions - // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW EMUL=LMUL. + // Dest EEW=1. Source EEW=SEW case RISCV::VMFEQ_VF: case RISCV::VMFEQ_VV: case RISCV::VMFNE_VF: @@ -712,15 +693,62 @@ static OperandInfo getOperandInfo(const MachineOperand &MO, case RISCV::VMFGT_VF: case RISCV::VMFGE_VF: { if (IsMODef) - return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0); - return OperandInfo(MIVLMul, MILog2SEW); + return 0; + return MILog2SEW; + } + + // Vector Reduction Operations + // Vector Single-Width Integer Reduction Instructions + case RISCV::VREDAND_VS: + case RISCV::VREDMAX_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDSUM_VS: + case RISCV::VREDXOR_VS: { + return MILog2SEW; } default: - return {}; + return std::nullopt; } } +static std::optional<OperandInfo> +getOperandInfo(const MachineOperand &MO, const MachineRegisterInfo *MRI) { + const MachineInstr &MI = *MO.getParent(); + const RISCVVPseudosTable::PseudoInfo *RVV = + RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); + assert(RVV && "Could not find MI in PseudoTable"); + + std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO, MRI); + if (!Log2EEW) + return std::nullopt; + + switch (RVV->BaseInstr) { + // Vector Reduction Operations + // Vector Single-Width Integer Reduction Instructions + // The Dest and VS1 only read element 0 of the vector register. Return just + // the EEW for these. + case RISCV::VREDAND_VS: + case RISCV::VREDMAX_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDSUM_VS: + case RISCV::VREDXOR_VS: + if (MO.getOperandNo() != 2) + return OperandInfo(*Log2EEW); + break; + }; + + // All others have EMUL=EEW/SEW*LMUL + return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(*Log2EEW, MI), + *Log2EEW); +} + /// Return true if this optimization should consider MI for VL reduction. This /// white-list approach simplifies this optimization for instructions that may /// have more complex semantics with relation to how it uses VL. 
@@ -732,6 +760,32 @@ static bool isSupportedInstr(const MachineInstr &MI) { return false; switch (RVV->BaseInstr) { + // Vector Unit-Stride Instructions + // Vector Strided Instructions + case RISCV::VLM_V: + case RISCV::VLE8_V: + case RISCV::VLSE8_V: + case RISCV::VLE16_V: + case RISCV::VLSE16_V: + case RISCV::VLE32_V: + case RISCV::VLSE32_V: + case RISCV::VLE64_V: + case RISCV::VLSE64_V: + // Vector Indexed Instructions + case RISCV::VLUXEI8_V: + case RISCV::VLOXEI8_V: + case RISCV::VLUXEI16_V: + case RISCV::VLOXEI16_V: + case RISCV::VLUXEI32_V: + case RISCV::VLOXEI32_V: + case RISCV::VLUXEI64_V: + case RISCV::VLOXEI64_V: { + for (const MachineMemOperand *MMO : MI.memoperands()) + if (MMO->isVolatile()) + return false; + return true; + } + // Vector Single-Width Integer Add and Subtract case RISCV::VADD_VI: case RISCV::VADD_VV: @@ -901,6 +955,30 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VMSOF_M: case RISCV::VIOTA_M: case RISCV::VID_V: + // Single-Width Floating-Point/Integer Type-Convert Instructions + case RISCV::VFCVT_XU_F_V: + case RISCV::VFCVT_X_F_V: + case RISCV::VFCVT_RTZ_XU_F_V: + case RISCV::VFCVT_RTZ_X_F_V: + case RISCV::VFCVT_F_XU_V: + case RISCV::VFCVT_F_X_V: + // Widening Floating-Point/Integer Type-Convert Instructions + case RISCV::VFWCVT_XU_F_V: + case RISCV::VFWCVT_X_F_V: + case RISCV::VFWCVT_RTZ_XU_F_V: + case RISCV::VFWCVT_RTZ_X_F_V: + case RISCV::VFWCVT_F_XU_V: + case RISCV::VFWCVT_F_X_V: + case RISCV::VFWCVT_F_F_V: + // Narrowing Floating-Point/Integer Type-Convert Instructions + case RISCV::VFNCVT_XU_F_W: + case RISCV::VFNCVT_X_F_W: + case RISCV::VFNCVT_RTZ_XU_F_W: + case RISCV::VFNCVT_RTZ_X_F_W: + case RISCV::VFNCVT_F_XU_W: + case RISCV::VFNCVT_F_X_W: + case RISCV::VFNCVT_F_F_W: + case RISCV::VFNCVT_ROD_F_F_W: return true; } @@ -1007,6 +1085,11 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return false; } + if (MI.mayRaiseFPException()) { + LLVM_DEBUG(dbgs() << "Not a candidate because may raise FP exception\n"); + return false; + } + // Some instructions that produce vectors have semantics that make it more // difficult to determine whether the VL can be reduced. For example, some // instructions, such as reductions, may write lanes past VL to a scalar @@ -1028,79 +1111,103 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return true; } -bool RISCVVLOptimizer::checkUsers(const MachineOperand *&CommonVL, - MachineInstr &MI) { +std::optional<MachineOperand> +RISCVVLOptimizer::getMinimumVLForUser(MachineOperand &UserOp) { + const MachineInstr &UserMI = *UserOp.getParent(); + const MCInstrDesc &Desc = UserMI.getDesc(); + + if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) { + LLVM_DEBUG(dbgs() << " Abort due to lack of VL, assume that" + " use VLMAX\n"); + return std::nullopt; + } + + // Instructions like reductions may use a vector register as a scalar + // register. In this case, we should treat it as only reading the first lane. 
+ if (isVectorOpUsedAsScalarOp(UserOp)) { + [[maybe_unused]] Register R = UserOp.getReg(); + [[maybe_unused]] const TargetRegisterClass *RC = MRI->getRegClass(R); + assert(RISCV::VRRegClass.hasSubClassEq(RC) && + "Expect LMUL 1 register class for vector as scalar operands!"); + LLVM_DEBUG(dbgs() << " Used this operand as a scalar operand\n"); + + return MachineOperand::CreateImm(1); + } + + unsigned VLOpNum = RISCVII::getVLOpNum(Desc); + const MachineOperand &VLOp = UserMI.getOperand(VLOpNum); + // Looking for an immediate or a register VL that isn't X0. + assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) && + "Did not expect X0 VL"); + return VLOp; +} + +std::optional<MachineOperand> RISCVVLOptimizer::checkUsers(MachineInstr &MI) { // FIXME: Avoid visiting each user for each time we visit something on the // worklist, combined with an extra visit from the outer loop. Restructure // along lines of an instcombine style worklist which integrates the outer // pass. - bool CanReduceVL = true; + std::optional<MachineOperand> CommonVL; for (auto &UserOp : MRI->use_operands(MI.getOperand(0).getReg())) { const MachineInstr &UserMI = *UserOp.getParent(); LLVM_DEBUG(dbgs() << " Checking user: " << UserMI << "\n"); - - // Instructions like reductions may use a vector register as a scalar - // register. In this case, we should treat it like a scalar register which - // does not impact the decision on whether to optimize VL. - // TODO: Treat it like a scalar register instead of bailing out. - if (isVectorOpUsedAsScalarOp(UserOp)) { - CanReduceVL = false; - break; - } - if (mayReadPastVL(UserMI)) { LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); - CanReduceVL = false; - break; + return std::nullopt; } // Tied operands might pass through. if (UserOp.isTied()) { LLVM_DEBUG(dbgs() << " Abort because user used as tied operand\n"); - CanReduceVL = false; - break; + return std::nullopt; } - const MCInstrDesc &Desc = UserMI.getDesc(); - if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) { - LLVM_DEBUG(dbgs() << " Abort due to lack of VL or SEW, assume that" - " use VLMAX\n"); - CanReduceVL = false; - break; - } - - unsigned VLOpNum = RISCVII::getVLOpNum(Desc); - const MachineOperand &VLOp = UserMI.getOperand(VLOpNum); - - // Looking for an immediate or a register VL that isn't X0. - assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) && - "Did not expect X0 VL"); + auto VLOp = getMinimumVLForUser(UserOp); + if (!VLOp) + return std::nullopt; // Use the largest VL among all the users. If we cannot determine this // statically, then we cannot optimize the VL. 
- if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, VLOp)) { - CommonVL = &VLOp; + if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, *VLOp)) { + CommonVL = *VLOp; LLVM_DEBUG(dbgs() << " User VL is: " << VLOp << "\n"); - } else if (!RISCV::isVLKnownLE(VLOp, *CommonVL)) { + } else if (!RISCV::isVLKnownLE(*VLOp, *CommonVL)) { LLVM_DEBUG(dbgs() << " Abort because cannot determine a common VL\n"); - CanReduceVL = false; - break; + return std::nullopt; + } + + if (!RISCVII::hasSEWOp(UserMI.getDesc().TSFlags)) { + LLVM_DEBUG(dbgs() << " Abort due to lack of SEW operand\n"); + return std::nullopt; + } + + std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp, MRI); + std::optional<OperandInfo> ProducerInfo = + getOperandInfo(MI.getOperand(0), MRI); + if (!ConsumerInfo || !ProducerInfo) { + LLVM_DEBUG(dbgs() << " Abort due to unknown operand information.\n"); + LLVM_DEBUG(dbgs() << " ConsumerInfo is: " << ConsumerInfo << "\n"); + LLVM_DEBUG(dbgs() << " ProducerInfo is: " << ProducerInfo << "\n"); + return std::nullopt; } - // The SEW and LMUL of destination and source registers need to match. - OperandInfo ConsumerInfo = getOperandInfo(UserOp, MRI); - OperandInfo ProducerInfo = getOperandInfo(MI.getOperand(0), MRI); - if (ConsumerInfo.isUnknown() || ProducerInfo.isUnknown() || - !OperandInfo::EMULAndEEWAreEqual(ConsumerInfo, ProducerInfo)) { - LLVM_DEBUG(dbgs() << " Abort due to incompatible or unknown " - "information for EMUL or EEW.\n"); + // If the operand is used as a scalar operand, then the EEW must be + // compatible. Otherwise, the EMUL *and* EEW must be compatible. + bool IsVectorOpUsedAsScalarOp = isVectorOpUsedAsScalarOp(UserOp); + if ((IsVectorOpUsedAsScalarOp && + !OperandInfo::EEWAreEqual(*ConsumerInfo, *ProducerInfo)) || + (!IsVectorOpUsedAsScalarOp && + !OperandInfo::EMULAndEEWAreEqual(*ConsumerInfo, *ProducerInfo))) { + LLVM_DEBUG( + dbgs() + << " Abort due to incompatible information for EMUL or EEW.\n"); LLVM_DEBUG(dbgs() << " ConsumerInfo is: " << ConsumerInfo << "\n"); LLVM_DEBUG(dbgs() << " ProducerInfo is: " << ProducerInfo << "\n"); - CanReduceVL = false; - break; + return std::nullopt; } } - return CanReduceVL; + + return CommonVL; } bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) { @@ -1112,12 +1219,11 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) { MachineInstr &MI = *Worklist.pop_back_val(); LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n"); - const MachineOperand *CommonVL = nullptr; - bool CanReduceVL = true; - if (isVectorRegClass(MI.getOperand(0).getReg(), MRI)) - CanReduceVL = checkUsers(CommonVL, MI); + if (!isVectorRegClass(MI.getOperand(0).getReg(), MRI)) + continue; - if (!CanReduceVL || !CommonVL) + auto CommonVL = checkUsers(MI); + if (!CommonVL) continue; assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) && diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index aa83d99..a79e19f 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -20,7 +20,6 @@ add_llvm_target(SPIRVCodeGen SPIRVCallLowering.cpp SPIRVInlineAsmLowering.cpp SPIRVCommandLine.cpp - SPIRVDuplicatesTracker.cpp SPIRVEmitIntrinsics.cpp SPIRVGlobalRegistry.cpp SPIRVInstrInfo.cpp diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 4012bd7..78add92 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -274,7 +274,7 @@ void SPIRVAsmPrinter::emitInstruction(const 
MachineInstr *MI) { } void SPIRVAsmPrinter::outputModuleSection(SPIRV::ModuleSectionType MSType) { - for (MachineInstr *MI : MAI->getMSInstrs(MSType)) + for (const MachineInstr *MI : MAI->getMSInstrs(MSType)) outputInstruction(MI); } @@ -326,7 +326,7 @@ void SPIRVAsmPrinter::outputOpMemoryModel() { void SPIRVAsmPrinter::outputEntryPoints() { // Find all OpVariable IDs with required StorageClass. DenseSet<Register> InterfaceIDs; - for (MachineInstr *MI : MAI->GlobalVarList) { + for (const MachineInstr *MI : MAI->GlobalVarList) { assert(MI->getOpcode() == SPIRV::OpVariable); auto SC = static_cast<SPIRV::StorageClass::StorageClass>( MI->getOperand(2).getImm()); @@ -336,14 +336,14 @@ void SPIRVAsmPrinter::outputEntryPoints() { // declaring all global variables referenced by the entry point call tree. if (ST->isAtLeastSPIRVVer(VersionTuple(1, 4)) || SC == SPIRV::StorageClass::Input || SC == SPIRV::StorageClass::Output) { - MachineFunction *MF = MI->getMF(); + const MachineFunction *MF = MI->getMF(); Register Reg = MAI->getRegisterAlias(MF, MI->getOperand(0).getReg()); InterfaceIDs.insert(Reg); } } // Output OpEntryPoints adding interface args to all of them. - for (MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_EntryPoints)) { + for (const MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_EntryPoints)) { SPIRVMCInstLower MCInstLowering; MCInst TmpInst; MCInstLowering.lower(MI, TmpInst, MAI); @@ -381,9 +381,8 @@ void SPIRVAsmPrinter::outputGlobalRequirements() { void SPIRVAsmPrinter::outputExtFuncDecls() { // Insert OpFunctionEnd after each declaration. - SmallVectorImpl<MachineInstr *>::iterator - I = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).begin(), - E = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).end(); + auto I = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).begin(), + E = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).end(); for (; I != E; ++I) { outputInstruction(*I); if ((I + 1) == E || (*(I + 1))->getOpcode() == SPIRV::OpFunction) diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index fa37313f..44b6f5f8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -418,6 +418,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, .addImm(FuncControl) .addUse(GR->getSPIRVTypeID(FuncTy)); GR->recordFunctionDefinition(&F, &MB.getInstr()->getOperand(0)); + GR->addGlobalObject(&F, &MIRBuilder.getMF(), FuncVReg); // Add OpFunctionParameter instructions int i = 0; @@ -431,6 +432,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, .addUse(GR->getSPIRVTypeID(ArgTypeVRegs[i])); if (F.isDeclaration()) GR->add(&Arg, &MIRBuilder.getMF(), ArgReg); + GR->addGlobalObject(&Arg, &MIRBuilder.getMF(), ArgReg); i++; } // Name the function. diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp deleted file mode 100644 index 48df845..0000000 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp +++ /dev/null @@ -1,136 +0,0 @@ -//===-- SPIRVDuplicatesTracker.cpp - SPIR-V Duplicates Tracker --*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// General infrastructure for keeping track of the values that according to -// the SPIR-V binary layout should be global to the whole module. -// -//===----------------------------------------------------------------------===// - -#include "SPIRVDuplicatesTracker.h" -#include "SPIRVInstrInfo.h" - -#define DEBUG_TYPE "build-dep-graph" - -using namespace llvm; - -template <typename T> -void SPIRVGeneralDuplicatesTracker::prebuildReg2Entry( - SPIRVDuplicatesTracker<T> &DT, SPIRVReg2EntryTy &Reg2Entry, - const SPIRVInstrInfo *TII) { - for (auto &TPair : DT.getAllUses()) { - for (auto &RegPair : TPair.second) { - const MachineFunction *MF = RegPair.first; - Register R = RegPair.second; - MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(R); - if (!MI || (TPair.second.getIsConst() && !TII->isConstantInstr(*MI))) - continue; - Reg2Entry[&MI->getOperand(0)] = &TPair.second; - } - } -} - -void SPIRVGeneralDuplicatesTracker::buildDepsGraph( - std::vector<SPIRV::DTSortableEntry *> &Graph, const SPIRVInstrInfo *TII, - MachineModuleInfo *MMI = nullptr) { - SPIRVReg2EntryTy Reg2Entry; - prebuildReg2Entry(TT, Reg2Entry, TII); - prebuildReg2Entry(CT, Reg2Entry, TII); - prebuildReg2Entry(GT, Reg2Entry, TII); - prebuildReg2Entry(FT, Reg2Entry, TII); - prebuildReg2Entry(AT, Reg2Entry, TII); - prebuildReg2Entry(MT, Reg2Entry, TII); - prebuildReg2Entry(ST, Reg2Entry, TII); - - for (auto &Op2E : Reg2Entry) { - SPIRV::DTSortableEntry *E = Op2E.second; - Graph.push_back(E); - for (auto &U : *E) { - const MachineRegisterInfo &MRI = U.first->getRegInfo(); - MachineInstr *MI = MRI.getUniqueVRegDef(U.second); - if (!MI) - continue; - assert(MI && MI->getParent() && "No MachineInstr created yet"); - for (auto i = MI->getNumDefs(); i < MI->getNumOperands(); i++) { - MachineOperand &Op = MI->getOperand(i); - if (!Op.isReg()) - continue; - MachineInstr *VRegDef = MRI.getVRegDef(Op.getReg()); - // References to a function via function pointers generate virtual - // registers without a definition. We are able to resolve this - // reference using Globar Register info into an OpFunction instruction - // but do not expect to find it in Reg2Entry. 
- if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL && i == 2) - continue; - MachineOperand *RegOp = &VRegDef->getOperand(0); - if (Reg2Entry.count(RegOp) == 0 && - (MI->getOpcode() != SPIRV::OpVariable || i != 3)) { - // try to repair the unexpected code pattern - bool IsFixed = false; - if (VRegDef->getOpcode() == TargetOpcode::G_CONSTANT && - RegOp->isReg() && MRI.getType(RegOp->getReg()).isScalar()) { - const Constant *C = VRegDef->getOperand(1).getCImm(); - add(C, MI->getParent()->getParent(), RegOp->getReg()); - auto Iter = CT.Storage.find(C); - if (Iter != CT.Storage.end()) { - SPIRV::DTSortableEntry &MissedEntry = Iter->second; - Reg2Entry[RegOp] = &MissedEntry; - IsFixed = true; - } - } - if (!IsFixed) { - std::string DiagMsg; - raw_string_ostream OS(DiagMsg); - OS << "Unexpected pattern while building a dependency " - "graph.\nInstruction: "; - MI->print(OS); - OS << "Operand: "; - Op.print(OS); - OS << "\nOperand definition: "; - VRegDef->print(OS); - report_fatal_error(DiagMsg.c_str()); - } - } - if (Reg2Entry.count(RegOp)) - E->addDep(Reg2Entry[RegOp]); - } - - if (E->getIsFunc()) { - MachineInstr *Next = MI->getNextNode(); - if (Next && (Next->getOpcode() == SPIRV::OpFunction || - Next->getOpcode() == SPIRV::OpFunctionParameter)) { - E->addDep(Reg2Entry[&Next->getOperand(0)]); - } - } - } - } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - if (MMI) { - const Module *M = MMI->getModule(); - for (auto F = M->begin(), E = M->end(); F != E; ++F) { - const MachineFunction *MF = MMI->getMachineFunction(*F); - if (!MF) - continue; - for (const MachineBasicBlock &MBB : *MF) { - for (const MachineInstr &CMI : MBB) { - MachineInstr &MI = const_cast<MachineInstr &>(CMI); - MI.dump(); - if (MI.getNumExplicitDefs() > 0 && - Reg2Entry.count(&MI.getOperand(0))) { - dbgs() << "\t["; - for (SPIRV::DTSortableEntry *D : - Reg2Entry.lookup(&MI.getOperand(0))->getDeps()) - dbgs() << Register::virtReg2Index(D->lookup(MF)) << ", "; - dbgs() << "]\n"; - } - } - } - } - } -#endif -} diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h index 6847da0..e574892 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h @@ -211,23 +211,7 @@ class SPIRVGeneralDuplicatesTracker { SPIRVDuplicatesTracker<MachineInstr> MT; SPIRVDuplicatesTracker<SPIRV::SpecialTypeDescriptor> ST; - // NOTE: using MOs instead of regs to get rid of MF dependency to be able - // to use flat data structure. - // NOTE: replacing DenseMap with MapVector doesn't affect overall correctness - // but makes LITs more stable, should prefer DenseMap still due to - // significant perf difference. 
- using SPIRVReg2EntryTy = - MapVector<MachineOperand *, SPIRV::DTSortableEntry *>; - - template <typename T> - void prebuildReg2Entry(SPIRVDuplicatesTracker<T> &DT, - SPIRVReg2EntryTy &Reg2Entry, - const SPIRVInstrInfo *TII); - public: - void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph, - const SPIRVInstrInfo *TII, MachineModuleInfo *MMI); - void add(const Type *Ty, const MachineFunction *MF, Register R) { TT.add(unifyPtrType(Ty), MF, R); } diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 77b5421..d2b14d6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -1841,20 +1841,20 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV, // Skip special artifical variable llvm.global.annotations. if (GV.getName() == "llvm.global.annotations") return; - if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) { + Constant *Init = nullptr; + if (hasInitializer(&GV)) { // Deduce element type and store results in Global Registry. // Result is ignored, because TypedPointerType is not supported // by llvm IR general logic. deduceElementTypeHelper(&GV, false); - Constant *Init = GV.getInitializer(); + Init = GV.getInitializer(); Type *Ty = isAggrConstForceInt32(Init) ? B.getInt32Ty() : Init->getType(); Constant *Const = isAggrConstForceInt32(Init) ? B.getInt32(1) : Init; auto *InitInst = B.CreateIntrinsic(Intrinsic::spv_init_global, {GV.getType(), Ty}, {&GV, Const}); InitInst->setArgOperand(1, Init); } - if ((!GV.hasInitializer() || isa<UndefValue>(GV.getInitializer())) && - GV.getNumUses() == 0) + if (!Init && GV.getNumUses() == 0) B.CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV); } diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 0c424477..a06c62e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -721,6 +721,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( } Reg = MIB->getOperand(0).getReg(); DT.add(GVar, &MIRBuilder.getMF(), Reg); + addGlobalObject(GVar, &MIRBuilder.getMF(), Reg); // Set to Reg the same type as ResVReg has. auto MRI = MIRBuilder.getMRI(); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index ec2386fa..528baf5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -89,6 +89,9 @@ class SPIRVGlobalRegistry { // Intrinsic::spv_assign_ptr_type instructions. DenseMap<Value *, CallInst *> AssignPtrTypeInstr; + // Maps OpVariable and OpFunction-related v-regs to its LLVM IR definition. + DenseMap<std::pair<const MachineFunction *, Register>, const Value *> Reg2GO; + // Add a new OpTypeXXX instruction without checking for duplicates. 
SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier::AccessQualifier AQ = @@ -151,15 +154,17 @@ public: return DT.find(F, MF); } - void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph, - const SPIRVInstrInfo *TII, - MachineModuleInfo *MMI = nullptr) { - DT.buildDepsGraph(Graph, TII, MMI); - } - void setBound(unsigned V) { Bound = V; } unsigned getBound() { return Bound; } + void addGlobalObject(const Value *V, const MachineFunction *MF, Register R) { + Reg2GO[std::make_pair(MF, R)] = V; + } + const Value *getGlobalObject(const MachineFunction *MF, Register R) { + auto It = Reg2GO.find(std::make_pair(MF, R)); + return It == Reg2GO.end() ? nullptr : It->second; + } + // Add a record to the map of function return pointer types. void addReturnType(const Function *ArgF, TypedPointerType *DerivedTy) { FunResPointerTypes[ArgF] = DerivedTy; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index bd9e77e..9a140e7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -47,6 +47,19 @@ bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const { } } +bool SPIRVInstrInfo::isSpecConstantInstr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case SPIRV::OpSpecConstantTrue: + case SPIRV::OpSpecConstantFalse: + case SPIRV::OpSpecConstant: + case SPIRV::OpSpecConstantComposite: + case SPIRV::OpSpecConstantOp: + return true; + default: + return false; + } +} + bool SPIRVInstrInfo::isInlineAsmDefInstr(const MachineInstr &MI) const { switch (MI.getOpcode()) { case SPIRV::OpAsmTargetINTEL: diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h index 67d2d97..4e5059b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h @@ -30,6 +30,7 @@ public: const SPIRVRegisterInfo &getRegisterInfo() const { return RI; } bool isHeaderInstr(const MachineInstr &MI) const; bool isConstantInstr(const MachineInstr &MI) const; + bool isSpecConstantInstr(const MachineInstr &MI) const; bool isInlineAsmDefInstr(const MachineInstr &MI) const; bool isTypeDeclInstr(const MachineInstr &MI) const; bool isDecorationInstr(const MachineInstr &MI) const; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 4815685..28c9b81 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -33,7 +33,6 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/IntrinsicsSPIRV.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "spirv-isel" @@ -46,17 +45,6 @@ using ExtInstList = namespace { -llvm::SPIRV::SelectionControl::SelectionControl -getSelectionOperandForImm(int Imm) { - if (Imm == 2) - return SPIRV::SelectionControl::Flatten; - if (Imm == 1) - return SPIRV::SelectionControl::DontFlatten; - if (Imm == 0) - return SPIRV::SelectionControl::None; - llvm_unreachable("Invalid immediate"); -} - #define GET_GLOBALISEL_PREDICATE_BITSET #include "SPIRVGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET @@ -1117,6 +1105,7 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg, Constant::getNullValue(LLVMArrTy)); Register VarReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); GR.add(GV, GR.CurMF, VarReg); + GR.addGlobalObject(GV, GR.CurMF, VarReg); Result &= BuildMI(*I.getParent(), I, I.getDebugLoc(), 
TII.get(SPIRV::OpVariable)) @@ -2829,8 +2818,12 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, } return MIB.constrainAllUses(TII, TRI, RBI); } - case Intrinsic::spv_loop_merge: { - auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpLoopMerge)); + case Intrinsic::spv_loop_merge: + case Intrinsic::spv_selection_merge: { + const auto Opcode = IID == Intrinsic::spv_selection_merge + ? SPIRV::OpSelectionMerge + : SPIRV::OpLoopMerge; + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)); for (unsigned i = 1; i < I.getNumExplicitOperands(); ++i) { assert(I.getOperand(i).isMBB()); MIB.addMBB(I.getOperand(i).getMBB()); @@ -2838,15 +2831,6 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, MIB.addImm(SPIRV::SelectionControl::None); return MIB.constrainAllUses(TII, TRI, RBI); } - case Intrinsic::spv_selection_merge: { - auto MIB = - BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSelectionMerge)); - assert(I.getOperand(1).isMBB() && - "operand 1 to spv_selection_merge must be a basic block"); - MIB.addMBB(I.getOperand(1).getMBB()); - MIB.addImm(getSelectionOperandForImm(I.getOperand(2).getImm())); - return MIB.constrainAllUses(TII, TRI, RBI); - } case Intrinsic::spv_cmpxchg: return selectAtomicCmpXchg(ResVReg, ResType, I); case Intrinsic::spv_unreachable: @@ -3477,7 +3461,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( ID = UnnamedGlobalIDs.size(); GlobalIdent = "__unnamed_" + Twine(ID).str(); } else { - GlobalIdent = GV->getGlobalIdentifier(); + GlobalIdent = GV->getName(); } // Behaviour of functions as operands depends on availability of the @@ -3509,18 +3493,25 @@ bool SPIRVInstructionSelector::selectGlobalValue( // References to a function via function pointers generate virtual // registers without a definition. We will resolve it later, during // module analysis stage. + Register ResTypeReg = GR.getSPIRVTypeID(ResType); MachineRegisterInfo *MRI = MIRBuilder.getMRI(); - Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); - MRI->setRegClass(FuncVReg, &SPIRV::iIDRegClass); - MachineInstrBuilder MB = + Register FuncVReg = + MRI->createGenericVirtualRegister(GR.getRegType(ResType)); + MRI->setRegClass(FuncVReg, &SPIRV::pIDRegClass); + MachineInstrBuilder MIB1 = + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpUndef)) + .addDef(FuncVReg) + .addUse(ResTypeReg); + MachineInstrBuilder MIB2 = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantFunctionPointerINTEL)) .addDef(NewReg) - .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(ResTypeReg) .addUse(FuncVReg); // mapping the function pointer to the used Function - GR.recordFunctionPointer(&MB.getInstr()->getOperand(2), GVFun); - return MB.constrainAllUses(TII, TRI, RBI); + GR.recordFunctionPointer(&MIB2.getInstr()->getOperand(2), GVFun); + return MIB1.constrainAllUses(TII, TRI, RBI) && + MIB2.constrainAllUses(TII, TRI, RBI); } return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) .addDef(NewReg) @@ -3533,18 +3524,16 @@ bool SPIRVInstructionSelector::selectGlobalValue( auto GlobalVar = cast<GlobalVariable>(GV); assert(GlobalVar->getName() != "llvm.global.annotations"); - bool HasInit = GlobalVar->hasInitializer() && - !isa<UndefValue>(GlobalVar->getInitializer()); - // Skip empty declaration for GVs with initilaizers till we get the decl with + // Skip empty declaration for GVs with initializers till we get the decl with // passed initializer. 
- if (HasInit && !Init) + if (hasInitializer(GlobalVar) && !Init) return true; - bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage; + bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage(); SPIRV::LinkageType::LinkageType LnkType = - (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) + GV->isDeclarationForLinker() ? SPIRV::LinkageType::Import - : (GV->getLinkage() == GlobalValue::LinkOnceODRLinkage && + : (GV->hasLinkOnceODRLinkage() && STI.canUseExtension(SPIRV::Extension::SPV_KHR_linkonce_odr) ? SPIRV::LinkageType::LinkOnceODR : SPIRV::LinkageType::Export); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 6371c67..63adf54 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -216,102 +216,262 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) { } } -// Collect MI which defines the register in the given machine function. -static void collectDefInstr(Register Reg, const MachineFunction *MF, - SPIRV::ModuleAnalysisInfo *MAI, - SPIRV::ModuleSectionType MSType, - bool DoInsert = true) { - assert(MAI->hasRegisterAlias(MF, Reg) && "Cannot find register alias"); - MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(Reg); - assert(MI && "There should be an instruction that defines the register"); - MAI->setSkipEmission(MI); - if (DoInsert) - MAI->MS[MSType].push_back(MI); +// Returns a representation of an instruction as a vector of MachineOperand +// hash values, see llvm::hash_value(const MachineOperand &MO) for details. +// This creates a signature of the instruction with the same content +// that MachineOperand::isIdenticalTo uses for comparison. +static InstrSignature instrToSignature(const MachineInstr &MI, + SPIRV::ModuleAnalysisInfo &MAI, + bool UseDefReg) { + InstrSignature Signature{MI.getOpcode()}; + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + const MachineOperand &MO = MI.getOperand(i); + size_t h; + if (MO.isReg()) { + if (!UseDefReg && MO.isDef()) + continue; + Register RegAlias = MAI.getRegisterAlias(MI.getMF(), MO.getReg()); + if (!RegAlias.isValid()) { + LLVM_DEBUG({ + dbgs() << "Unexpectedly, no global id found for the operand "; + MO.print(dbgs()); + dbgs() << "\nInstruction: "; + MI.print(dbgs()); + dbgs() << "\n"; + }); + report_fatal_error("All v-regs must have been mapped to global id's"); + } + // mimic llvm::hash_value(const MachineOperand &MO) + h = hash_combine(MO.getType(), (unsigned)RegAlias, MO.getSubReg(), + MO.isDef()); + } else { + h = hash_value(MO); + } + Signature.push_back(h); + } + return Signature; } -void SPIRVModuleAnalysis::collectGlobalEntities( - const std::vector<SPIRV::DTSortableEntry *> &DepsGraph, - SPIRV::ModuleSectionType MSType, - std::function<bool(const SPIRV::DTSortableEntry *)> Pred, - bool UsePreOrder = false) { - DenseSet<const SPIRV::DTSortableEntry *> Visited; - for (const auto *E : DepsGraph) { - std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil; - // NOTE: here we prefer recursive approach over iterative because - // we don't expect depchains long enough to cause SO. - RecHoistUtil = [MSType, UsePreOrder, &Visited, &Pred, - &RecHoistUtil](const SPIRV::DTSortableEntry *E) { - if (Visited.count(E) || !Pred(E)) - return; - Visited.insert(E); - - // Traversing deps graph in post-order allows us to get rid of - // register aliases preprocessing. 
- // But pre-order is required for correct processing of function - // declaration and arguments processing. - if (!UsePreOrder) - for (auto *S : E->getDeps()) - RecHoistUtil(S); - - Register GlobalReg = Register::index2VirtReg(MAI.getNextID()); - bool IsFirst = true; - for (auto &U : *E) { - const MachineFunction *MF = U.first; - Register Reg = U.second; - MAI.setRegisterAlias(MF, Reg, GlobalReg); - if (!MF->getRegInfo().getUniqueVRegDef(Reg)) - continue; - collectDefInstr(Reg, MF, &MAI, MSType, IsFirst); - IsFirst = false; - if (E->getIsGV()) - MAI.GlobalVarList.push_back(MF->getRegInfo().getUniqueVRegDef(Reg)); - } +bool SPIRVModuleAnalysis::isDeclSection(const MachineRegisterInfo &MRI, + const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case SPIRV::OpTypeForwardPointer: + // omit now, collect later + return false; + case SPIRV::OpVariable: + return static_cast<SPIRV::StorageClass::StorageClass>( + MI.getOperand(2).getImm()) != SPIRV::StorageClass::Function; + case SPIRV::OpFunction: + case SPIRV::OpFunctionParameter: + return true; + } + if (GR->hasConstFunPtr() && Opcode == SPIRV::OpUndef) { + Register DefReg = MI.getOperand(0).getReg(); + for (MachineInstr &UseMI : MRI.use_instructions(DefReg)) { + if (UseMI.getOpcode() != SPIRV::OpConstantFunctionPointerINTEL) + continue; + // it's a dummy definition, FP constant refers to a function, + // and this is resolved in another way; let's skip this definition + assert(UseMI.getOperand(2).isReg() && + UseMI.getOperand(2).getReg() == DefReg); + MAI.setSkipEmission(&MI); + return false; + } + } + return TII->isTypeDeclInstr(MI) || TII->isConstantInstr(MI) || + TII->isInlineAsmDefInstr(MI); +} - if (UsePreOrder) - for (auto *S : E->getDeps()) - RecHoistUtil(S); - }; - RecHoistUtil(E); +// This is a special case of a function pointer refering to a possibly +// forward function declaration. The operand is a dummy OpUndef that +// requires a special treatment. +void SPIRVModuleAnalysis::visitFunPtrUse( + Register OpReg, InstrGRegsMap &SignatureToGReg, + std::map<const Value *, unsigned> &GlobalToGReg, const MachineFunction *MF, + const MachineInstr &MI) { + const MachineOperand *OpFunDef = + GR->getFunctionDefinitionByUse(&MI.getOperand(2)); + assert(OpFunDef && OpFunDef->isReg()); + // find the actual function definition and number it globally in advance + const MachineInstr *OpDefMI = OpFunDef->getParent(); + assert(OpDefMI && OpDefMI->getOpcode() == SPIRV::OpFunction); + const MachineFunction *FunDefMF = OpDefMI->getParent()->getParent(); + const MachineRegisterInfo &FunDefMRI = FunDefMF->getRegInfo(); + do { + visitDecl(FunDefMRI, SignatureToGReg, GlobalToGReg, FunDefMF, *OpDefMI); + OpDefMI = OpDefMI->getNextNode(); + } while (OpDefMI && (OpDefMI->getOpcode() == SPIRV::OpFunction || + OpDefMI->getOpcode() == SPIRV::OpFunctionParameter)); + // associate the function pointer with the newly assigned global number + Register GlobalFunDefReg = MAI.getRegisterAlias(FunDefMF, OpFunDef->getReg()); + assert(GlobalFunDefReg.isValid() && + "Function definition must refer to a global register"); + MAI.setRegisterAlias(MF, OpReg, GlobalFunDefReg); +} + +// Depth first recursive traversal of dependencies. Repeated visits are guarded +// by MAI.hasRegisterAlias(). 
+void SPIRVModuleAnalysis::visitDecl( + const MachineRegisterInfo &MRI, InstrGRegsMap &SignatureToGReg, + std::map<const Value *, unsigned> &GlobalToGReg, const MachineFunction *MF, + const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + DenseSet<Register> Deps; + + // Process each operand of the instruction to resolve dependencies + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isDef()) + continue; + Register OpReg = MO.getReg(); + // Handle function pointers special case + if (Opcode == SPIRV::OpConstantFunctionPointerINTEL && + MRI.getRegClass(OpReg) == &SPIRV::pIDRegClass) { + visitFunPtrUse(OpReg, SignatureToGReg, GlobalToGReg, MF, MI); + continue; + } + // Skip already processed instructions + if (MAI.hasRegisterAlias(MF, MO.getReg())) + continue; + // Recursively visit dependencies + if (const MachineInstr *OpDefMI = MRI.getUniqueVRegDef(OpReg)) { + if (isDeclSection(MRI, *OpDefMI)) + visitDecl(MRI, SignatureToGReg, GlobalToGReg, MF, *OpDefMI); + continue; + } + // Handle the unexpected case of no unique definition for the SPIR-V + // instruction + LLVM_DEBUG({ + dbgs() << "Unexpectedly, no unique definition for the operand "; + MO.print(dbgs()); + dbgs() << "\nInstruction: "; + MI.print(dbgs()); + dbgs() << "\n"; + }); + report_fatal_error( + "No unique definition is found for the virtual register"); } + + Register GReg; + bool IsFunDef = false; + if (TII->isSpecConstantInstr(MI)) { + GReg = Register::index2VirtReg(MAI.getNextID()); + MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI); + } else if (Opcode == SPIRV::OpFunction || + Opcode == SPIRV::OpFunctionParameter) { + GReg = handleFunctionOrParameter(MF, MI, GlobalToGReg, IsFunDef); + } else if (TII->isTypeDeclInstr(MI) || TII->isConstantInstr(MI) || + TII->isInlineAsmDefInstr(MI)) { + GReg = handleTypeDeclOrConstant(MI, SignatureToGReg); + } else if (Opcode == SPIRV::OpVariable) { + GReg = handleVariable(MF, MI, GlobalToGReg); + } else { + LLVM_DEBUG({ + dbgs() << "\nInstruction: "; + MI.print(dbgs()); + dbgs() << "\n"; + }); + llvm_unreachable("Unexpected instruction is visited"); + } + MAI.setRegisterAlias(MF, MI.getOperand(0).getReg(), GReg); + if (!IsFunDef) + MAI.setSkipEmission(&MI); } -// The function initializes global register alias table for types, consts, -// global vars and func decls and collects these instruction for output -// at module level. Also it collects explicit OpExtension/OpCapability -// instructions. -void SPIRVModuleAnalysis::processDefInstrs(const Module &M) { - std::vector<SPIRV::DTSortableEntry *> DepsGraph; +Register SPIRVModuleAnalysis::handleFunctionOrParameter( + const MachineFunction *MF, const MachineInstr &MI, + std::map<const Value *, unsigned> &GlobalToGReg, bool &IsFunDef) { + const Value *GObj = GR->getGlobalObject(MF, MI.getOperand(0).getReg()); + assert(GObj && "Unregistered global definition"); + const Function *F = dyn_cast<Function>(GObj); + if (!F) + F = dyn_cast<Argument>(GObj)->getParent(); + assert(F && "Expected a reference to a function or an argument"); + IsFunDef = !F->isDeclaration(); + auto It = GlobalToGReg.find(GObj); + if (It != GlobalToGReg.end()) + return It->second; + Register GReg = Register::index2VirtReg(MAI.getNextID()); + GlobalToGReg[GObj] = GReg; + if (!IsFunDef) + MAI.MS[SPIRV::MB_ExtFuncDecls].push_back(&MI); + return GReg; +} - GR->buildDepsGraph(DepsGraph, TII, SPVDumpDeps ? 
MMI : nullptr); +Register +SPIRVModuleAnalysis::handleTypeDeclOrConstant(const MachineInstr &MI, + InstrGRegsMap &SignatureToGReg) { + InstrSignature MISign = instrToSignature(MI, MAI, false); + auto It = SignatureToGReg.find(MISign); + if (It != SignatureToGReg.end()) + return It->second; + Register GReg = Register::index2VirtReg(MAI.getNextID()); + SignatureToGReg[MISign] = GReg; + MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI); + return GReg; +} - collectGlobalEntities( - DepsGraph, SPIRV::MB_TypeConstVars, - [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); }); +Register SPIRVModuleAnalysis::handleVariable( + const MachineFunction *MF, const MachineInstr &MI, + std::map<const Value *, unsigned> &GlobalToGReg) { + MAI.GlobalVarList.push_back(&MI); + const Value *GObj = GR->getGlobalObject(MF, MI.getOperand(0).getReg()); + assert(GObj && "Unregistered global definition"); + auto It = GlobalToGReg.find(GObj); + if (It != GlobalToGReg.end()) + return It->second; + Register GReg = Register::index2VirtReg(MAI.getNextID()); + GlobalToGReg[GObj] = GReg; + MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI); + return GReg; +} +void SPIRVModuleAnalysis::collectDeclarations(const Module &M) { + InstrGRegsMap SignatureToGReg; + std::map<const Value *, unsigned> GlobalToGReg; for (auto F = M.begin(), E = M.end(); F != E; ++F) { MachineFunction *MF = MMI->getMachineFunction(*F); if (!MF) continue; - // Iterate through and collect OpExtension/OpCapability instructions. + const MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned PastHeader = 0; for (MachineBasicBlock &MBB : *MF) { for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == SPIRV::OpExtension) { - // Here, OpExtension just has a single enum operand, not a string. - auto Ext = SPIRV::Extension::Extension(MI.getOperand(0).getImm()); - MAI.Reqs.addExtension(Ext); + if (MI.getNumOperands() == 0) + continue; + unsigned Opcode = MI.getOpcode(); + if (Opcode == SPIRV::OpFunction) { + if (PastHeader == 0) { + PastHeader = 1; + continue; + } + } else if (Opcode == SPIRV::OpFunctionParameter) { + if (PastHeader < 2) + continue; + } else if (PastHeader > 0) { + PastHeader = 2; + } + + const MachineOperand &DefMO = MI.getOperand(0); + switch (Opcode) { + case SPIRV::OpExtension: + MAI.Reqs.addExtension(SPIRV::Extension::Extension(DefMO.getImm())); MAI.setSkipEmission(&MI); - } else if (MI.getOpcode() == SPIRV::OpCapability) { - auto Cap = SPIRV::Capability::Capability(MI.getOperand(0).getImm()); - MAI.Reqs.addCapability(Cap); + break; + case SPIRV::OpCapability: + MAI.Reqs.addCapability(SPIRV::Capability::Capability(DefMO.getImm())); MAI.setSkipEmission(&MI); + if (PastHeader > 0) + PastHeader = 2; + break; + default: + if (DefMO.isReg() && isDeclSection(MRI, MI) && + !MAI.hasRegisterAlias(MF, DefMO.getReg())) + visitDecl(MRI, SignatureToGReg, GlobalToGReg, MF, MI); } } } } - - collectGlobalEntities( - DepsGraph, SPIRV::MB_ExtFuncDecls, - [](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true); } // Look for IDs declared with Import linkage, and map the corresponding function @@ -342,58 +502,6 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI, } } -// References to a function via function pointers generate virtual -// registers without a definition. We are able to resolve this -// reference using Globar Register info into an OpFunction instruction -// and replace dummy operands by the corresponding global register references. 
-void SPIRVModuleAnalysis::collectFuncPtrs() { - for (auto &MI : MAI.MS[SPIRV::MB_TypeConstVars]) - if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL) - collectFuncPtrs(MI); -} - -void SPIRVModuleAnalysis::collectFuncPtrs(MachineInstr *MI) { - const MachineOperand *FunUse = &MI->getOperand(2); - if (const MachineOperand *FunDef = GR->getFunctionDefinitionByUse(FunUse)) { - const MachineInstr *FunDefMI = FunDef->getParent(); - assert(FunDefMI->getOpcode() == SPIRV::OpFunction && - "Constant function pointer must refer to function definition"); - Register FunDefReg = FunDef->getReg(); - Register GlobalFunDefReg = - MAI.getRegisterAlias(FunDefMI->getMF(), FunDefReg); - assert(GlobalFunDefReg.isValid() && - "Function definition must refer to a global register"); - Register FunPtrReg = FunUse->getReg(); - MAI.setRegisterAlias(MI->getMF(), FunPtrReg, GlobalFunDefReg); - } -} - -using InstrSignature = SmallVector<size_t>; -using InstrTraces = std::set<InstrSignature>; - -// Returns a representation of an instruction as a vector of MachineOperand -// hash values, see llvm::hash_value(const MachineOperand &MO) for details. -// This creates a signature of the instruction with the same content -// that MachineOperand::isIdenticalTo uses for comparison. -static InstrSignature instrToSignature(MachineInstr &MI, - SPIRV::ModuleAnalysisInfo &MAI) { - InstrSignature Signature; - for (unsigned i = 0; i < MI.getNumOperands(); ++i) { - const MachineOperand &MO = MI.getOperand(i); - size_t h; - if (MO.isReg()) { - Register RegAlias = MAI.getRegisterAlias(MI.getMF(), MO.getReg()); - // mimic llvm::hash_value(const MachineOperand &MO) - h = hash_combine(MO.getType(), (unsigned)RegAlias, MO.getSubReg(), - MO.isDef()); - } else { - h = hash_value(MO); - } - Signature.push_back(h); - } - return Signature; -} - // Collect the given instruction in the specified MS. We assume global register // numbering has already occurred by this point. We can directly compare reg // arguments when detecting duplicates. @@ -401,7 +509,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, SPIRV::ModuleSectionType MSType, InstrTraces &IS, bool Append = true) { MAI.setSkipEmission(&MI); - InstrSignature MISign = instrToSignature(MI, MAI); + InstrSignature MISign = instrToSignature(MI, MAI, true); auto FoundMI = IS.insert(MISign); if (!FoundMI.second) return; // insert failed, so we found a duplicate; don't add it to MAI.MS @@ -465,7 +573,7 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { // Number registers in all functions globally from 0 onwards and store // the result in global register alias table. Some registers are already -// numbered in collectGlobalEntities. +// numbered. void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) { for (auto F = M.begin(), E = M.end(); F != E; ++F) { if ((*F).isDeclaration()) @@ -1835,15 +1943,11 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) { // Process type/const/global var/func decl instructions, number their // destination registers from 0 to N, collect Extensions and Capabilities. - processDefInstrs(M); + collectDeclarations(M); // Number rest of registers from N+1 onwards. numberRegistersGlobally(M); - // Update references to OpFunction instructions to use Global Registers - if (GR->hasConstFunPtr()) - collectFuncPtrs(); - // Collect OpName, OpEntryPoint, OpDecorate etc, process other instructions. 
processOtherInstrs(M); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index ee2aaf1..79b5444 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -124,7 +124,7 @@ public: const Capability::Capability IfPresent); }; -using InstrList = SmallVector<MachineInstr *>; +using InstrList = SmallVector<const MachineInstr *>; // Maps a local register to the corresponding global alias. using LocalToGlobalRegTable = std::map<Register, Register>; using RegisterAliasMapTy = @@ -142,12 +142,12 @@ struct ModuleAnalysisInfo { // Maps ExtInstSet to corresponding ID register. DenseMap<unsigned, Register> ExtInstSetMap; // Contains the list of all global OpVariables in the module. - SmallVector<MachineInstr *, 4> GlobalVarList; + SmallVector<const MachineInstr *, 4> GlobalVarList; // Maps functions to corresponding function ID registers. DenseMap<const Function *, Register> FuncMap; // The set contains machine instructions which are necessary // for correct MIR but will not be emitted in function bodies. - DenseSet<MachineInstr *> InstrsToDelete; + DenseSet<const MachineInstr *> InstrsToDelete; // The table contains global aliases of local registers for each machine // function. The aliases are used to substitute local registers during // code emission. @@ -167,7 +167,7 @@ struct ModuleAnalysisInfo { } Register getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; } InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; } - void setSkipEmission(MachineInstr *MI) { InstrsToDelete.insert(MI); } + void setSkipEmission(const MachineInstr *MI) { InstrsToDelete.insert(MI); } bool getSkipEmission(const MachineInstr *MI) { return InstrsToDelete.contains(MI); } @@ -204,6 +204,10 @@ struct ModuleAnalysisInfo { }; } // namespace SPIRV +using InstrSignature = SmallVector<size_t>; +using InstrTraces = std::set<InstrSignature>; +using InstrGRegsMap = std::map<SmallVector<size_t>, unsigned>; + struct SPIRVModuleAnalysis : public ModulePass { static char ID; @@ -216,17 +220,27 @@ public: private: void setBaseInfo(const Module &M); - void collectGlobalEntities( - const std::vector<SPIRV::DTSortableEntry *> &DepsGraph, - SPIRV::ModuleSectionType MSType, - std::function<bool(const SPIRV::DTSortableEntry *)> Pred, - bool UsePreOrder); - void processDefInstrs(const Module &M); void collectFuncNames(MachineInstr &MI, const Function *F); void processOtherInstrs(const Module &M); void numberRegistersGlobally(const Module &M); - void collectFuncPtrs(); - void collectFuncPtrs(MachineInstr *MI); + + // analyze dependencies to collect module scope definitions + void collectDeclarations(const Module &M); + void visitDecl(const MachineRegisterInfo &MRI, InstrGRegsMap &SignatureToGReg, + std::map<const Value *, unsigned> &GlobalToGReg, + const MachineFunction *MF, const MachineInstr &MI); + Register handleVariable(const MachineFunction *MF, const MachineInstr &MI, + std::map<const Value *, unsigned> &GlobalToGReg); + Register handleTypeDeclOrConstant(const MachineInstr &MI, + InstrGRegsMap &SignatureToGReg); + Register + handleFunctionOrParameter(const MachineFunction *MF, const MachineInstr &MI, + std::map<const Value *, unsigned> &GlobalToGReg, + bool &IsFunDef); + void visitFunPtrUse(Register OpReg, InstrGRegsMap &SignatureToGReg, + std::map<const Value *, unsigned> &GlobalToGReg, + const MachineFunction *MF, const MachineInstr &MI); + bool isDeclSection(const MachineRegisterInfo &MRI, const MachineInstr 
&MI); const SPIRVSubtarget *ST; SPIRVGlobalRegistry *GR; diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 8357c30..5b4c849 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -58,9 +58,10 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR, ->getValue()); if (auto *GV = dyn_cast<GlobalValue>(Const)) { Register Reg = GR->find(GV, &MF); - if (!Reg.isValid()) + if (!Reg.isValid()) { GR->add(GV, &MF, SrcReg); - else + GR->addGlobalObject(GV, &MF, SrcReg); + } else RegsAlreadyAddedToDT[&MI] = Reg; } else { Register Reg = GR->find(Const, &MF); diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 2e4343c..336cde4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -18,16 +18,14 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsSPIRV.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/InitializePasses.h" -#include "llvm/PassRegistry.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" @@ -648,7 +646,8 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(BB.getParent(), &BB); - createOpSelectMerge(&Builder, MergeAddress); + SmallVector<Value *, 1> Args = {MergeAddress}; + Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); Modified = true; } @@ -770,9 +769,10 @@ class SPIRVStructurizer : public FunctionPass { BasicBlock *Merge = Candidates[0]; auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge); + SmallVector<Value *, 1> Args = {MergeAddress}; IRBuilder<> Builder(&BB); Builder.SetInsertPoint(BB.getTerminator()); - createOpSelectMerge(&Builder, MergeAddress); + Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); } return Modified; @@ -1105,7 +1105,8 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge); - createOpSelectMerge(&Builder, MergeAddress); + SmallVector<Value *, 1> Args = {MergeAddress}; + Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); continue; } @@ -1119,7 +1120,8 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(NewMerge->getParent(), NewMerge); - createOpSelectMerge(&Builder, MergeAddress); + SmallVector<Value *, 1> Args = {MergeAddress}; + Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); } return Modified; @@ -1206,27 +1208,6 @@ public: AU.addPreserved<SPIRVConvergenceRegionAnalysisWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } - - void createOpSelectMerge(IRBuilder<> *Builder, BlockAddress *MergeAddress) { - Instruction *BBTerminatorInst = Builder->GetInsertBlock()->getTerminator(); - - MDNode *MDNode = BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); - - ConstantInt *BranchHint = llvm::ConstantInt::get(Builder->getInt32Ty(), 0); - - if (MDNode) { - 
assert(MDNode->getNumOperands() == 2 && - "invalid metadata hlsl.controlflow.hint"); - BranchHint = mdconst::extract<ConstantInt>(MDNode->getOperand(1)); - - assert(BranchHint && "invalid metadata value for hlsl.controlflow.hint"); - } - - llvm::SmallVector<llvm::Value *, 2> Args = {MergeAddress, BranchHint}; - - Builder->CreateIntrinsic(Intrinsic::spv_selection_merge, - {MergeAddress->getType()}, {Args}); - } }; } // namespace llvm @@ -1248,11 +1229,8 @@ FunctionPass *llvm::createSPIRVStructurizerPass() { PreservedAnalyses SPIRVStructurizerWrapper::run(Function &F, FunctionAnalysisManager &AF) { - - auto FPM = legacy::FunctionPassManager(F.getParent()); - FPM.add(createSPIRVStructurizerPass()); - - if (!FPM.run(F)) + FunctionPass *StructurizerPass = createSPIRVStructurizerPass(); + if (!StructurizerPass->runOnFunction(F)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index da2e24c..60649ea 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -17,6 +17,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/TypedPointerType.h" #include <queue> @@ -236,6 +237,10 @@ Type *parseBasicTypeName(StringRef &TypeName, LLVMContext &Ctx); // Returns true if the function was changed. bool sortBlocks(Function &F); +inline bool hasInitializer(const GlobalVariable *GV) { + return GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer()); +} + // True if this is an instance of TypedPointerType. inline bool isTypedPointerTy(const Type *T) { return T && T->getTypeID() == Type::TypedPointerTyID; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 68bdeb1..6b0eb38 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -94,7 +94,7 @@ static cl::opt<int> BrMergingCcmpBias( static cl::opt<bool> WidenShift("x86-widen-shift", cl::init(true), - cl::desc("Replacte narrow shifts with wider shifts."), + cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden); static cl::opt<int> BrMergingLikelyBias( @@ -41694,6 +41694,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MVT VT = N.getSimpleValueType(); + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 4> Mask; unsigned Opcode = N.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -41979,7 +41981,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, APInt Mask = APInt::getHighBitsSet(64, 32); if (DAG.MaskedValueIsZero(In, Mask)) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In); - MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); + MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc); SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec); return DAG.getBitcast(VT, Movl); @@ -41994,7 +41996,6 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, // Create a vector constant - scalar constant followed by zeros. 
EVT ScalarVT = N0.getOperand(0).getValueType(); Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext()); - unsigned NumElts = VT.getVectorNumElements(); Constant *Zero = ConstantInt::getNullValue(ScalarTy); SmallVector<Constant *, 32> ConstantVec(NumElts, Zero); ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue()); @@ -42045,9 +42046,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, MVT SrcVT = N0.getOperand(0).getSimpleValueType(); unsigned SrcBits = SrcVT.getScalarSizeInBits(); if ((EltBits % SrcBits) == 0 && SrcBits >= 32) { - unsigned Size = VT.getVectorNumElements(); unsigned NewSize = SrcVT.getVectorNumElements(); - APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size); + APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(NumElts); APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), @@ -42460,7 +42460,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; - MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2); V = DAG.getBitcast(DVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 7a7554c..c19bcfc 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1650,6 +1650,13 @@ InstructionCost X86TTIImpl::getShuffleCost( return MatchingTypes ? TTI::TCC_Free : SubLT.first; } + // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have + // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of + // v1f32 (legalised to f32) into a v4f32. + if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 && + SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41())) + return 1; + // If the insertion isn't aligned, treat it like a 2-op shuffle. Kind = TTI::SK_PermuteTwoSrc; } @@ -4797,9 +4804,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, MVT MScalarTy = LT.second.getScalarType(); auto IsCheapPInsrPExtrInsertPS = [&]() { // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. + // Inserting f32 into index0 is just movss. // Also, assume insertps is relatively cheap on all >= SSE41 targets. 
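      // For example (illustrative IR, not from an existing test):
      //   %r = insertelement <4 x float> %v, float %s, i64 0
      // lowers to a single movss, so with SSE1 and Index == 0 the insert is
      // treated as cheap even when SSE4.1 (insertps) is unavailable.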
return (MScalarTy == MVT::i16 && ST->hasSSE2()) || (MScalarTy.isInteger() && ST->hasSSE41()) || + (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 && + Opcode == Instruction::InsertElement) || (MScalarTy == MVT::f32 && ST->hasSSE41() && Opcode == Instruction::InsertElement); }; diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 50c9a56..7d0b8c3 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -48,17 +48,12 @@ std::optional<AArch64::ArchInfo> AArch64::ArchInfo::findBySubArch(StringRef SubA return {}; } -unsigned AArch64::getFMVPriority(ArrayRef<StringRef> Features) { - constexpr unsigned MaxFMVPriority = 1000; - unsigned Priority = 0; - unsigned NumFeatures = 0; - for (StringRef Feature : Features) { - if (auto Ext = parseFMVExtension(Feature)) { - Priority = std::max(Priority, Ext->Priority); - NumFeatures++; - } - } - return Priority + MaxFMVPriority * NumFeatures; +uint64_t AArch64::getFMVPriority(ArrayRef<StringRef> Features) { + uint64_t Priority = 0; + for (StringRef Feature : Features) + if (std::optional<FMVInfo> Info = parseFMVExtension(Feature)) + Priority |= (1ULL << Info->PriorityBit); + return Priority; } uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) { @@ -73,7 +68,7 @@ uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) { uint64_t FeaturesMask = 0; for (const FMVInfo &Info : getFMVInfo()) if (Info.ID && FeatureBits.Enabled.test(*Info.ID)) - FeaturesMask |= (1ULL << Info.Bit); + FeaturesMask |= (1ULL << Info.FeatureBit); return FeaturesMask; } diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index 4f403e9..d6e1eac 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -742,8 +742,8 @@ Error RISCVISAInfo::checkDependency() { bool HasZvl = MinVLen != 0; bool HasZcmt = Exts.count("zcmt") != 0; static constexpr StringLiteral XqciExts[] = { - {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcics"}, - {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; + {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, + {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; if (HasI && HasE) return getIncompatibleError("i", "e"); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 240d089..7b59c39 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -69,7 +69,6 @@ static const char *const CoroIntrinsics[] = { "llvm.coro.async.context.dealloc", "llvm.coro.async.resume", "llvm.coro.async.size.replace", - "llvm.coro.async.store_resume", "llvm.coro.await.suspend.bool", "llvm.coro.await.suspend.handle", "llvm.coro.await.suspend.void", diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 96956481..449d64d 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -66,19 +66,19 @@ static cl::opt<unsigned> MaxCodeSizeGrowth( "Maximum codesize growth allowed per function")); static cl::opt<unsigned> MinCodeSizeSavings( - "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc( - "Reject specializations whose codesize savings are less than this" - "much percent of the original function size")); + "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, + cl::desc("Reject specializations whose codesize savings are 
less than this " + "much percent of the original function size")); static cl::opt<unsigned> MinLatencySavings( "funcspec-min-latency-savings", cl::init(40), cl::Hidden, - cl::desc("Reject specializations whose latency savings are less than this" + cl::desc("Reject specializations whose latency savings are less than this " "much percent of the original function size")); static cl::opt<unsigned> MinInliningBonus( - "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc( - "Reject specializations whose inlining bonus is less than this" - "much percent of the original function size")); + "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, + cl::desc("Reject specializations whose inlining bonus is less than this " + "much percent of the original function size")); static cl::opt<bool> SpecializeOnAddress( "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc( diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 16a80e9..78cd249 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -105,7 +105,7 @@ static cl::opt<int> ColdCCRelFreq( "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::desc( "Maximum block frequency, expressed as a percentage of caller's " - "entry frequency, for a call site to be considered cold for enabling" + "entry frequency, for a call site to be considered cold for enabling " "coldcc")); /// Is this global variable possibly used by a leak checker as a root? If so, diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 1bf7ff4..016db55 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -122,6 +122,20 @@ static cl::opt<unsigned> cl::desc("Max depth to recursively search for missing " "frames through tail calls.")); +// By default enable cloning of callsites involved with recursive cycles +static cl::opt<bool> AllowRecursiveCallsites( + "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, + cl::desc("Allow cloning of callsites involved in recursive cycles")); + +// When disabled, try to detect and prevent cloning of recursive contexts. +// This is only necessary until we support cloning through recursive cycles. +// Leave on by default for now, as disabling requires a little bit of compile +// time overhead and doesn't affect correctness, it will just inflate the cold +// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled. +static cl::opt<bool> AllowRecursiveContexts( + "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, + cl::desc("Allow cloning of contexts through recursive cycles")); + namespace llvm { cl::opt<bool> EnableMemProfContextDisambiguation( "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, @@ -1236,9 +1250,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( StackEntryIdToContextNodeMap[StackId] = StackNode; StackNode->OrigStackOrAllocId = StackId; } - auto Ins = StackIdSet.insert(StackId); - if (!Ins.second) - StackNode->Recursive = true; + // Marking a node recursive will prevent its cloning completely, even for + // non-recursive contexts flowing through it. 
+ if (!AllowRecursiveCallsites) { + auto Ins = StackIdSet.insert(StackId); + if (!Ins.second) + StackNode->Recursive = true; + } StackNode->AllocTypes |= (uint8_t)AllocType; PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); PrevNode = StackNode; @@ -1375,8 +1393,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, set_union(CallerEdgeContextIds, Edge->ContextIds); } // Node can have more context ids than callers if some contexts terminate at - // node and some are longer. - assert(NodeContextIds == CallerEdgeContextIds || + // node and some are longer. If we are allowing recursive callsites but + // haven't disabled recursive contexts, this will be violated for + // incompletely cloned recursive cycles, so skip the checking in that case. + assert((AllowRecursiveCallsites && AllowRecursiveContexts) || + NodeContextIds == CallerEdgeContextIds || set_is_subset(CallerEdgeContextIds, NodeContextIds)); } if (Node->CalleeEdges.size()) { @@ -3370,6 +3391,21 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( assert(Node->AllocTypes != (uint8_t)AllocationType::None); + DenseSet<uint32_t> RecursiveContextIds; + // If we are allowing recursive callsites, but have also disabled recursive + // contexts, look for context ids that show up in multiple caller edges. + if (AllowRecursiveCallsites && !AllowRecursiveContexts) { + DenseSet<uint32_t> AllCallerContextIds; + for (auto &CE : Node->CallerEdges) { + // Resize to the largest set of caller context ids, since we know the + // final set will be at least that large. + AllCallerContextIds.reserve(CE->getContextIds().size()); + for (auto Id : CE->getContextIds()) + if (!AllCallerContextIds.insert(Id).second) + RecursiveContextIds.insert(Id); + } + } + // Iterate until we find no more opportunities for disambiguating the alloc // types via cloning. In most cases this loop will terminate once the Node // has a single allocation type, in which case no more cloning is needed. @@ -3394,6 +3430,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( // allocation. auto CallerEdgeContextsForAlloc = set_intersection(CallerEdge->getContextIds(), AllocContextIds); + if (!RecursiveContextIds.empty()) + CallerEdgeContextsForAlloc = + set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds); if (CallerEdgeContextsForAlloc.empty()) { ++EI; continue; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index b40ab35..67585e9 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -129,7 +129,7 @@ static cl::opt<bool> PrintModuleBeforeOptimizations( static cl::opt<bool> AlwaysInlineDeviceFunctions( "openmp-opt-inline-device", - cl::desc("Inline all applicible functions on the device."), cl::Hidden, + cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false)); static cl::opt<bool> diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 603beb3..b978c54 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -162,7 +162,7 @@ static cl::opt<bool> ProfileSampleBlockAccurate( static cl::opt<bool> ProfileAccurateForSymsInList( "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " - "be accurate. It may be overriden by profile-sample-accurate. ")); + "be accurate. 
It may be overridden by profile-sample-accurate. ")); static cl::opt<bool> ProfileMergeInlinee( "sample-profile-merge-inlinee", cl::Hidden, cl::init(true), @@ -193,9 +193,10 @@ static cl::opt<bool> ProfileSizeInline( // and inline the hot functions (that are skipped in this pass). static cl::opt<bool> DisableSampleLoaderInlining( "disable-sample-loader-inlining", cl::Hidden, cl::init(false), - cl::desc("If true, artifically skip inline transformation in sample-loader " - "pass, and merge (or scale) profiles (as configured by " - "--sample-profile-merge-inlinee).")); + cl::desc( + "If true, artificially skip inline transformation in sample-loader " + "pass, and merge (or scale) profiles (as configured by " + "--sample-profile-merge-inlinee).")); namespace llvm { cl::opt<bool> @@ -255,7 +256,7 @@ static cl::opt<unsigned> PrecentMismatchForStalenessError( static cl::opt<bool> CallsitePrioritizedInline( "sample-profile-prioritized-inline", cl::Hidden, - cl::desc("Use call site prioritized inlining for sample profile loader." + cl::desc("Use call site prioritized inlining for sample profile loader. " "Currently only CSSPGO is supported.")); static cl::opt<bool> UsePreInlinerDecision( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index e9bb2b8..f82a557 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -39,8 +39,7 @@ static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS, /// This is the complement of getFCmpCode, which turns an opcode and two /// operands into either a FCmp instruction, or a true/false constant. static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS, - InstCombiner::BuilderTy &Builder, - FastMathFlags FMF) { + InstCombiner::BuilderTy &Builder, FMFSource FMF) { FCmpInst::Predicate NewPred; if (Constant *TorF = getPredForFCmpCode(Code, LHS->getType(), NewPred)) return TorF; @@ -514,7 +513,8 @@ static Value *foldLogOpOfMaskedICmpsAsymmetric( /// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, bool IsLogical, - InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder, + const SimplifyQuery &Q) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); std::optional<std::pair<unsigned, unsigned>> MaskPair = @@ -587,93 +587,107 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, return Builder.CreateICmp(NewCC, NewAnd2, A); } - // Remaining cases assume at least that B and D are constant, and depend on - // their actual values. This isn't strictly necessary, just a "handle the - // easy cases for now" decision. const APInt *ConstB, *ConstD; - if (!match(B, m_APInt(ConstB)) || !match(D, m_APInt(ConstD))) - return nullptr; - - if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) { - // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and - // (icmp ne (A & B), B) & (icmp ne (A & D), D) - // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) - // Only valid if one of the masks is a superset of the other (check "B&D" is - // the same as either B or D). 
- APInt NewMask = *ConstB & *ConstD; - if (NewMask == *ConstB) - return LHS; - else if (NewMask == *ConstD) - return RHS; - } - - if (Mask & AMask_NotAllOnes) { - // (icmp ne (A & B), B) & (icmp ne (A & D), D) - // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) - // Only valid if one of the masks is a superset of the other (check "B|D" is - // the same as either B or D). - APInt NewMask = *ConstB | *ConstD; - if (NewMask == *ConstB) - return LHS; - else if (NewMask == *ConstD) - return RHS; - } - - if (Mask & (BMask_Mixed | BMask_NotMixed)) { - // Mixed: - // (icmp eq (A & B), C) & (icmp eq (A & D), E) - // We already know that B & C == C && D & E == E. - // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of - // C and E, which are shared by both the mask B and the mask D, don't - // contradict, then we can transform to - // -> (icmp eq (A & (B|D)), (C|E)) - // Currently, we only handle the case of B, C, D, and E being constant. - // We can't simply use C and E because we might actually handle - // (icmp ne (A & B), B) & (icmp eq (A & D), D) - // with B and D, having a single bit set. - - // NotMixed: - // (icmp ne (A & B), C) & (icmp ne (A & D), E) - // -> (icmp ne (A & (B & D)), (C & E)) - // Check the intersection (B & D) for inequality. - // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B - // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both the - // B and the D, don't contradict. - // Note that we can assume (~B & C) == 0 && (~D & E) == 0, previous - // operation should delete these icmps if it hadn't been met. - - const APInt *OldConstC, *OldConstE; - if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE))) - return nullptr; - - auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * { - CC = IsNot ? CmpInst::getInversePredicate(CC) : CC; - const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC; - const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE; + if (match(B, m_APInt(ConstB)) && match(D, m_APInt(ConstD))) { + if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) { + // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) + // Only valid if one of the masks is a superset of the other (check "B&D" + // is the same as either B or D). + APInt NewMask = *ConstB & *ConstD; + if (NewMask == *ConstB) + return LHS; + if (NewMask == *ConstD) + return RHS; + } - if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue()) - return IsNot ? nullptr : ConstantInt::get(LHS->getType(), !IsAnd); + if (Mask & AMask_NotAllOnes) { + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) + // Only valid if one of the masks is a superset of the other (check "B|D" + // is the same as either B or D). + APInt NewMask = *ConstB | *ConstD; + if (NewMask == *ConstB) + return LHS; + if (NewMask == *ConstD) + return RHS; + } - if (IsNot && !ConstB->isSubsetOf(*ConstD) && !ConstD->isSubsetOf(*ConstB)) + if (Mask & (BMask_Mixed | BMask_NotMixed)) { + // Mixed: + // (icmp eq (A & B), C) & (icmp eq (A & D), E) + // We already know that B & C == C && D & E == E. + // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of + // C and E, which are shared by both the mask B and the mask D, don't + // contradict, then we can transform to + // -> (icmp eq (A & (B|D)), (C|E)) + // Currently, we only handle the case of B, C, D, and E being constant. 
+ // We can't simply use C and E because we might actually handle + // (icmp ne (A & B), B) & (icmp eq (A & D), D) + // with B and D, having a single bit set. + + // NotMixed: + // (icmp ne (A & B), C) & (icmp ne (A & D), E) + // -> (icmp ne (A & (B & D)), (C & E)) + // Check the intersection (B & D) for inequality. + // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B + // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both + // the B and the D, don't contradict. Note that we can assume (~B & C) == + // 0 && (~D & E) == 0, previous operation should delete these icmps if it + // hadn't been met. + + const APInt *OldConstC, *OldConstE; + if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE))) return nullptr; - APInt BD, CE; - if (IsNot) { - BD = *ConstB & *ConstD; - CE = ConstC & ConstE; - } else { - BD = *ConstB | *ConstD; - CE = ConstC | ConstE; - } - Value *NewAnd = Builder.CreateAnd(A, BD); - Value *CEVal = ConstantInt::get(A->getType(), CE); - return Builder.CreateICmp(CC, CEVal, NewAnd); - }; + auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * { + CC = IsNot ? CmpInst::getInversePredicate(CC) : CC; + const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC; + const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE; + + if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue()) + return IsNot ? nullptr : ConstantInt::get(LHS->getType(), !IsAnd); + + if (IsNot && !ConstB->isSubsetOf(*ConstD) && + !ConstD->isSubsetOf(*ConstB)) + return nullptr; + + APInt BD, CE; + if (IsNot) { + BD = *ConstB & *ConstD; + CE = ConstC & ConstE; + } else { + BD = *ConstB | *ConstD; + CE = ConstC | ConstE; + } + Value *NewAnd = Builder.CreateAnd(A, BD); + Value *CEVal = ConstantInt::get(A->getType(), CE); + return Builder.CreateICmp(CC, CEVal, NewAnd); + }; + + if (Mask & BMask_Mixed) + return FoldBMixed(NewCC, false); + if (Mask & BMask_NotMixed) // can be else also + return FoldBMixed(NewCC, true); + } + } - if (Mask & BMask_Mixed) - return FoldBMixed(NewCC, false); - if (Mask & BMask_NotMixed) // can be else also - return FoldBMixed(NewCC, true); + // (icmp eq (A & B), 0) | (icmp eq (A & D), 0) + // -> (icmp ne (A & (B|D)), (B|D)) + // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) + // -> (icmp eq (A & (B|D)), (B|D)) + // iff B and D is known to be a power of two + if (Mask & Mask_NotAllZeros && + isKnownToBeAPowerOfTwo(B, /*OrZero=*/false, /*Depth=*/0, Q) && + isKnownToBeAPowerOfTwo(D, /*OrZero=*/false, /*Depth=*/0, Q)) { + // If this is a logical and/or, then we must prevent propagation of a + // poison value from the RHS by inserting freeze. + if (IsLogical) + D = Builder.CreateFreeze(D); + Value *Mask = Builder.CreateOr(B, D); + Value *Masked = Builder.CreateAnd(A, Mask); + return Builder.CreateICmp(NewCC, Masked, Mask); } return nullptr; } @@ -776,46 +790,6 @@ foldAndOrOfICmpsWithPow2AndWithZero(InstCombiner::BuilderTy &Builder, return Builder.CreateICmp(Pred, And, Op); } -// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) -// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) -Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, - ICmpInst *RHS, - Instruction *CxtI, - bool IsAnd, - bool IsLogical) { - CmpInst::Predicate Pred = IsAnd ? 
CmpInst::ICMP_NE : CmpInst::ICMP_EQ; - if (LHS->getPredicate() != Pred || RHS->getPredicate() != Pred) - return nullptr; - - if (!match(LHS->getOperand(1), m_Zero()) || - !match(RHS->getOperand(1), m_Zero())) - return nullptr; - - Value *L1, *L2, *R1, *R2; - if (match(LHS->getOperand(0), m_And(m_Value(L1), m_Value(L2))) && - match(RHS->getOperand(0), m_And(m_Value(R1), m_Value(R2)))) { - if (L1 == R2 || L2 == R2) - std::swap(R1, R2); - if (L2 == R1) - std::swap(L1, L2); - - if (L1 == R1 && - isKnownToBeAPowerOfTwo(L2, false, 0, CxtI) && - isKnownToBeAPowerOfTwo(R2, false, 0, CxtI)) { - // If this is a logical and/or, then we must prevent propagation of a - // poison value from the RHS by inserting freeze. - if (IsLogical) - R2 = Builder.CreateFreeze(R2); - Value *Mask = Builder.CreateOr(L2, R2); - Value *Masked = Builder.CreateAnd(L1, Mask); - auto NewPred = IsAnd ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE; - return Builder.CreateICmp(NewPred, Masked, Mask); - } - } - - return nullptr; -} - /// General pattern: /// X & Y /// @@ -1431,8 +1405,7 @@ static Value *matchIsFiniteTest(InstCombiner::BuilderTy &Builder, FCmpInst *LHS, return nullptr; return Builder.CreateFCmpFMF(FCmpInst::getOrderedPredicate(PredR), RHS0, RHS1, - LHS->getFastMathFlags() & - RHS->getFastMathFlags()); + FMFSource::intersect(LHS, RHS)); } Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, @@ -1469,7 +1442,7 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, // Intersect the fast math flags. // TODO: We can union the fast math flags unless this is a logical select. return getFCmpValue(NewPred, LHS0, LHS1, Builder, - LHS->getFastMathFlags() & RHS->getFastMathFlags()); + FMFSource::intersect(LHS, RHS)); } // This transform is not valid for a logical select. @@ -1486,8 +1459,8 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, // Ignore the constants because they are obviously not NANs: // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y) // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y) - return Builder.CreateFCmpFMF( - PredL, LHS0, RHS0, LHS->getFastMathFlags() & RHS->getFastMathFlags()); + return Builder.CreateFCmpFMF(PredL, LHS0, RHS0, + FMFSource::intersect(LHS, RHS)); } } @@ -3327,12 +3300,6 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsLogical) { const SimplifyQuery Q = SQ.getWithInstruction(&I); - // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) - // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) - // if K1 and K2 are a one-bit mask. - if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, &I, IsAnd, IsLogical)) - return V; - ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); Value *LHS1 = LHS->getOperand(1), *RHS1 = RHS->getOperand(1); @@ -3359,7 +3326,7 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, // handle (roughly): // (icmp ne (A & B), C) | (icmp ne (A & D), E) // (icmp eq (A & B), C) & (icmp eq (A & D), E) - if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder)) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder, Q)) return V; if (Value *V = @@ -4964,8 +4931,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { // (A & B) ^ (A | C) --> A ? ~B : C -- There are 4 commuted variants. 
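A quick standalone check of the identity in the comment above (illustrative only, not part of the patch): for i1 values, (A & B) ^ (A | C) yields !B when A is true and C when A is false.

#include <cassert>

int main() {
  // Exhaustively verify (A & B) ^ (A | C) == A ? !B : C over i1 values.
  for (bool A : {false, true})
    for (bool B : {false, true})
      for (bool C : {false, true})
        assert(((A && B) ^ (A || C)) == (A ? !B : C));
  return 0;
}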
if (I.getType()->isIntOrIntVectorTy(1) && - match(Op0, m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B)))) && - match(Op1, m_OneUse(m_LogicalOr(m_Value(C), m_Value(D))))) { + match(&I, m_c_Xor(m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B))), + m_OneUse(m_LogicalOr(m_Value(C), m_Value(D)))))) { bool NeedFreeze = isa<SelectInst>(Op0) && isa<SelectInst>(Op1) && B == D; if (B == C || B == D) std::swap(A, B); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 5494c70..c55c40c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2674,9 +2674,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // copysign Mag, (copysign ?, X) --> copysign Mag, X Value *X; if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X)))) { - Value *CopySign = Builder.CreateCopySign( - Mag, X, - II->getFastMathFlags() & cast<Instruction>(Sign)->getFastMathFlags()); + Value *CopySign = + Builder.CreateCopySign(Mag, X, FMFSource::intersect(II, Sign)); return replaceInstUsesWith(*II, CopySign); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 8b23583..2e45725 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2485,9 +2485,8 @@ Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp, // icmp ule i64 (shl X, 32), 8589934592 -> // icmp ule i32 (trunc X, i32), 2 -> // icmp ult i32 (trunc X, i32), 3 - if (auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant( - Pred, ConstantInt::get(ShType->getContext(), C))) { + if (auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant( + Pred, ConstantInt::get(ShType->getContext(), C))) { CmpPred = FlippedStrictness->first; RHSC = cast<ConstantInt>(FlippedStrictness->second)->getValue(); } @@ -3091,12 +3090,12 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, unsigned BW = C.getBitWidth(); std::bitset<4> Table; auto ComputeTable = [&](bool Op0Val, bool Op1Val) { - int Res = 0; + APInt Res(BW, 0); if (Op0Val) - Res += isa<ZExtInst>(Ext0) ? 1 : -1; + Res += APInt(BW, isa<ZExtInst>(Ext0) ? 1 : -1, /*isSigned=*/true); if (Op1Val) - Res += isa<ZExtInst>(Ext1) ? 1 : -1; - return ICmpInst::compare(APInt(BW, Res, true), C, Pred); + Res += APInt(BW, isa<ZExtInst>(Ext1) ? 
1 : -1, /*isSigned=*/true); + return ICmpInst::compare(Res, C, Pred); }; Table[0] = ComputeTable(false, false); @@ -3280,8 +3279,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) { // x sgt C-1 <--> x sge C <--> not(x slt C) auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant( - PredB, cast<Constant>(RHS2)); + getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2)); if (!FlippedStrictness) return false; assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && @@ -6908,79 +6906,6 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) { return nullptr; } -std::optional<std::pair<CmpPredicate, Constant *>> -InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, - Constant *C) { - assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && - "Only for relational integer predicates."); - - Type *Type = C->getType(); - bool IsSigned = ICmpInst::isSigned(Pred); - - CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred); - bool WillIncrement = - UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT; - - // Check if the constant operand can be safely incremented/decremented - // without overflowing/underflowing. - auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) { - return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); - }; - - Constant *SafeReplacementConstant = nullptr; - if (auto *CI = dyn_cast<ConstantInt>(C)) { - // Bail out if the constant can't be safely incremented/decremented. - if (!ConstantIsOk(CI)) - return std::nullopt; - } else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) { - unsigned NumElts = FVTy->getNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = C->getAggregateElement(i); - if (!Elt) - return std::nullopt; - - if (isa<UndefValue>(Elt)) - continue; - - // Bail out if we can't determine if this constant is min/max or if we - // know that this constant is min/max. - auto *CI = dyn_cast<ConstantInt>(Elt); - if (!CI || !ConstantIsOk(CI)) - return std::nullopt; - - if (!SafeReplacementConstant) - SafeReplacementConstant = CI; - } - } else if (isa<VectorType>(C->getType())) { - // Handle scalable splat - Value *SplatC = C->getSplatValue(); - auto *CI = dyn_cast_or_null<ConstantInt>(SplatC); - // Bail out if the constant can't be safely incremented/decremented. - if (!CI || !ConstantIsOk(CI)) - return std::nullopt; - } else { - // ConstantExpr? - return std::nullopt; - } - - // It may not be safe to change a compare predicate in the presence of - // undefined elements, so replace those elements with the first safe constant - // that we found. - // TODO: in case of poison, it is safe; let's replace undefs only. - if (C->containsUndefOrPoisonElement()) { - assert(SafeReplacementConstant && "Replacement constant not set"); - C = Constant::replaceUndefsWith(C, SafeReplacementConstant); - } - - CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); - - // Increment or decrement the constant. - Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true); - Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne); - - return std::make_pair(NewPred, NewC); -} - /// If we have an icmp le or icmp ge instruction with a constant operand, turn /// it into the appropriate icmp lt or icmp gt instruction. This transform /// allows them to be folded in visitICmpInst. 
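A standalone spot-check of that canonicalization (illustrative only; the in-tree helper additionally guards against overflow and handles vector constants): x <= 7 becomes x < 8, and x >= 5 becomes x > 4.

#include <cassert>

int main() {
  for (int X = -16; X <= 16; ++X) {
    assert((X <= 7) == (X < 8)); // sle C  ->  slt C+1
    assert((X >= 5) == (X > 4)); // sge C  ->  sgt C-1
  }
  return 0;
}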
@@ -6996,8 +6921,7 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { if (!Op1C) return nullptr; - auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); if (!FlippedStrictness) return nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index b31ae37..f6992119 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -435,9 +435,6 @@ private: Instruction * canonicalizeConditionalNegationViaMathToSelect(BinaryOperator &i); - Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, - Instruction *CxtI, bool IsAnd, - bool IsLogical = false); Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D, bool InvertFalseVal = false); Value *getSelectCondition(Value *A, Value *B, bool ABIsTheSame); diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 272a194..80308bf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -765,33 +765,14 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); LoadInst *NewLI = new LoadInst(FirstLI->getType(), NewPN, "", IsVolatile, LoadAlignment); - - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_range, - LLVMContext::MD_invariant_load, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_nonnull, - LLVMContext::MD_align, - LLVMContext::MD_dereferenceable, - LLVMContext::MD_dereferenceable_or_null, - LLVMContext::MD_access_group, - LLVMContext::MD_noundef, - }; - - for (unsigned ID : KnownIDs) - NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + NewLI->copyMetadata(*FirstLI); // Add all operands to the new PHI and combine TBAA metadata. for (auto Incoming : drop_begin(zip(PN.blocks(), PN.incoming_values()))) { BasicBlock *BB = std::get<0>(Incoming); Value *V = std::get<1>(Incoming); LoadInst *LI = cast<LoadInst>(V); - // FIXME: https://github.com/llvm/llvm-project/issues/121495 - // Call combineMetadataForCSE instead, so that an explicit set of KnownIDs - // doesn't need to be maintained here. - combineMetadata(NewLI, LI, KnownIDs, true); + combineMetadataForCSE(NewLI, LI, true); Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) InVal = nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 7fd91c7..1eca177 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1689,8 +1689,7 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, return nullptr; // Check the constant we'd have with flipped-strictness predicate. 
- auto FlippedStrictness = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0); + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); if (!FlippedStrictness) return nullptr; @@ -1970,8 +1969,7 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal, Value *RHS; SelectPatternFlavor SPF; const DataLayout &DL = BOp->getDataLayout(); - auto Flipped = - InstCombiner::getFlippedStrictnessPredicateAndConstant(Predicate, C1); + auto Flipped = getFlippedStrictnessPredicateAndConstant(Predicate, C1); if (C3 == ConstantFoldBinaryOpOperands(Opcode, C1, C2, DL)) { SPF = getSelectPattern(Predicate).Flavor; @@ -2823,9 +2821,9 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC, // %cnd = icmp slt i32 %rem, 0 // %add = add i32 %rem, %n // %sel = select i1 %cnd, i32 %add, i32 %rem - if (match(TrueVal, m_Add(m_Specific(RemRes), m_Value(Remainder))) && + if (match(TrueVal, m_c_Add(m_Specific(RemRes), m_Value(Remainder))) && match(RemRes, m_SRem(m_Value(Op), m_Specific(Remainder))) && - IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero*/ true) && + IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero=*/true) && FalseVal == RemRes) return FoldToBitwiseAnd(Remainder); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index f63de1f..2fb60ef 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -939,12 +939,11 @@ Instruction *InstCombinerImpl::foldBinOpShiftWithShift(BinaryOperator &I) { m_OneUse(m_Shift(m_Value(Y), m_Value(Shift))))) return nullptr; if (!match(I.getOperand(1 - ShOpnum), - m_BinOp(m_Value(ShiftedX), m_Value(Mask)))) + m_c_BinOp(m_CombineAnd( + m_OneUse(m_Shift(m_Value(X), m_Specific(Shift))), + m_Value(ShiftedX)), + m_Value(Mask)))) return nullptr; - - if (!match(ShiftedX, m_OneUse(m_Shift(m_Value(X), m_Specific(Shift))))) - return nullptr; - // Make sure we are matching instruction shifts and not ConstantExpr auto *IY = dyn_cast<Instruction>(I.getOperand(ShOpnum)); auto *IX = dyn_cast<Instruction>(ShiftedX); @@ -1822,12 +1821,29 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, continue; } - // If the only use of phi is comparing it with a constant then we can - // put this comparison in the incoming BB directly after a ucmp/scmp call - // because we know that it will simplify to a single icmp. - const APInt *Ignored; - if (isa<CmpIntrinsic>(InVal) && InVal->hasOneUser() && - match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) { + // Handle some cases that can't be fully simplified, but where we know that + // the two instructions will fold into one. + auto WillFold = [&]() { + if (!InVal->hasOneUser()) + return false; + + // icmp of ucmp/scmp with constant will fold to icmp. + const APInt *Ignored; + if (isa<CmpIntrinsic>(InVal) && + match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) + return true; + + // icmp eq zext(bool), 0 will fold to !bool. 
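The fold named in the comment above can be spot-checked in isolation (a trivial sketch, not part of the patch): zero-extending a bool and comparing the result against 0 is just the logical negation of the bool.

#include <cassert>
#include <cstdint>

int main() {
  for (bool B : {false, true}) {
    uint32_t Z = static_cast<uint32_t>(B); // zext i1 -> i32
    assert((Z == 0) == !B);                // icmp eq (zext B), 0  ==  !B
  }
  return 0;
}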
+ if (isa<ZExtInst>(InVal) && + cast<ZExtInst>(InVal)->getSrcTy()->isIntOrIntVectorTy(1) && + match(&I, + m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(PN), m_Zero()))) + return true; + + return false; + }; + + if (WillFold()) { OpsToMoveUseToIncomingBB.push_back(i); NewPhiValues.push_back(nullptr); continue; diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 530061e..2031728 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -192,7 +192,7 @@ static cl::opt<bool> cl::Hidden); static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot", - cl::desc("Hot percentile cuttoff.")); + cl::desc("Hot percentile cutoff.")); static cl::opt<float> ClRandomSkipRate("hwasan-random-rate", diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp index 2418030..f27798c 100644 --- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp +++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp @@ -30,7 +30,7 @@ using namespace llvm; static cl::opt<int> HotPercentileCutoff("lower-allow-check-percentile-cutoff-hot", - cl::desc("Hot percentile cuttoff.")); + cl::desc("Hot percentile cutoff.")); static cl::opt<float> RandomRate("lower-allow-check-random-rate", diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 471086c..db4d62e 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -158,11 +158,11 @@ STATISTIC(NumCoveredBlocks, "Number of basic blocks that were executed"); // Command line option to specify the file to read profile from. This is // mainly used for testing. -static cl::opt<std::string> - PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden, - cl::value_desc("filename"), - cl::desc("Specify the path of profile data file. This is" - "mainly for test purpose.")); +static cl::opt<std::string> PGOTestProfileFile( + "pgo-test-profile-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of profile data file. This is " + "mainly for test purpose.")); static cl::opt<std::string> PGOTestProfileRemappingFile( "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden, cl::value_desc("filename"), @@ -186,7 +186,7 @@ static cl::opt<unsigned> MaxNumAnnotations( // to write to the metadata for a single memop intrinsic. static cl::opt<unsigned> MaxNumMemOPAnnotations( "memop-max-annotations", cl::init(4), cl::Hidden, - cl::desc("Max number of preicise value annotations for a single memop" + cl::desc("Max number of precise value annotations for a single memop" "intrinsic")); // Command line option to control appending FunctionHash to the name of a COMDAT @@ -291,13 +291,13 @@ static cl::opt<bool> PGOVerifyHotBFI( cl::desc("Print out the non-match BFI count if a hot raw profile count " "becomes non-hot, or a cold raw profile count becomes hot. 
" "The print is enabled under -Rpass-analysis=pgo, or " - "internal option -pass-remakrs-analysis=pgo.")); + "internal option -pass-remarks-analysis=pgo.")); static cl::opt<bool> PGOVerifyBFI( "pgo-verify-bfi", cl::init(false), cl::Hidden, cl::desc("Print out mismatched BFI counts after setting profile metadata " "The print is enabled under -Rpass-analysis=pgo, or " - "internal option -pass-remakrs-analysis=pgo.")); + "internal option -pass-remarks-analysis=pgo.")); static cl::opt<unsigned> PGOVerifyBFIRatio( "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden, diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 1961095..9cd81f3 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -70,7 +70,7 @@ namespace { /// violations. struct TypeSanitizer { TypeSanitizer(Module &M); - bool run(Function &F, const TargetLibraryInfo &TLI); + bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI); void instrumentGlobals(Module &M); private: @@ -510,7 +510,8 @@ void collectMemAccessInfo( } } -bool TypeSanitizer::run(Function &F, const TargetLibraryInfo &TLI) { +bool TypeSanitizer::sanitizeFunction(Function &F, + const TargetLibraryInfo &TLI) { // This is required to prevent instrumenting call to __tysan_init from within // the module constructor. if (&F == TysanCtorFunction.getCallee() || &F == TysanGlobalsSetTypeFunction) @@ -876,15 +877,8 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, return true; } -PreservedAnalyses TypeSanitizerPass::run(Function &F, - FunctionAnalysisManager &FAM) { - TypeSanitizer TySan(*F.getParent()); - TySan.run(F, FAM.getResult<TargetLibraryAnalysis>(F)); - return PreservedAnalyses::none(); -} - -PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M, - ModuleAnalysisManager &AM) { +PreservedAnalyses TypeSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { Function *TysanCtorFunction; std::tie(TysanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName, @@ -894,5 +888,12 @@ PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M, TypeSanitizer TySan(M); TySan.instrumentGlobals(M); appendToGlobalCtors(M, TysanCtorFunction, 0); + + auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); + TySan.sanitizeFunction(F, TLI); + } + return PreservedAnalyses::none(); } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ba1c224..3c82eed 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -128,7 +128,7 @@ static cl::opt<bool, true> static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", - cl::desc("Use loop idiom recognition code size heuristics when compiling" + cl::desc("Use loop idiom recognition code size heuristics when compiling " "with -Os/-Oz"), cl::init(true), cl::Hidden); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 260cc72..0903488 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -104,7 +104,7 @@ static cl::opt<unsigned> UnrollMaxPercentThresholdBoost( static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( 
"unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, - cl::desc("Don't allow loop unrolling to simulate more than this number of" + cl::desc("Don't allow loop unrolling to simulate more than this number of " "iterations when checking full unroll profitability")); static cl::opt<unsigned> UnrollCount( diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index f58dcb5..6e91c4f 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -95,7 +95,7 @@ static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable"; /// invariant instructions in a loop. static cl::opt<float> LVInvarThreshold("licm-versioning-invariant-threshold", - cl::desc("LoopVersioningLICM's minimum allowed percentage" + cl::desc("LoopVersioningLICM's minimum allowed percentage " "of possible invariant instructions per loop"), cl::init(25), cl::Hidden); diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index 1d4f561..b499ef8 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -28,8 +28,8 @@ using namespace llvm; namespace llvm { cl::opt<bool> ShouldPreserveAllAttributes( "assume-preserve-all", cl::init(false), cl::Hidden, - cl::desc("enable preservation of all attrbitues. even those that are " - "unlikely to be usefull")); + cl::desc("enable preservation of all attributes. even those that are " + "unlikely to be useful")); cl::opt<bool> EnableKnowledgeRetention( "enable-knowledge-retention", cl::init(false), cl::Hidden, diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index d829864..b3f9f76 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -778,7 +778,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", - false, true) + false, false) // Publicly exposed interface to pass... char &llvm::LoopSimplifyID = LoopSimplify::ID; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 03dc6c1..e367b01 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -96,8 +96,9 @@ using namespace PatternMatch; cl::opt<bool> llvm::RequireAndPreserveDomTree( "simplifycfg-require-and-preserve-domtree", cl::Hidden, - cl::desc("Temorary development switch used to gradually uplift SimplifyCFG " - "into preserving DomTree,")); + cl::desc( + "Temporary development switch used to gradually uplift SimplifyCFG " + "into preserving DomTree,")); // Chosen as 2 so as to be cheap, but still to have enough power to fold // a select, so the "clamp" idiom (of a min followed by a max) will be caught. 
@@ -126,7 +127,7 @@ static cl::opt<bool> HoistLoadsStoresWithCondFaulting( static cl::opt<unsigned> HoistLoadsStoresWithCondFaultingThreshold( "hoist-loads-stores-with-cond-faulting-threshold", cl::Hidden, cl::init(6), - cl::desc("Control the maximal conditonal load/store that we are willing " + cl::desc("Control the maximal conditional load/store that we are willing " "to speculatively execute to eliminate conditional branch " "(default = 6)")); diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 02ec1d5..9e81573 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -324,6 +324,11 @@ private: Instruction *ChainElem, Instruction *ChainBegin, const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets); + /// Merges the equivalence classes if they have underlying objects that differ + /// by one level of indirection (i.e., one is a getelementptr and the other is + /// the base pointer in that getelementptr). + void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const; + /// Collects loads and stores grouped by "equivalence class", where: /// - all elements in an eq class are a load or all are a store, /// - they all load/store the same element size (it's OK to have e.g. i8 and @@ -1305,6 +1310,119 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects( return std::nullopt; } +void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const { + if (EQClasses.size() < 2) // There is nothing to merge. + return; + + // The reduced key has all elements of the ECClassKey except the underlying + // object. Check that EqClassKey has 4 elements and define the reduced key. + static_assert(std::tuple_size_v<EqClassKey> == 4, + "EqClassKey has changed - EqClassReducedKey needs changes too"); + using EqClassReducedKey = + std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */, + std::tuple_element_t<2, EqClassKey> /* Element size */, + std::tuple_element_t<3, EqClassKey> /* IsLoad; */>; + using ECReducedKeyToUnderlyingObjectMap = + MapVector<EqClassReducedKey, + SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>; + + // Form a map from the reduced key (without the underlying object) to the + // underlying objects: 1 reduced key to many underlying objects, to form + // groups of potentially merge-able equivalence classes. 
+ ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap; + bool FoundPotentiallyOptimizableEC = false; + for (const auto &EC : EQClasses) { + const auto &Key = EC.first; + EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key), + std::get<3>(Key)}; + RedKeyToUOMap[RedKey].insert(std::get<0>(Key)); + if (RedKeyToUOMap[RedKey].size() > 1) + FoundPotentiallyOptimizableEC = true; + } + if (!FoundPotentiallyOptimizableEC) + return; + + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n"; + for (const auto &EC : EQClasses) { + dbgs() << " Key: {" << EC.first << "}\n"; + for (const auto &Inst : EC.second) + dbgs() << " Inst: " << *Inst << '\n'; + } + }); + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n"; + for (const auto &RedKeyToUO : RedKeyToUOMap) { + dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", " + << std::get<1>(RedKeyToUO.first) << ", " + << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> " + << RedKeyToUO.second.size() << " underlying objects:\n"; + for (auto UObject : RedKeyToUO.second) + dbgs() << " " << *UObject << '\n'; + } + }); + + using UObjectToUObjectMap = DenseMap<const Value *, const Value *>; + + // Compute the ultimate targets for a set of underlying objects. + auto GetUltimateTargets = + [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap { + UObjectToUObjectMap IndirectionMap; + for (const auto *UObject : UObjects) { + const unsigned MaxLookupDepth = 1; // look for 1-level indirections only + const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth); + if (UltimateTarget != UObject) + IndirectionMap[UObject] = UltimateTarget; + } + UObjectToUObjectMap UltimateTargetsMap; + for (const auto *UObject : UObjects) { + auto Target = UObject; + auto It = IndirectionMap.find(Target); + for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target)) + Target = It->second; + UltimateTargetsMap[UObject] = Target; + } + return UltimateTargetsMap; + }; + + // For each item in RedKeyToUOMap, if it has more than one underlying object, + // try to merge the equivalence classes. + for (auto &[RedKey, UObjects] : RedKeyToUOMap) { + if (UObjects.size() < 2) + continue; + auto UTMap = GetUltimateTargets(UObjects); + for (const auto &[UObject, UltimateTarget] : UTMap) { + if (UObject == UltimateTarget) + continue; + + EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey), + std::get<2>(RedKey)}; + EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey), + std::get<2>(RedKey)}; + // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus, + // request the reference to the instructions vector for KeyTo first. 
+ const auto &VecTo = EQClasses[KeyTo]; + const auto &VecFrom = EQClasses[KeyFrom]; + SmallVector<Instruction *, 8> MergedVec; + std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(), + std::back_inserter(MergedVec), + [](Instruction *A, Instruction *B) { + return A && B && A->comesBefore(B); + }); + EQClasses[KeyTo] = std::move(MergedVec); + EQClasses.erase(KeyFrom); + } + } + LLVM_DEBUG({ + dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n"; + for (const auto &EC : EQClasses) { + dbgs() << " Key: {" << EC.first << "}\n"; + for (const auto &Inst : EC.second) + dbgs() << " Inst: " << *Inst << '\n'; + } + }); +} + EquivalenceClassMap Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, BasicBlock::iterator End) { @@ -1377,6 +1495,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, .emplace_back(&I); } + mergeEquivalenceClasses(Ret); return Ret; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e0f629e..47866da 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8354,17 +8354,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, auto *GEP = dyn_cast<GetElementPtrInst>( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; - if (Reverse) + if (Reverse) { + // When folding the tail, we may compute an address that we don't in the + // original scalar loop and it may not be inbounds. Drop Inbounds in that + // case. + GEPNoWrapFlags Flags = + (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) + ? GEPNoWrapFlags::none() + : GEPNoWrapFlags::inBounds(); VectorPtr = new VPReverseVectorPointerRecipe( - Ptr, &Plan.getVF(), getLoadStoreType(I), - GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds() - : GEPNoWrapFlags::none(), - I->getDebugLoc()); - else + Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); + } else { VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); + } Builder.getInsertBlock()->appendRecipe(VectorPtr); Ptr = VectorPtr; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c4582df..36fed89 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -104,6 +104,7 @@ using namespace llvm; using namespace llvm::PatternMatch; using namespace slpvectorizer; +using namespace std::placeholders; #define SV_NAME "slp-vectorizer" #define DEBUG_TYPE "SLP" @@ -4955,6 +4956,37 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); } +/// Correctly creates insert_subvector, checking that the index is multiple of +/// the subvectors length. Otherwise, generates shuffle using \p Generator or +/// using default shuffle. +static Value *createInsertVector( + IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, + function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) { + const unsigned SubVecVF = getNumElements(V->getType()); + if (Index % SubVecVF == 0) { + Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, + Builder.getInt64(Index)); + } else { + // Create shuffle, insertvector requires that index is multiple of + // the subvector length. 
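The index restriction mentioned in the comment above comes from the llvm.vector.insert intrinsic itself: the insertion index must be a constant multiple of the subvector's length, so a 4-wide subvector can be inserted into an 8-wide vector directly at index 0 or 4, while index 2 has to be expressed as a shufflevector. A rough standalone form of that predicate (illustrative only):

// Hypothetical helper for illustration: can llvm.vector.insert be used
// directly, or must the insertion be lowered to a shuffle?
static bool canUseInsertVector(unsigned Index, unsigned SubVecNumElts) {
  return Index % SubVecNumElts == 0;
}
// canUseInsertVector(0, 4) and canUseInsertVector(4, 4) hold;
// canUseInsertVector(2, 4) does not, so a shuffle is emitted instead.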
+ const unsigned VecVF = getNumElements(Vec->getType()); + SmallVector<int> Mask(VecVF, PoisonMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0); + for (unsigned I : seq<unsigned>(SubVecVF)) + Mask[I + Index] = I + VecVF; + if (Generator) { + Vec = Generator(Vec, V, Mask); + } else { + // 1. Resize V to the size of Vec. + SmallVector<int> ResizeMask(VecVF, PoisonMaskElem); + std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0); + V = Builder.CreateShuffleVector(V, ResizeMask); + Vec = Builder.CreateShuffleVector(Vec, V, Mask); + } + } + return Vec; +} + BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order, @@ -5355,11 +5387,10 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, SmallPtrSet<Value *, 13> SecondPointers; Value *P1 = Ptr1; Value *P2 = Ptr2; - if (P1 == P2) - return false; unsigned Depth = 0; - while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) && - Depth <= RecursionMaxDepth) { + while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) { + if (P1 == P2 || Depth > RecursionMaxDepth) + return false; FirstPointers.insert(P1); SecondPointers.insert(P2); P1 = getUnderlyingObject(P1, /*MaxLookup=*/1); @@ -13884,9 +13915,8 @@ Value *BoUpSLP::gather( Instruction *InsElt; if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) { assert(SLPReVec && "FixedVectorType is not expected."); - Vec = InsElt = Builder.CreateInsertVector( - Vec->getType(), Vec, Scalar, - Builder.getInt64(Pos * VecTy->getNumElements())); + Vec = InsElt = cast<Instruction>(createInsertVector( + Builder, Vec, Scalar, Pos * getNumElements(VecTy))); auto *II = dyn_cast<IntrinsicInst>(InsElt); if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) return Vec; @@ -14486,23 +14516,10 @@ public: V, SimplifyQuery(*R.DL)); })); unsigned InsertionIndex = Idx * ScalarTyNumElements; - const unsigned SubVecVF = - cast<FixedVectorType>(V->getType())->getNumElements(); - if (InsertionIndex % SubVecVF == 0) { - Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, - Builder.getInt64(InsertionIndex)); - } else { - // Create shuffle, insertvector requires that index is multiple of - // the subvectors length. - const unsigned VecVF = - cast<FixedVectorType>(Vec->getType())->getNumElements(); - SmallVector<int> Mask(VecVF, PoisonMaskElem); - std::iota(Mask.begin(), Mask.end(), 0); - for (unsigned I : seq<unsigned>( - InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements)) - Mask[I] = I - Idx + VecVF; - Vec = createShuffle(Vec, V, Mask); - } + Vec = createInsertVector( + Builder, Vec, V, InsertionIndex, + std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2, + _3)); if (!CommonMask.empty()) { std::iota( std::next(CommonMask.begin(), InsertionIndex), @@ -17748,7 +17765,6 @@ bool BoUpSLP::collectValuesToDemote( BitWidth = std::max(BitWidth, BitWidth1); return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2); }; - using namespace std::placeholders; auto FinalAnalysis = [&]() { if (!IsProfitableToDemote) return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9d7bf97..cfbb4ad 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1351,6 +1351,9 @@ public: } } + /// Returns true if the underlying opcode may read from or write to memory. + bool opcodeMayReadOrWriteFromMemory() const; + /// Returns true if the recipe only uses the first lane of operand \p Op. 
bool onlyFirstLaneUsed(const VPValue *Op) const override; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 76336ae..e54df8bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -51,24 +51,7 @@ extern cl::opt<unsigned> ForceTargetInstructionCost; bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPInstructionSC: - if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode())) - return false; - switch (cast<VPInstruction>(this)->getOpcode()) { - case Instruction::Or: - case Instruction::ICmp: - case Instruction::Select: - case VPInstruction::AnyOf: - case VPInstruction::Not: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::ExtractFromEnd: - case VPInstruction::FirstOrderRecurrenceSplice: - case VPInstruction::LogicalAnd: - case VPInstruction::PtrAdd: - return false; - default: - return true; - } + return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); case VPInterleaveSC: return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: @@ -115,6 +98,8 @@ bool VPRecipeBase::mayWriteToMemory() const { bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { + case VPInstructionSC: + return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -707,6 +692,26 @@ void VPInstruction::execute(VPTransformState &State) { /*IsScalar*/ GeneratesPerFirstLaneOnly); } +bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { + if (Instruction::isBinaryOp(getOpcode())) + return false; + switch (getOpcode()) { + case Instruction::ICmp: + case Instruction::Select: + case VPInstruction::AnyOf: + case VPInstruction::CalculateTripCountMinusVF: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::ExtractFromEnd: + case VPInstruction::FirstOrderRecurrenceSplice: + case VPInstruction::LogicalAnd: + case VPInstruction::Not: + case VPInstruction::PtrAdd: + return false; + default: + return true; + } +} + bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 9657770..7779442 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -49,7 +49,8 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) { return all_of(GEP->operands(), isUniformAfterVectorization); if (auto *VPI = dyn_cast<VPInstruction>(Def)) return VPI->isSingleScalar() || VPI->isVectorToScalar(); - return false; + // VPExpandSCEVRecipes must be placed in the entry and are alway uniform. + return isa<VPExpandSCEVRecipe>(Def); } /// Return true if \p V is a header mask in \p Plan. 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ea53a1a..1a669b5 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -705,7 +705,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { InstructionCost NewCost = TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask, + CostKind); bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; // If the lengths of the two vectors are not equal, @@ -2022,9 +2023,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { if (Match1) InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind); - InstructionCost OuterCost = TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind, - 0, nullptr, {OuterV0, OuterV1}, &I); + InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind); InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost; |
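One way to read the shuffle-kind change above (an illustrative classification, not code from the patch): a "select" shuffle keeps every result lane in its original position and only chooses which of the two sources supplies it, while a two-source permute may also move lanes, which the length-changing case noted in the surrounding code can require.

#include <vector>

// Hypothetical helper for illustration: a mask is a select (blend) mask if
// result lane I reads lane I of either source; -1 marks an undefined lane.
static bool isSelectLikeMask(const std::vector<int> &Mask, int NumSrcElts) {
  for (int I = 0, E = static_cast<int>(Mask.size()); I != E; ++I)
    if (Mask[I] != -1 && Mask[I] != I && Mask[I] != I + NumSrcElts)
      return false;
  return true;
}

// Example with two 4-element sources: {0, 5, 2, 7} is a blend (select),
// while {4, 0, 6, 2} moves lanes and needs a two-source permute.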