Diffstat (limited to 'llvm/lib/Transforms')
27 files changed, 533 insertions, 212 deletions
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h index e05fe28..1e549f1 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCloner.h +++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h @@ -77,7 +77,7 @@ public:        : OrigF(OrigF), Suffix(Suffix), Shape(Shape), FKind(FKind),          Builder(OrigF.getContext()), TTI(TTI) {} -  virtual ~BaseCloner() {} +  virtual ~BaseCloner() = default;    /// Create a clone for a continuation lowering.    static Function *createClone(Function &OrigF, const Twine &Suffix, diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5048561..a6ac761 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final        return true;      RQITy StackRQI(A, From, To, ExclusionSet, false); -    typename RQITy::Reachable Result; +    RQITy::Reachable Result;      if (!NonConstThis->checkQueryCache(A, StackRQI, Result))        return NonConstThis->isReachableImpl(A, StackRQI,                                             /*IsTemporaryRQI=*/true); @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final  // ------------------------ Align Argument Attribute ------------------------  namespace { +  static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,                                      Value &AssociatedValue, const Use *U,                                      const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,        TrackUse = true;      return 0;    } +  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) +    switch (II->getIntrinsicID()) { +    case Intrinsic::ptrmask: { +      // Is it appropriate to pull attribute in initialization? 
+      const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( +          QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); +      const auto *AlignAA = A.getAAFor<AAAlign>( +          QueryingAA, IRPosition::value(*II), DepClassTy::NONE); +      if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { +        unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), +                                       Value::MaxAlignmentExponent); +        Align ConstAlign(UINT64_C(1) << ShiftValue); +        if (ConstAlign >= AlignAA->getKnownAlign()) +          return Align(1).value(); +      } +      if (AlignAA) +        return AlignAA->getKnownAlign().value(); +      break; +    } +    default: +      break; +    }    MaybeAlign MA;    if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final    AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)        : Base(IRP, A) {} +  ChangeStatus updateImpl(Attributor &A) override { +    Instruction *I = getIRPosition().getCtxI(); +    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { +      switch (II->getIntrinsicID()) { +      case Intrinsic::ptrmask: { +        Align Alignment; +        bool Valid = false; + +        const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( +            *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); +        if (ConstVals && ConstVals->isValidState()) { +          unsigned ShiftValue = +              std::min(ConstVals->getAssumedMinTrailingZeros(), +                       Value::MaxAlignmentExponent); +          Alignment = Align(UINT64_C(1) << ShiftValue); +          Valid = true; +        } + +        const auto *AlignAA = +            A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), +                                DepClassTy::REQUIRED); +        if (AlignAA && AlignAA->isValidState()) { +          Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); +          Valid = true; +        } + +        if (Valid) +          return clampStateAndIndicateChange<StateType>( +              this->getState(), +              std::min(this->getAssumedAlign(), Alignment).value()); +        break; +      } +      default: +        break; +      } +    } +    return Base::updateImpl(A); +  };    /// See AbstractAttribute::trackStatistics()    void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }  }; @@ -10701,7 +10762,7 @@ struct AAInterFnReachabilityFunction      auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this);      RQITy StackRQI(A, From, To, ExclusionSet, false); -    typename RQITy::Reachable Result; +    RQITy::Reachable Result;      if (!NonConstThis->checkQueryCache(A, StackRQI, Result))        return NonConstThis->isReachableImpl(A, StackRQI,                                             /*IsTemporaryRQI=*/true); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 894d83f..d35ae47 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1034,11 +1034,11 @@ private:  } // namespace  template <> -struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph<      ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>      : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};  template <> 
-struct llvm::DenseMapInfo<typename CallsiteContextGraph< +struct llvm::DenseMapInfo<CallsiteContextGraph<      IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>      : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};  template <> diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index d7eb745..2a87a0f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -208,7 +208,7 @@ namespace KernelInfo {  // };  #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)                                    \ -  constexpr const unsigned MEMBER##Idx = IDX; +  constexpr unsigned MEMBER##Idx = IDX;  KERNEL_ENVIRONMENT_IDX(Configuration, 0)  KERNEL_ENVIRONMENT_IDX(Ident, 1) @@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1)  #undef KERNEL_ENVIRONMENT_IDX  #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)                      \ -  constexpr const unsigned MEMBER##Idx = IDX; +  constexpr unsigned MEMBER##Idx = IDX;  KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0)  KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1) @@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)  GlobalVariable *  getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) { -  constexpr const int InitKernelEnvironmentArgNo = 0; +  constexpr int InitKernelEnvironmentArgNo = 0;    return cast<GlobalVariable>(        KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)            ->stripPointerCasts()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 3ddf182..cbaff29 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3997,6 +3997,27 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I,    return nullptr;  } +/// Fold select(X >s 0, 0, -X) | smax(X, 0) --> abs(X) +///      select(X <s 0, -X, 0) | smax(X, 0) --> abs(X) +static Value *FoldOrOfSelectSmaxToAbs(BinaryOperator &I, +                                      InstCombiner::BuilderTy &Builder) { +  Value *X; +  Value *Sel; +  if (match(&I, +            m_c_Or(m_Value(Sel), m_OneUse(m_SMax(m_Value(X), m_ZeroInt()))))) { +    auto NegX = m_Neg(m_Specific(X)); +    if (match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(X), +                                           m_ZeroInt()), +                            m_ZeroInt(), NegX)) || +        match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(X), +                                           m_ZeroInt()), +                            NegX, m_ZeroInt()))) +      return Builder.CreateBinaryIntrinsic(Intrinsic::abs, X, +                                           Builder.getFalse()); +  } +  return nullptr; +} +  // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches  // here. We should standardize that construct where it is needed or choose some  // other way to ensure that commutated variants of patterns are not missed. 
@@ -4545,6 +4566,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {      if (Value *V = SimplifyAddWithRemainder(I))        return replaceInstUsesWith(I, V); +  if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder)) +    return replaceInstUsesWith(I, Res); +    return nullptr;  } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da..9572f9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {                                   m_Not(m_Specific(SelCond->getTrueValue())));        if (MayNeedFreeze)          C = Builder.CreateFreeze(C); +      if (!ProfcheckDisableMetadataFixes) { +        Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; +        if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && +            SelCond) { +          return SelectInst::Create(C, A, B, "", nullptr, SelCond); +        } else if (match(FalseVal, +                         m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && +                   SelFVal) { +          SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); +          NewSI->swapProfMetadata(); +          return NewSI; +        } else { +          return createSelectInstWithUnknownProfile(C, A, B); +        } +      }        return SelectInst::Create(C, A, B);      } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {                                   m_Not(m_Specific(SelFVal->getTrueValue())));        if (MayNeedFreeze)          C = Builder.CreateFreeze(C); +      if (!ProfcheckDisableMetadataFixes) { +        Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; +        if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && +            SelCond) { +          SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); +          NewSI->swapProfMetadata(); +          return NewSI; +        } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && +                   SelFVal) { +          return SelectInst::Create(C, B, A, "", nullptr, SelFVal); +        } else { +          return createSelectInstWithUnknownProfile(C, B, A); +        } +      }        return SelectInst::Create(C, B, A);      }    } diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 80e77e09..a2fad02 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -161,7 +161,7 @@ template <char NsanTypeId>  class ShadowTypeConfigImpl : public ShadowTypeConfig {  public:    char getNsanTypeId() const override { return NsanTypeId; } -  static constexpr const char kNsanTypeId = NsanTypeId; +  static constexpr char kNsanTypeId = NsanTypeId;  };  // `double` (`d`) shadow type. 
diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index 89980d5..a577f51 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -122,7 +122,8 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) {      Value *Cond = Assume->getArgOperand(0);      // Don't drop type tests, which have special semantics. -    if (match(Cond, m_Intrinsic<Intrinsic::type_test>())) +    if (match(Cond, m_Intrinsic<Intrinsic::type_test>()) || +        match(Cond, m_Intrinsic<Intrinsic::public_type_test>()))        continue;      SmallVector<Value *> Affected; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index a06f832..d564e32 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -514,7 +514,7 @@ public:  class GVNSink {  public: -  GVNSink() {} +  GVNSink() = default;    bool run(Function &F) {      LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 3487e81..7e70ba2 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -245,11 +245,14 @@ raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) {  } // namespace -static bool isUniformShape(Value *V) { +static bool isShapePreserving(Value *V) {    Instruction *I = dyn_cast<Instruction>(V);    if (!I)      return true; +  if (isa<SelectInst>(I)) +    return true; +    if (I->isBinaryOp())      return true; @@ -300,6 +303,16 @@ static bool isUniformShape(Value *V) {    }  } +/// Return an iterator over the operands of \p I that should share shape +/// information with \p I. +static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) { +  assert(isShapePreserving(I) && +         "Can't retrieve shaped operands for an instruction that does not " +         "preserve shape information"); +  auto Ops = I->operands(); +  return isa<SelectInst>(I) ? drop_begin(Ops) : Ops; +} +  /// Return the ShapeInfo for the result of \p I, it it can be determined.  static std::optional<ShapeInfo>  computeShapeInfoForInst(Instruction *I, @@ -329,9 +342,8 @@ computeShapeInfoForInst(Instruction *I,        return OpShape->second;    } -  if (isUniformShape(I) || isa<SelectInst>(I)) { -    auto Ops = I->operands(); -    auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops; +  if (isShapePreserving(I)) { +    auto ShapedOps = getShapedOperandsForInst(I);      // Find the first operand that has a known shape and use that.      for (auto &Op : ShapedOps) {        auto OpShape = ShapeMap.find(Op.get()); @@ -710,10 +722,9 @@ public:        case Intrinsic::matrix_column_major_store:          return true;        default: -        return isUniformShape(II); +        break;        } -    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) || -           isa<SelectInst>(V); +    return isShapePreserving(V) || isa<StoreInst>(V) || isa<LoadInst>(V);    }    /// Propagate the shape information of instructions to their users. @@ -800,9 +811,8 @@ public:        } else if (isa<StoreInst>(V)) {          // Nothing to do.  We forward-propagated to this so we would just          // backward propagate to an instruction with an already known shape. 
-      } else if (isUniformShape(V) || isa<SelectInst>(V)) { -        auto Ops = cast<Instruction>(V)->operands(); -        auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops; +      } else if (isShapePreserving(V)) { +        auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V));          // Propagate to all operands.          ShapeInfo Shape = ShapeMap[V];          for (Use &U : ShapedOps) { diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index bb6c879..239526e 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -337,7 +337,7 @@ static void buildPartialUnswitchConditionalBranch(  static void buildPartialInvariantUnswitchConditionalBranch(      BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction,      BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, -    MemorySSAUpdater *MSSAU) { +    MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) {    ValueToValueMapTy VMap;    for (auto *Val : reverse(ToDuplicate)) {      Instruction *Inst = cast<Instruction>(Val); @@ -377,8 +377,19 @@ static void buildPartialInvariantUnswitchConditionalBranch(    IRBuilder<> IRB(&BB);    IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());    Value *Cond = VMap[ToDuplicate[0]]; -  IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, -                   Direction ? &NormalSucc : &UnswitchedSucc); +  // The expectation is that ToDuplicate[0] is the condition used by the +  // OriginalBranch, case in which we can clone the profile metadata from there. +  auto *ProfData = +      !ProfcheckDisableMetadataFixes && +              ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition()) +          ? OriginalBranch.getMetadata(LLVMContext::MD_prof) +          : nullptr; +  auto *BR = +      IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, +                       Direction ? &NormalSucc : &UnswitchedSucc, ProfData); +  if (!ProfData) +    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), +                                                DEBUG_TYPE);  }  /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2515,7 +2526,7 @@ static void unswitchNontrivialInvariants(      // the branch in the split block.      
if (PartiallyInvariant)        buildPartialInvariantUnswitchConditionalBranch( -          *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); +          *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI);      else {        buildPartialUnswitchConditionalBranch(            *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 0f3978f..0a8f5ea 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -143,8 +143,8 @@ struct SubGraphTraits {    class WrappedSuccIterator        : public iterator_adaptor_base<              WrappedSuccIterator, BaseSuccIterator, -            typename std::iterator_traits<BaseSuccIterator>::iterator_category, -            NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { +            std::iterator_traits<BaseSuccIterator>::iterator_category, NodeRef, +            std::ptrdiff_t, NodeRef *, NodeRef> {      SmallDenseSet<RegionNode *> *Nodes;    public: @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {    } else {      // Test for successors as back edge      BasicBlock *BB = N->getNodeAs<BasicBlock>(); -    BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - -    for (BasicBlock *Succ : Term->successors()) -      if (Visited.count(Succ)) -        Loops[Succ] = BB; +    if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) +      for (BasicBlock *Succ : Term->successors()) +        if (Visited.count(Succ)) +          Loops[Succ] = BB;    }  } @@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {    for (BasicBlock *P : predecessors(BB)) {      // Ignore it if it's a branch from outside into our region entry -    if (!ParentRegion->contains(P)) +    if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))        continue;      Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {  /// Run the transformation for each region found  bool StructurizeCFG::run(Region *R, DominatorTree *DT,                           const TargetTransformInfo *TTI) { -  if (R->isTopLevelRegion()) +  // CallBr and its corresponding direct target blocks are for now ignored by +  // this pass. This is not a limitation for the currently intended uses cases +  // of callbr in the AMDGPU backend. +  // Parent and child regions are not affected by this (current) restriction. +  // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details. +  if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator()))      return false;    this->DT = DT;    this->TTI = TTI;    Func = R->getEntry()->getParent(); -  assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");    ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f..6086615 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration(        case Attribute::CoroDestroyOnlyWhenComplete:        case Attribute::CoroElideSafe:        case Attribute::NoDivergenceSource: +      case Attribute::NoCreateUndefOrPoison:          continue;        // Those attributes should be safe to propagate to the extracted function.        
case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f2903..a03cf6e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C,    // Create integer constant expression.    auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * {      const APInt &API = cast<ConstantInt>(&CV)->getValue(); -    std::optional<int64_t> InitIntOpt = API.trySExtValue(); +    std::optional<int64_t> InitIntOpt; +    if (API.getBitWidth() == 1) +      InitIntOpt = API.tryZExtValue(); +    else +      InitIntOpt = API.trySExtValue();      return InitIntOpt ? DIB.createConstantValueExpression(                              static_cast<uint64_t>(*InitIntOpt))                        : nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8be471b..6e60b94 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -992,9 +992,12 @@ BranchProbability llvm::getBranchProbability(BranchInst *B,    uint64_t Weight0, Weight1;    if (!extractBranchWeights(*B, Weight0, Weight1))      return BranchProbability::getUnknown(); +  uint64_t Denominator = Weight0 + Weight1; +  if (Denominator == 0) +    return BranchProbability::getUnknown();    if (!ForFirstTarget)      std::swap(Weight0, Weight1); -  return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); +  return BranchProbability::getBranchProbability(Weight0, Denominator);  }  bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e..bb73327 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7570,6 +7570,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,    return true;  } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +///   // ... +/// default: +///   unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +///   // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { +  Value *A; +  ConstantInt *Constant; + +  if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) +    return false; + +  SmallVector<DominatorTree::UpdateType> Updates; +  SwitchInstProfUpdateWrapper SIW(*SI); +  BasicBlock *BB = SIW->getParent(); + +  // Dead cases are removed even when the simplification fails. +  // A case is dead when its value is higher than the Constant. 
+  for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { +    if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { +      ++I; +      continue; +    } +    BasicBlock *DeadCaseBB = I->getCaseSuccessor(); +    DeadCaseBB->removePredecessor(BB); +    Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); +    I = SIW->removeCase(I); +    E = SIW->case_end(); +  } + +  auto Case = SI->findCaseValue(Constant); +  // If the case value is not found, `findCaseValue` returns the default case. +  // In this scenario, since there is no explicit `case 3:`, the simplification +  // fails. The simplification also fails when the switch’s default destination +  // is reachable. +  if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { +    if (DTU) +      DTU->applyUpdates(Updates); +    return !Updates.empty(); +  } + +  BasicBlock *Unreachable = SI->getDefaultDest(); +  SIW.replaceDefaultDest(Case); +  SIW.removeCase(Case); +  SIW->setCondition(A); + +  Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + +  if (DTU) +    DTU->applyUpdates(Updates); + +  return true; +} +  /// Tries to transform switch of powers of two to reduce switch range.  /// For example, switch like:  /// switch (C) { case 1: case 2: case 64: case 128: } @@ -8037,6 +8112,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {    if (simplifyDuplicateSwitchArms(SI, DTU))      return requestResimplify(); +  if (simplifySwitchWhenUMin(SI, DTU)) +    return requestResimplify(); +    return false;  } diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c170..e86ab13 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {    SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix;    // Redirect exiting edges through a control flow hub.    ControlFlowHub CHub; +  bool Changed = false;    for (unsigned I = 0; I < ExitingBlocks.size(); ++I) {      BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {          bool UpdatedLI = false;          BasicBlock *NewSucc =              SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); +        // SplitCallBrEdge modifies the CFG because it creates an intermediate +        // block. So we need to set the changed flag no matter what the +        // ControlFlowHub is going to do later. +        Changed = true;          // Even if CallBr and Succ do not have a common parent loop, we need to          // add the new target block to the parent loop of the current loop.          
if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {    bool ChangedCFG;    std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize(        &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); +  ChangedCFG |= Changed;    if (!ChangedCFG)      return false; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 34b405c..bf3f52c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,    if (isa<PHINode>(S.getMainOp()) ||        isVectorLikeInstWithConstOps(S.getMainOp()))      return nullptr; +  // If the parent node is non-schedulable and the current node is copyable, and +  // any of parent instructions are used outside several basic blocks or in +  // bin-op node - cancel scheduling, it may cause wrong def-use deps in +  // analysis, leading to a crash. +  // Non-scheduled nodes may not have related ScheduleData model, which may lead +  // to a skipped dep analysis. +  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && +      EI.UserTE->doesNotNeedToSchedule() && +      EI.UserTE->getOpcode() != Instruction::PHI && +      any_of(EI.UserTE->Scalars, [](Value *V) { +        auto *I = dyn_cast<Instruction>(V); +        if (!I || I->hasOneUser()) +          return false; +        for (User *U : I->users()) { +          auto *UI = cast<Instruction>(U); +          if (isa<BinaryOperator>(UI)) +            return true; +        } +        return false; +      })) +    return std::nullopt;    bool HasCopyables = S.areInstructionsWithCopyableElements();    if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||         all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 9c869dd..d354933 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -92,7 +92,7 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const {    DGNode::print(OS, false);    if (PrintDeps) {      // Print memory preds. -    static constexpr const unsigned Indent = 4; +    static constexpr unsigned Indent = 4;      for (auto *Pred : MemPreds)        OS.indent(Indent) << "<-" << *Pred->getInstruction() << "\n";    } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 86dbd21..5534da9 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -25,14 +25,14 @@ static cl::opt<bool>                            "emit new instructions (*very* expensive)."));  #endif // NDEBUG -static constexpr const unsigned long StopAtDisabled = +static constexpr unsigned long StopAtDisabled =      std::numeric_limits<unsigned long>::max();  static cl::opt<unsigned long>      StopAt("sbvec-stop-at", cl::init(StopAtDisabled), cl::Hidden,             cl::desc("Vectorize if the invocation count is < than this. 
0 "                      "disables vectorization.")); -static constexpr const unsigned long StopBundleDisabled = +static constexpr unsigned long StopBundleDisabled =      std::numeric_limits<unsigned long>::max();  static cl::opt<unsigned long>      StopBundle("sbvec-stop-bndl", cl::init(StopBundleDisabled), cl::Hidden, diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ed2f80b..2de6921 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -43,7 +43,7 @@ cl::opt<std::string> AllowFiles(      "sbvec-allow-files", cl::init(".*"), cl::Hidden,      cl::desc("Run the vectorizer only on file paths that match any in the "               "list of comma-separated regex's.")); -static constexpr const char AllowFilesDelim = ','; +static constexpr char AllowFilesDelim = ',';  SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") {    if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9081ad7..cfe1f1e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -939,7 +939,7 @@ class VPIRMetadata {    SmallVector<std::pair<unsigned, MDNode *>> Metadata;  public: -  VPIRMetadata() {} +  VPIRMetadata() = default;    /// Adds metatadata that can be preserved from the original instruction    /// \p I. @@ -950,12 +950,9 @@ public:    VPIRMetadata(Instruction &I, LoopVersioning *LVer);    /// Copy constructor for cloning. -  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} +  VPIRMetadata(const VPIRMetadata &Other) = default; -  VPIRMetadata &operator=(const VPIRMetadata &Other) { -    Metadata = Other.Metadata; -    return *this; -  } +  VPIRMetadata &operator=(const VPIRMetadata &Other) = default;    /// Add all metadata to \p I.    
void applyMetadata(Instruction &I) const; @@ -1113,9 +1110,8 @@ public:    VP_CLASSOF_IMPL(VPDef::VPInstructionSC)    VPInstruction *clone() override { -    SmallVector<VPValue *, 2> Operands(operands()); -    auto *New = -        new VPInstruction(Opcode, Operands, *this, *this, getDebugLoc(), Name); +    auto *New = new VPInstruction(Opcode, operands(), *this, *this, +                                  getDebugLoc(), Name);      if (getUnderlyingValue())        New->setUnderlyingValue(getUnderlyingInstr());      return New; @@ -1229,10 +1225,9 @@ public:    }    VPInstruction *clone() override { -    SmallVector<VPValue *, 2> Operands(operands());      auto *New = -        new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this, -                                  getDebugLoc(), getName()); +        new VPInstructionWithType(getOpcode(), operands(), getResultType(), +                                  *this, getDebugLoc(), getName());      New->setUnderlyingValue(getUnderlyingValue());      return New;    } @@ -3214,6 +3209,9 @@ protected:        : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),          Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {      assert((Consecutive || !Reverse) && "Reverse implies consecutive"); +    assert(isa<VPVectorEndPointerRecipe>(getAddr()) || +           !Reverse && +               "Reversed acccess without VPVectorEndPointerRecipe address?");    }  public: @@ -3985,7 +3983,7 @@ class VPIRBasicBlock : public VPBasicBlock {          IRBB(IRBB) {}  public: -  ~VPIRBasicBlock() override {} +  ~VPIRBasicBlock() override = default;    static inline bool classof(const VPBlockBase *V) {      return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; @@ -4037,7 +4035,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {          IsReplicator(IsReplicator) {}  public: -  ~VPRegionBlock() override {} +  ~VPRegionBlock() override = default;    /// Method to support type inquiry through isa, cast, and dyn_cast.    static inline bool classof(const VPBlockBase *V) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b5b98c6..b57c448 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -313,7 +313,8 @@ private:      // Check for recipes that do not have opcodes.      if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> ||                    std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> || -                  std::is_same_v<RecipeTy, VPDerivedIVRecipe>) +                  std::is_same_v<RecipeTy, VPDerivedIVRecipe> || +                  std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>)        return DefR;      else        return DefR && DefR->getOpcode() == Opcode; @@ -686,6 +687,64 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {    return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2});  } +template <typename Addr_t, typename Mask_t> struct Load_match { +  Addr_t Addr; +  Mask_t Mask; + +  Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {} + +  template <typename OpTy> bool match(const OpTy *V) const { +    auto *Load = dyn_cast<VPWidenLoadRecipe>(V); +    if (!Load || !Addr.match(Load->getAddr()) || !Load->isMasked() || +        !Mask.match(Load->getMask())) +      return false; +    return true; +  } +}; + +/// Match a (possibly reversed) masked load. 
+template <typename Addr_t, typename Mask_t> +inline Load_match<Addr_t, Mask_t> m_MaskedLoad(const Addr_t &Addr, +                                               const Mask_t &Mask) { +  return Load_match<Addr_t, Mask_t>(Addr, Mask); +} + +template <typename Addr_t, typename Val_t, typename Mask_t> struct Store_match { +  Addr_t Addr; +  Val_t Val; +  Mask_t Mask; + +  Store_match(Addr_t Addr, Val_t Val, Mask_t Mask) +      : Addr(Addr), Val(Val), Mask(Mask) {} + +  template <typename OpTy> bool match(const OpTy *V) const { +    auto *Store = dyn_cast<VPWidenStoreRecipe>(V); +    if (!Store || !Addr.match(Store->getAddr()) || +        !Val.match(Store->getStoredValue()) || !Store->isMasked() || +        !Mask.match(Store->getMask())) +      return false; +    return true; +  } +}; + +/// Match a (possibly reversed) masked store. +template <typename Addr_t, typename Val_t, typename Mask_t> +inline Store_match<Addr_t, Val_t, Mask_t> +m_MaskedStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) { +  return Store_match<Addr_t, Val_t, Mask_t>(Addr, Val, Mask); +} + +template <typename Op0_t, typename Op1_t> +using VectorEndPointerRecipe_match = +    Recipe_match<std::tuple<Op0_t, Op1_t>, 0, +                 /*Commutative*/ false, VPVectorEndPointerRecipe>; + +template <typename Op0_t, typename Op1_t> +VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0, +                                                       const Op1_t &Op1) { +  return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1); +} +  /// Match a call argument at a given argument index.  template <typename Opnd_t> struct Argument_match {    /// Call argument index to match. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1a02117..1ee405a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -162,8 +162,12 @@ bool VPRecipeBase::mayHaveSideEffects() const {    case VPPredInstPHISC:    case VPVectorEndPointerSC:      return false; -  case VPInstructionSC: -    return mayWriteToMemory(); +  case VPInstructionSC: { +    auto *VPI = cast<VPInstruction>(this); +    return mayWriteToMemory() || +           VPI->getOpcode() == VPInstruction::BranchOnCount || +           VPI->getOpcode() == VPInstruction::BranchOnCond; +  }    case VPWidenCallSC: {      Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();      return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); @@ -1241,6 +1245,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {    case Instruction::Select:    case Instruction::PHI:    case VPInstruction::AnyOf: +  case VPInstruction::BranchOnCond: +  case VPInstruction::BranchOnCount:    case VPInstruction::Broadcast:    case VPInstruction::BuildStructVector:    case VPInstruction::BuildVector: diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h index 77ff36c..44972c68 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.h +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h @@ -89,8 +89,7 @@ class VPlanSlp {    /// Width of the widest combined bundle in bits.    unsigned WidestBundleBits = 0; -  using MultiNodeOpTy = -      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; +  using MultiNodeOpTy = std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;    // Input operand bundles for the current multi node. 
Each multi node operand    // bundle contains values not matching the multi node's opcode. They will diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f50bf29..2588c87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -151,59 +151,65 @@ static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {  static bool sinkScalarOperands(VPlan &Plan) {    auto Iter = vp_depth_first_deep(Plan.getEntry()); +  bool ScalarVFOnly = Plan.hasScalarVFOnly();    bool Changed = false; + +  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; +  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( +                                        VPBasicBlock *SinkTo, VPValue *Op) { +    auto *Candidate = +        dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); +    if (!Candidate) +      return; + +    // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes +    // for now. +    if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) +      return; + +    if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || +        Candidate->mayReadOrWriteMemory()) +      return; + +    if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) +      if (!ScalarVFOnly && RepR->isSingleScalar()) +        return; + +    WorkList.insert({SinkTo, Candidate}); +  }; +    // First, collect the operands of all recipes in replicate blocks as seeds for    // sinking. -  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;    for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {      VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();      if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)        continue; -    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]); -    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) +    VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); +    if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())        continue; -    for (auto &Recipe : *VPBB) { +    for (auto &Recipe : *VPBB)        for (VPValue *Op : Recipe.operands()) -        if (auto *Def = -                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) -          WorkList.insert({VPBB, Def}); -    } +        InsertIfValidSinkCandidate(VPBB, Op);    } -  bool ScalarVFOnly = Plan.hasScalarVFOnly();    // Try to sink each replicate or scalar IV steps recipe in the worklist.    for (unsigned I = 0; I != WorkList.size(); ++I) {      VPBasicBlock *SinkTo;      VPSingleDefRecipe *SinkCandidate;      std::tie(SinkTo, SinkCandidate) = WorkList[I]; -    if (SinkCandidate->getParent() == SinkTo || -        SinkCandidate->mayHaveSideEffects() || -        SinkCandidate->mayReadOrWriteMemory()) -      continue; -    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) { -      if (!ScalarVFOnly && RepR->isSingleScalar()) -        continue; -    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate)) -      continue; -    bool NeedsDuplicating = false; -    // All recipe users of the sink candidate must be in the same block SinkTo -    // or all users outside of SinkTo must be uniform-after-vectorization ( -    // i.e., only first lane is used) . In the latter case, we need to duplicate -    // SinkCandidate. 
-    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, -                            SinkCandidate](VPUser *U) { -      auto *UI = cast<VPRecipeBase>(U); -      if (UI->getParent() == SinkTo) -        return true; -      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); -      // We only know how to duplicate VPReplicateRecipes and -      // VPScalarIVStepsRecipes for now. -      return NeedsDuplicating && -             isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate); -    }; -    if (!all_of(SinkCandidate->users(), CanSinkWithUser)) +    // All recipe users of SinkCandidate must be in the same block SinkTo or all +    // users outside of SinkTo must only use the first lane of SinkCandidate. In +    // the latter case, we need to duplicate SinkCandidate. +    auto UsersOutsideSinkTo = +        make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { +          return cast<VPRecipeBase>(U)->getParent() != SinkTo; +        }); +    if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { +          return !U->onlyFirstLaneUsed(SinkCandidate); +        }))        continue; +    bool NeedsDuplicating = !UsersOutsideSinkTo.empty();      if (NeedsDuplicating) {        if (ScalarVFOnly) @@ -228,9 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) {      }      SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());      for (VPValue *Op : SinkCandidate->operands()) -      if (auto *Def = -              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) -        WorkList.insert({SinkTo, Def}); +      InsertIfValidSinkCandidate(SinkTo, Op);      Changed = true;    }    return Changed; @@ -1056,13 +1060,9 @@ static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R,    return nullptr;  } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { -  VPlan *Plan = R.getParent()->getPlan(); - -  auto *Def = dyn_cast<VPSingleDefRecipe>(&R); -  if (!Def) -    return; +/// Try to simplify VPSingleDefRecipe \p Def. +static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { +  VPlan *Plan = Def->getParent()->getPlan();    // Simplification of live-in IR values for SingleDef recipes using    // InstSimplifyFolder. @@ -1072,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {      return Def->replaceAllUsesWith(V);    // Fold PredPHI LiveIn -> LiveIn. -  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { +  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {      VPValue *Op = PredPHI->getOperand(0);      if (Op->isLiveIn())        PredPHI->replaceAllUsesWith(Op); @@ -1091,12 +1091,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {          return;        if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { -        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) +        unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))                                   ? Instruction::SExt                                   : Instruction::ZExt;          auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,                                              TruncTy); -        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { +        if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {            // UnderlyingExt has distinct return type, used to retain legacy cost.            
Ext->setUnderlyingValue(UnderlyingExt);          } @@ -1159,7 +1159,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {          Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));    // x && !x -> 0 -  if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) +  if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))      return Def->replaceAllUsesWith(Plan->getFalse());    if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) @@ -1187,8 +1187,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {      return Def->replaceAllUsesWith(A);    if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt()))) -    return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) -                                                        : R.getOperand(0)); +    return Def->replaceAllUsesWith( +        Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));    if (match(Def, m_Not(m_VPValue(A)))) {      if (match(A, m_Not(m_VPValue(A)))) @@ -1217,8 +1217,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {          }          // If Cmp doesn't have a debug location, use the one from the negation,          // to preserve the location. -        if (!Cmp->getDebugLoc() && R.getDebugLoc()) -          Cmp->setDebugLoc(R.getDebugLoc()); +        if (!Cmp->getDebugLoc() && Def->getDebugLoc()) +          Cmp->setDebugLoc(Def->getDebugLoc());        }      }    } @@ -1244,7 +1244,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {    if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),                                                    m_VPValue(X), m_VPValue())) &&        match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) && -      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) { +      TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {      Def->setOperand(1, Def->getOperand(0));      Def->setOperand(0, Y);      return; @@ -1252,36 +1252,36 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {    if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {      if (Phi->getOperand(0) == Phi->getOperand(1)) -      Def->replaceAllUsesWith(Phi->getOperand(0)); +      Phi->replaceAllUsesWith(Phi->getOperand(0));      return;    }    // Look through ExtractLastElement (BuildVector ....). -  if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()), -                            m_ExtractLastLanePerPart(m_BuildVector())))) { -    auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); +  if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()), +                             m_ExtractLastLanePerPart(m_BuildVector())))) { +    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));      Def->replaceAllUsesWith(          BuildVector->getOperand(BuildVector->getNumOperands() - 1));      return;    }    // Look through ExtractPenultimateElement (BuildVector ....). 
-  if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( -                    m_BuildVector()))) { -    auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); +  if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( +                     m_BuildVector()))) { +    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));      Def->replaceAllUsesWith(          BuildVector->getOperand(BuildVector->getNumOperands() - 2));      return;    }    uint64_t Idx; -  if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { -    auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); +  if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { +    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));      Def->replaceAllUsesWith(BuildVector->getOperand(Idx));      return;    } -  if (match(Def, m_BuildVector()) && all_equal(R.operands())) { +  if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {      Def->replaceAllUsesWith(          Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));      return; @@ -1303,7 +1303,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {        isa<VPPhi>(X)) {      auto *Phi = cast<VPPhi>(X);      if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) && -        Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) { +        Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) {        Phi->setOperand(0, Y);        Def->replaceAllUsesWith(Phi);        return; @@ -1311,7 +1311,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {    }    // VPVectorPointer for part 0 can be replaced by their start pointer. -  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) { +  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {      if (VecPtr->isFirstPart()) {        VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));        return; @@ -1366,9 +1366,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) {        Plan.getEntry());    VPTypeAnalysis TypeInfo(Plan);    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { -    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { -      simplifyRecipe(R, TypeInfo); -    } +    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) +      if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R)) +        simplifyRecipe(Def, TypeInfo);    }  } @@ -2521,90 +2521,102 @@ void VPlanTransforms::addActiveLaneMask(    HeaderMask->eraseFromParent();  } +template <typename Op0_t, typename Op1_t> struct RemoveMask_match { +  Op0_t In; +  Op1_t &Out; + +  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {} + +  template <typename OpTy> bool match(OpTy *V) const { +    if (m_Specific(In).match(V)) { +      Out = nullptr; +      return true; +    } +    if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V)) +      return true; +    return false; +  } +}; + +/// Match a specific mask \p In, or a combination of it (logical-and In, Out). +/// Returns the remaining part \p Out if so, or nullptr otherwise. +template <typename Op0_t, typename Op1_t> +static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In, +                                                          Op1_t &Out) { +  return RemoveMask_match<Op0_t, Op1_t>(In, Out); +} +  /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding  /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based  /// recipe could be created.  
/// \p HeaderMask  Header Mask.  /// \p CurRecipe   Recipe to be transform.  /// \p TypeInfo    VPlan-based type analysis. -/// \p AllOneMask  The vector mask parameter of vector-predication intrinsics.  /// \p EVL         The explicit vector length parameter of vector-predication  /// intrinsics.  static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,                                         VPRecipeBase &CurRecipe, -                                       VPTypeAnalysis &TypeInfo, -                                       VPValue &AllOneMask, VPValue &EVL) { -  // FIXME: Don't transform recipes to EVL recipes if they're not masked by the -  // header mask. -  auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { -    assert(OrigMask && "Unmasked recipe when folding tail"); -    // HeaderMask will be handled using EVL. -    VPValue *Mask; -    if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) -      return Mask; -    return HeaderMask == OrigMask ? nullptr : OrigMask; -  }; +                                       VPTypeAnalysis &TypeInfo, VPValue &EVL) { +  VPlan *Plan = CurRecipe.getParent()->getPlan(); +  VPValue *Addr, *Mask, *EndPtr;    /// Adjust any end pointers so that they point to the end of EVL lanes not VF. -  auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { -    auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); -    if (!EndPtr) -      return Addr; -    assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && -           "VPVectorEndPointerRecipe with non-VF VF operand?"); -    assert( -        all_of(EndPtr->users(), -               [](VPUser *U) { -                 return cast<VPWidenMemoryRecipe>(U)->isReverse(); -               }) && -        "VPVectorEndPointRecipe not used by reversed widened memory recipe?"); -    VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); -    EVLAddr->insertBefore(&CurRecipe); -    EVLAddr->setOperand(1, &EVL); -    return EVLAddr; +  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) { +    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone(); +    EVLEndPtr->insertBefore(&CurRecipe); +    EVLEndPtr->setOperand(1, &EVL); +    return EVLEndPtr;    }; -  return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) -      .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { -        VPValue *NewMask = GetNewMask(L->getMask()); -        VPValue *NewAddr = GetNewAddr(L->getAddr()); -        return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); -      }) -      .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { -        VPValue *NewMask = GetNewMask(S->getMask()); -        VPValue *NewAddr = GetNewAddr(S->getAddr()); -        return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); -      }) -      .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { -        VPValue *NewMask = GetNewMask(IR->getMask()); -        return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); -      }) -      .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { -        VPValue *NewMask = GetNewMask(Red->getCondOp()); -        return new VPReductionEVLRecipe(*Red, EVL, NewMask); -      }) -      .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { -        VPValue *LHS, *RHS; -        // Transform select with a header mask condition -        //   select(header_mask, LHS, RHS) -        // into vector predication merge. 
-        //   vp.merge(all-true, LHS, RHS, EVL) -        if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), -                                 m_VPValue(RHS)))) -          return nullptr; -        // Use all true as the condition because this transformation is -        // limited to selects whose condition is a header mask. -        return new VPWidenIntrinsicRecipe( -            Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, -            TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); -      }) -      .Default([&](VPRecipeBase *R) { return nullptr; }); +  if (match(&CurRecipe, +            m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) && +      !cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) +    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr, +                                    EVL, Mask); + +  if (match(&CurRecipe, +            m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) && +      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && +      cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) +    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), +                                    AdjustEndPtr(EndPtr), EVL, Mask); + +  if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(), +                                      m_RemoveMask(HeaderMask, Mask))) && +      !cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) +    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr, +                                     EVL, Mask); + +  if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(), +                                      m_RemoveMask(HeaderMask, Mask))) && +      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && +      cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) +    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), +                                     AdjustEndPtr(EndPtr), EVL, Mask); + +  if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe)) +    if (Rdx->isConditional() && +        match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask))) +      return new VPReductionEVLRecipe(*Rdx, EVL, Mask); + +  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe)) +    if (Interleave->getMask() && +        match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask))) +      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask); + +  VPValue *LHS, *RHS; +  if (match(&CurRecipe, +            m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) +    return new VPWidenIntrinsicRecipe( +        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, +        TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + +  return nullptr;  }  /// Replace recipes with their EVL variants.  
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {    VPTypeAnalysis TypeInfo(Plan); -  VPValue *AllOneMask = Plan.getTrue();    VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();    VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2664,7 +2676,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {              ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));          VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(              Intrinsic::experimental_vp_splice, -            {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, +            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},              TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());          VPSplice->insertBefore(&R);          R.getVPSingleValue()->replaceAllUsesWith(VPSplice); @@ -2698,7 +2710,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {    for (VPUser *U : collectUsersRecursively(EVLMask)) {      auto *CurRecipe = cast<VPRecipeBase>(U);      VPRecipeBase *EVLRecipe = -        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); +        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);      if (!EVLRecipe)        continue; @@ -4174,7 +4186,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,    unsigned VFMinVal = VF.getKnownMinValue();    SmallVector<VPInterleaveRecipe *> StoreGroups;    for (auto &R : *VectorLoop->getEntryBasicBlock()) { -    if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount())) +    if (isa<VPCanonicalIVPHIRecipe>(&R))        continue;      if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 8c23e78..c6380d3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -32,22 +32,17 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) {  }  VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { -  VPValue *Expanded = nullptr;    if (auto *E = dyn_cast<SCEVConstant>(Expr)) -    Expanded = Plan.getOrAddLiveIn(E->getValue()); -  else { -    auto *U = dyn_cast<SCEVUnknown>(Expr); -    // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction -    // value. Otherwise the value may be defined in a loop and using it directly -    // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA -    // form. -    if (U && !isa<Instruction>(U->getValue())) { -      Expanded = Plan.getOrAddLiveIn(U->getValue()); -    } else { -      Expanded = new VPExpandSCEVRecipe(Expr); -      Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); -    } -  } +    return Plan.getOrAddLiveIn(E->getValue()); +  // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction +  // value. Otherwise the value may be defined in a loop and using it directly +  // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA +  // form. +  auto *U = dyn_cast<SCEVUnknown>(Expr); +  if (U && !isa<Instruction>(U->getValue())) +    return Plan.getOrAddLiveIn(U->getValue()); +  auto *Expanded = new VPExpandSCEVRecipe(Expr); +  Plan.getEntry()->appendRecipe(Expanded);    return Expanded;  }  | 
