diff options
Diffstat (limited to 'clang/lib/Analysis')
-rw-r--r-- | clang/lib/Analysis/ExprMutationAnalyzer.cpp | 25 | ||||
-rw-r--r-- | clang/lib/Analysis/LifetimeSafety.cpp | 505 | ||||
-rw-r--r-- | clang/lib/Analysis/LifetimeSafety.md | 230 |
3 files changed, 549 insertions, 211 deletions
diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 3fcd348..1e376da 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -755,22 +755,23 @@ ExprMutationAnalyzer::Analyzer::findPointeeMemberMutation(const Expr *Exp) { const Stmt * ExprMutationAnalyzer::Analyzer::findPointeeToNonConst(const Expr *Exp) { - const auto NonConstPointerOrDependentType = - type(anyOf(nonConstPointerType(), isDependentType())); + const auto NonConstPointerOrNonConstRefOrDependentType = type( + anyOf(nonConstPointerType(), nonConstReferenceType(), isDependentType())); // assign const auto InitToNonConst = - varDecl(hasType(NonConstPointerOrDependentType), + varDecl(hasType(NonConstPointerOrNonConstRefOrDependentType), hasInitializer(expr(canResolveToExprPointee(Exp)).bind("stmt"))); - const auto AssignToNonConst = - binaryOperation(hasOperatorName("="), - hasLHS(expr(hasType(NonConstPointerOrDependentType))), - hasRHS(canResolveToExprPointee(Exp))); + const auto AssignToNonConst = binaryOperation( + hasOperatorName("="), + hasLHS(expr(hasType(NonConstPointerOrNonConstRefOrDependentType))), + hasRHS(canResolveToExprPointee(Exp))); // arguments like const auto ArgOfInstantiationDependent = allOf( hasAnyArgument(canResolveToExprPointee(Exp)), isInstantiationDependent()); - const auto ArgOfNonConstParameter = forEachArgumentWithParamType( - canResolveToExprPointee(Exp), NonConstPointerOrDependentType); + const auto ArgOfNonConstParameter = + forEachArgumentWithParamType(canResolveToExprPointee(Exp), + NonConstPointerOrNonConstRefOrDependentType); const auto CallLikeMatcher = anyOf(ArgOfNonConstParameter, ArgOfInstantiationDependent); const auto PassAsNonConstArg = @@ -779,9 +780,9 @@ ExprMutationAnalyzer::Analyzer::findPointeeToNonConst(const Expr *Exp) { parenListExpr(has(canResolveToExprPointee(Exp))), initListExpr(hasAnyInit(canResolveToExprPointee(Exp))))); // cast - const auto CastToNonConst = - explicitCastExpr(hasSourceExpression(canResolveToExprPointee(Exp)), - hasDestinationType(NonConstPointerOrDependentType)); + const auto CastToNonConst = explicitCastExpr( + hasSourceExpression(canResolveToExprPointee(Exp)), + hasDestinationType(NonConstPointerOrNonConstRefOrDependentType)); // capture // FIXME: false positive if the pointee does not change in lambda diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index c18b8fb..6196ec3 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -19,12 +19,13 @@ #include "llvm/ADT/ImmutableMap.h" #include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/PointerUnion.h" -#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" #include <cstdint> #include <memory> +#include <optional> namespace clang::lifetimes { namespace internal { @@ -872,22 +873,19 @@ public: InStates[Start] = D.getInitialState(); W.enqueueBlock(Start); - llvm::SmallBitVector Visited(Cfg.getNumBlockIDs() + 1); - while (const CFGBlock *B = W.dequeue()) { - Lattice StateIn = getInState(B); + Lattice StateIn = *getInState(B); Lattice StateOut = transferBlock(B, StateIn); OutStates[B] = StateOut; - Visited.set(B->getBlockID()); for (const CFGBlock *AdjacentB : isForward() ? B->succs() : B->preds()) { if (!AdjacentB) continue; - Lattice OldInState = getInState(AdjacentB); - Lattice NewInState = D.join(OldInState, StateOut); + std::optional<Lattice> OldInState = getInState(AdjacentB); + Lattice NewInState = + !OldInState ? StateOut : D.join(*OldInState, StateOut); // Enqueue the adjacent block if its in-state has changed or if we have - // never visited it. - if (!Visited.test(AdjacentB->getBlockID()) || - NewInState != OldInState) { + // never seen it. + if (!OldInState || NewInState != *OldInState) { InStates[AdjacentB] = NewInState; W.enqueueBlock(AdjacentB); } @@ -898,7 +896,12 @@ public: protected: Lattice getState(ProgramPoint P) const { return PerPointStates.lookup(P); } - Lattice getInState(const CFGBlock *B) const { return InStates.lookup(B); } + std::optional<Lattice> getInState(const CFGBlock *B) const { + auto It = InStates.find(B); + if (It == InStates.end()) + return std::nullopt; + return It->second; + } Lattice getOutState(const CFGBlock *B) const { return OutStates.lookup(B); } @@ -974,19 +977,21 @@ static llvm::ImmutableSet<T> join(llvm::ImmutableSet<T> A, return A; } -/// Checks if set A is a subset of set B. -template <typename T> -static bool isSubsetOf(const llvm::ImmutableSet<T> &A, - const llvm::ImmutableSet<T> &B) { - // Empty set is a subset of all sets. - if (A.isEmpty()) - return true; - - for (const T &Elem : A) - if (!B.contains(Elem)) - return false; - return true; -} +/// Describes the strategy for joining two `ImmutableMap` instances, primarily +/// differing in how they handle keys that are unique to one of the maps. +/// +/// A `Symmetric` join is universally correct, while an `Asymmetric` join +/// serves as a performance optimization. The latter is applicable only when the +/// join operation possesses a left identity element, allowing for a more +/// efficient, one-sided merge. +enum class JoinKind { + /// A symmetric join applies the `JoinValues` operation to keys unique to + /// either map, ensuring that values from both maps contribute to the result. + Symmetric, + /// An asymmetric join preserves keys unique to the first map as-is, while + /// applying the `JoinValues` operation only to keys unique to the second map. + Asymmetric, +}; /// Computes the key-wise union of two ImmutableMaps. // TODO(opt): This key-wise join is a performance bottleneck. A more @@ -994,22 +999,29 @@ static bool isSubsetOf(const llvm::ImmutableSet<T> &A, // instead of the current AVL-tree-based ImmutableMap. template <typename K, typename V, typename Joiner> static llvm::ImmutableMap<K, V> -join(llvm::ImmutableMap<K, V> A, llvm::ImmutableMap<K, V> B, - typename llvm::ImmutableMap<K, V>::Factory &F, Joiner JoinValues) { +join(const llvm::ImmutableMap<K, V> &A, const llvm::ImmutableMap<K, V> &B, + typename llvm::ImmutableMap<K, V>::Factory &F, Joiner JoinValues, + JoinKind Kind) { if (A.getHeight() < B.getHeight()) - std::swap(A, B); + return join(B, A, F, JoinValues, Kind); // For each element in B, join it with the corresponding element in A // (or with an empty value if it doesn't exist in A). + llvm::ImmutableMap<K, V> Res = A; for (const auto &Entry : B) { const K &Key = Entry.first; const V &ValB = Entry.second; - if (const V *ValA = A.lookup(Key)) - A = F.add(A, Key, JoinValues(*ValA, ValB)); - else - A = F.add(A, Key, ValB); + Res = F.add(Res, Key, JoinValues(A.lookup(Key), &ValB)); + } + if (Kind == JoinKind::Symmetric) { + for (const auto &Entry : A) { + const K &Key = Entry.first; + const V &ValA = Entry.second; + if (!B.contains(Key)) + Res = F.add(Res, Key, JoinValues(&ValA, nullptr)); + } } - return A; + return Res; } } // namespace utils @@ -1017,19 +1029,6 @@ join(llvm::ImmutableMap<K, V> A, llvm::ImmutableMap<K, V> B, // Loan Propagation Analysis // ========================================================================= // -using OriginLoanMap = llvm::ImmutableMap<OriginID, LoanSet>; -using ExpiredLoanMap = llvm::ImmutableMap<LoanID, const ExpireFact *>; - -/// An object to hold the factories for immutable collections, ensuring -/// that all created states share the same underlying memory management. -struct LifetimeFactory { - llvm::BumpPtrAllocator Allocator; - OriginLoanMap::Factory OriginMapFactory{Allocator, /*canonicalize=*/false}; - LoanSet::Factory LoanSetFactory{Allocator, /*canonicalize=*/false}; - ExpiredLoanMap::Factory ExpiredLoanMapFactory{Allocator, - /*canonicalize=*/false}; -}; - /// Represents the dataflow lattice for loan propagation. /// /// This lattice tracks which loans each origin may hold at a given program @@ -1073,10 +1072,10 @@ class LoanPropagationAnalysis public: LoanPropagationAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F, - LifetimeFactory &LFactory) - : DataflowAnalysis(C, AC, F), - OriginLoanMapFactory(LFactory.OriginMapFactory), - LoanSetFactory(LFactory.LoanSetFactory) {} + OriginLoanMap::Factory &OriginLoanMapFactory, + LoanSet::Factory &LoanSetFactory) + : DataflowAnalysis(C, AC, F), OriginLoanMapFactory(OriginLoanMapFactory), + LoanSetFactory(LoanSetFactory) {} using Base::transfer; @@ -1087,11 +1086,19 @@ public: /// Merges two lattices by taking the union of loans for each origin. // TODO(opt): Keep the state small by removing origins which become dead. Lattice join(Lattice A, Lattice B) { - OriginLoanMap JoinedOrigins = - utils::join(A.Origins, B.Origins, OriginLoanMapFactory, - [&](LoanSet S1, LoanSet S2) { - return utils::join(S1, S2, LoanSetFactory); - }); + OriginLoanMap JoinedOrigins = utils::join( + A.Origins, B.Origins, OriginLoanMapFactory, + [&](const LoanSet *S1, const LoanSet *S2) { + assert((S1 || S2) && "unexpectedly merging 2 empty sets"); + if (!S1) + return *S2; + if (!S2) + return *S1; + return utils::join(*S1, *S2, LoanSetFactory); + }, + // Asymmetric join is a performance win. For origins present only on one + // branch, the loan set can be carried over as-is. + utils::JoinKind::Asymmetric); return Lattice(JoinedOrigins); } @@ -1120,12 +1127,12 @@ public: OriginLoanMapFactory.add(In.Origins, DestOID, MergedLoans)); } - LoanSet getLoans(OriginID OID, ProgramPoint P) { + LoanSet getLoans(OriginID OID, ProgramPoint P) const { return getLoans(getState(P), OID); } private: - LoanSet getLoans(Lattice L, OriginID OID) { + LoanSet getLoans(Lattice L, OriginID OID) const { if (auto *Loans = L.Origins.lookup(OID)) return *Loans; return LoanSetFactory.getEmptySet(); @@ -1133,96 +1140,195 @@ private: }; // ========================================================================= // -// Expired Loans Analysis +// Live Origins Analysis +// ========================================================================= // +// +// A backward dataflow analysis that determines which origins are "live" at each +// program point. An origin is "live" at a program point if there's a potential +// future use of the pointer it represents. Liveness is "generated" by a read of +// origin's loan set (e.g., a `UseFact`) and is "killed" (i.e., it stops being +// live) when its loan set is overwritten (e.g. a OriginFlow killing the +// destination origin). +// +// This information is used for detecting use-after-free errors, as it allows us +// to check if a live origin holds a loan to an object that has already expired. // ========================================================================= // -/// The dataflow lattice for tracking the set of expired loans. -struct ExpiredLattice { - /// Map from an expired `LoanID` to the `ExpireFact` that made it expire. - ExpiredLoanMap Expired; +/// Information about why an origin is live at a program point. +struct LivenessInfo { + /// The use that makes the origin live. If liveness is propagated from + /// multiple uses along different paths, this will point to the use appearing + /// earlier in the translation unit. + /// This is 'null' when the origin is not live. + const UseFact *CausingUseFact; + /// The kind of liveness of the origin. + /// `Must`: The origin is live on all control-flow paths from the current + /// point to the function's exit (i.e. the current point is dominated by a set + /// of uses). + /// `Maybe`: indicates it is live on some but not all paths. + /// + /// This determines the diagnostic's confidence level. + /// `Must`-be-alive at expiration implies a definite use-after-free, + /// while `Maybe`-be-alive suggests a potential one on some paths. + LivenessKind Kind; + + LivenessInfo() : CausingUseFact(nullptr), Kind(LivenessKind::Dead) {} + LivenessInfo(const UseFact *UF, LivenessKind K) + : CausingUseFact(UF), Kind(K) {} + + bool operator==(const LivenessInfo &Other) const { + return CausingUseFact == Other.CausingUseFact && Kind == Other.Kind; + } + bool operator!=(const LivenessInfo &Other) const { return !(*this == Other); } + + void Profile(llvm::FoldingSetNodeID &IDBuilder) const { + IDBuilder.AddPointer(CausingUseFact); + IDBuilder.Add(Kind); + } +}; + +using LivenessMap = llvm::ImmutableMap<OriginID, LivenessInfo>; - ExpiredLattice() : Expired(nullptr) {}; - explicit ExpiredLattice(ExpiredLoanMap M) : Expired(M) {} +/// The dataflow lattice for origin liveness analysis. +/// It tracks which origins are live, why they're live (which UseFact), +/// and the confidence level of that liveness. +struct LivenessLattice { + LivenessMap LiveOrigins; - bool operator==(const ExpiredLattice &Other) const { - return Expired == Other.Expired; + LivenessLattice() : LiveOrigins(nullptr) {}; + + explicit LivenessLattice(LivenessMap L) : LiveOrigins(L) {} + + bool operator==(const LivenessLattice &Other) const { + return LiveOrigins == Other.LiveOrigins; } - bool operator!=(const ExpiredLattice &Other) const { + + bool operator!=(const LivenessLattice &Other) const { return !(*this == Other); } - void dump(llvm::raw_ostream &OS) const { - OS << "ExpiredLattice State:\n"; - if (Expired.isEmpty()) + void dump(llvm::raw_ostream &OS, const OriginManager &OM) const { + if (LiveOrigins.isEmpty()) OS << " <empty>\n"; - for (const auto &[ID, _] : Expired) - OS << " Loan " << ID << " is expired\n"; + for (const auto &Entry : LiveOrigins) { + OriginID OID = Entry.first; + const LivenessInfo &Info = Entry.second; + OS << " "; + OM.dump(OID, OS); + OS << " is "; + switch (Info.Kind) { + case LivenessKind::Must: + OS << "definitely"; + break; + case LivenessKind::Maybe: + OS << "maybe"; + break; + case LivenessKind::Dead: + llvm_unreachable("liveness kind of live origins should not be dead."); + } + OS << " live at this point\n"; + } } }; -/// The analysis that tracks which loans have expired. -class ExpiredLoansAnalysis - : public DataflowAnalysis<ExpiredLoansAnalysis, ExpiredLattice, - Direction::Forward> { - - ExpiredLoanMap::Factory &Factory; +/// The analysis that tracks which origins are live, with granular information +/// about the causing use fact and confidence level. This is a backward +/// analysis. +class LiveOriginAnalysis + : public DataflowAnalysis<LiveOriginAnalysis, LivenessLattice, + Direction::Backward> { + FactManager &FactMgr; + LivenessMap::Factory &Factory; public: - ExpiredLoansAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F, - LifetimeFactory &Factory) - : DataflowAnalysis(C, AC, F), Factory(Factory.ExpiredLoanMapFactory) {} - - using Base::transfer; + LiveOriginAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F, + LivenessMap::Factory &SF) + : DataflowAnalysis(C, AC, F), FactMgr(F), Factory(SF) {} + using DataflowAnalysis<LiveOriginAnalysis, Lattice, + Direction::Backward>::transfer; - StringRef getAnalysisName() const { return "ExpiredLoans"; } + StringRef getAnalysisName() const { return "LiveOrigins"; } Lattice getInitialState() { return Lattice(Factory.getEmptyMap()); } - /// Merges two lattices by taking the union of the two expired loans. - Lattice join(Lattice L1, Lattice L2) { - return Lattice( - utils::join(L1.Expired, L2.Expired, Factory, - // Take the last expiry fact to make this hermetic. - [](const ExpireFact *F1, const ExpireFact *F2) { - return F1->getExpiryLoc() > F2->getExpiryLoc() ? F1 : F2; - })); - } - - Lattice transfer(Lattice In, const ExpireFact &F) { - return Lattice(Factory.add(In.Expired, F.getLoanID(), &F)); - } - - // Removes the loan from the set of expired loans. - // - // When a loan is re-issued (e.g., in a loop), it is no longer considered - // expired. A loan can be in the expired set at the point of issue due to - // the dataflow state from a previous loop iteration being propagated along - // a backedge in the CFG. - // - // Note: This has a subtle false-negative though where a loan from previous - // iteration is not overwritten by a reissue. This needs careful tracking - // of loans "across iterations" which can be considered for future - // enhancements. - // - // void foo(int safe) { - // int* p = &safe; - // int* q = &safe; - // while (condition()) { - // int x = 1; - // p = &x; // A loan to 'x' is issued to 'p' in every iteration. - // if (condition()) { - // q = p; - // } - // (void)*p; // OK — 'p' points to 'x' from new iteration. - // (void)*q; // UaF - 'q' still points to 'x' from previous iteration - // // which is now destroyed. - // } - // } - Lattice transfer(Lattice In, const IssueFact &F) { - return Lattice(Factory.remove(In.Expired, F.getLoanID())); + /// Merges two lattices by combining liveness information. + /// When the same origin has different confidence levels, we take the lower + /// one. + Lattice join(Lattice L1, Lattice L2) const { + LivenessMap Merged = L1.LiveOrigins; + // Take the earliest UseFact to make the join hermetic and commutative. + auto CombineUseFact = [](const UseFact &A, + const UseFact &B) -> const UseFact * { + return A.getUseExpr()->getExprLoc() < B.getUseExpr()->getExprLoc() ? &A + : &B; + }; + auto CombineLivenessKind = [](LivenessKind K1, + LivenessKind K2) -> LivenessKind { + assert(K1 != LivenessKind::Dead && "LivenessKind should not be dead."); + assert(K2 != LivenessKind::Dead && "LivenessKind should not be dead."); + // Only return "Must" if both paths are "Must", otherwise Maybe. + if (K1 == LivenessKind::Must && K2 == LivenessKind::Must) + return LivenessKind::Must; + return LivenessKind::Maybe; + }; + auto CombineLivenessInfo = [&](const LivenessInfo *L1, + const LivenessInfo *L2) -> LivenessInfo { + assert((L1 || L2) && "unexpectedly merging 2 empty sets"); + if (!L1) + return LivenessInfo(L2->CausingUseFact, LivenessKind::Maybe); + if (!L2) + return LivenessInfo(L1->CausingUseFact, LivenessKind::Maybe); + return LivenessInfo( + CombineUseFact(*L1->CausingUseFact, *L2->CausingUseFact), + CombineLivenessKind(L1->Kind, L2->Kind)); + }; + return Lattice(utils::join( + L1.LiveOrigins, L2.LiveOrigins, Factory, CombineLivenessInfo, + // A symmetric join is required here. If an origin is live on one + // branch but not the other, its confidence must be demoted to `Maybe`. + utils::JoinKind::Symmetric)); + } + + /// A read operation makes the origin live with definite confidence, as it + /// dominates this program point. A write operation kills the liveness of + /// the origin since it overwrites the value. + Lattice transfer(Lattice In, const UseFact &UF) { + OriginID OID = UF.getUsedOrigin(FactMgr.getOriginMgr()); + // Write kills liveness. + if (UF.isWritten()) + return Lattice(Factory.remove(In.LiveOrigins, OID)); + // Read makes origin live with definite confidence (dominates this point). + return Lattice(Factory.add(In.LiveOrigins, OID, + LivenessInfo(&UF, LivenessKind::Must))); + } + + /// Issuing a new loan to an origin kills its liveness. + Lattice transfer(Lattice In, const IssueFact &IF) { + return Lattice(Factory.remove(In.LiveOrigins, IF.getOriginID())); } - ExpiredLoanMap getExpiredLoans(ProgramPoint P) { return getState(P).Expired; } + /// An OriginFlow kills the liveness of the destination origin if `KillDest` + /// is true. Otherwise, it propagates liveness from destination to source. + Lattice transfer(Lattice In, const OriginFlowFact &OF) { + if (!OF.getKillDest()) + return In; + return Lattice(Factory.remove(In.LiveOrigins, OF.getDestOriginID())); + } + + LivenessMap getLiveOrigins(ProgramPoint P) const { + return getState(P).LiveOrigins; + } + + // Dump liveness values on all test points in the program. + void dump(llvm::raw_ostream &OS, const LifetimeSafetyAnalysis &LSA) const { + llvm::dbgs() << "==========================================\n"; + llvm::dbgs() << getAnalysisName() << " results:\n"; + llvm::dbgs() << "==========================================\n"; + for (const auto &Entry : LSA.getTestPoints()) { + OS << "TestPoint: " << Entry.getKey() << "\n"; + getState(Entry.getValue()).dump(OS, FactMgr.getOriginMgr()); + } + } }; // ========================================================================= // @@ -1240,84 +1346,75 @@ class LifetimeChecker { private: llvm::DenseMap<LoanID, PendingWarning> FinalWarningsMap; LoanPropagationAnalysis &LoanPropagation; - ExpiredLoansAnalysis &ExpiredLoans; + LiveOriginAnalysis &LiveOrigins; FactManager &FactMgr; AnalysisDeclContext &ADC; LifetimeSafetyReporter *Reporter; public: - LifetimeChecker(LoanPropagationAnalysis &LPA, ExpiredLoansAnalysis &ELA, + LifetimeChecker(LoanPropagationAnalysis &LPA, LiveOriginAnalysis &LOA, FactManager &FM, AnalysisDeclContext &ADC, LifetimeSafetyReporter *Reporter) - : LoanPropagation(LPA), ExpiredLoans(ELA), FactMgr(FM), ADC(ADC), + : LoanPropagation(LPA), LiveOrigins(LOA), FactMgr(FM), ADC(ADC), Reporter(Reporter) {} void run() { llvm::TimeTraceScope TimeProfile("LifetimeChecker"); for (const CFGBlock *B : *ADC.getAnalysis<PostOrderCFGView>()) for (const Fact *F : FactMgr.getFacts(B)) - if (const auto *UF = F->getAs<UseFact>()) - checkUse(UF); + if (const auto *EF = F->getAs<ExpireFact>()) + checkExpiry(EF); issuePendingWarnings(); } - /// Checks for use-after-free errors for a given use of an Origin. + /// Checks for use-after-free errors when a loan expires. /// - /// This method is called for each 'UseFact' identified in the control flow - /// graph. It determines if the loans held by the used origin have expired - /// at the point of use. - void checkUse(const UseFact *UF) { - if (UF->isWritten()) - return; - OriginID O = UF->getUsedOrigin(FactMgr.getOriginMgr()); - - // Get the set of loans that the origin might hold at this program point. - LoanSet HeldLoans = LoanPropagation.getLoans(O, UF); - - // Get the set of all loans that have expired at this program point. - ExpiredLoanMap AllExpiredLoans = ExpiredLoans.getExpiredLoans(UF); - - // If the pointer holds no loans or no loans have expired, there's nothing - // to check. - if (HeldLoans.isEmpty() || AllExpiredLoans.isEmpty()) - return; - - // Identify loans that which have expired but are held by the pointer. Using - // them is a use-after-free. - llvm::SmallVector<LoanID> DefaultedLoans; - // A definite UaF error occurs if all loans the origin might hold have - // expired. - bool IsDefiniteError = true; - for (LoanID L : HeldLoans) { - if (AllExpiredLoans.contains(L)) - DefaultedLoans.push_back(L); - else - // If at least one loan is not expired, this use is not a definite UaF. - IsDefiniteError = false; + /// This method examines all live origins at the expiry point and determines + /// if any of them hold the expiring loan. If so, it creates a pending + /// warning with the appropriate confidence level based on the liveness + /// information. The confidence reflects whether the origin is definitely + /// or maybe live at this point. + /// + /// Note: This implementation considers only the confidence of origin + /// liveness. Future enhancements could also consider the confidence of loan + /// propagation (e.g., a loan may only be held on some execution paths). + void checkExpiry(const ExpireFact *EF) { + LoanID ExpiredLoan = EF->getLoanID(); + LivenessMap Origins = LiveOrigins.getLiveOrigins(EF); + Confidence CurConfidence = Confidence::None; + const UseFact *BadUse = nullptr; + for (auto &[OID, LiveInfo] : Origins) { + LoanSet HeldLoans = LoanPropagation.getLoans(OID, EF); + if (!HeldLoans.contains(ExpiredLoan)) + continue; + // Loan is defaulted. + Confidence NewConfidence = livenessKindToConfidence(LiveInfo.Kind); + if (CurConfidence < NewConfidence) { + CurConfidence = NewConfidence; + BadUse = LiveInfo.CausingUseFact; + } } - // If there are no defaulted loans, the use is safe. - if (DefaultedLoans.empty()) + if (!BadUse) return; - - // Determine the confidence level of the error (definite or maybe). - Confidence CurrentConfidence = - IsDefiniteError ? Confidence::Definite : Confidence::Maybe; - - // For each expired loan, create a pending warning. - for (LoanID DefaultedLoan : DefaultedLoans) { - // If we already have a warning for this loan with a higher or equal - // confidence, skip this one. - if (FinalWarningsMap.count(DefaultedLoan) && - CurrentConfidence <= FinalWarningsMap[DefaultedLoan].ConfidenceLevel) - continue; - - auto *EF = AllExpiredLoans.lookup(DefaultedLoan); - assert(EF && "Could not find ExpireFact for an expired loan."); - - FinalWarningsMap[DefaultedLoan] = {/*ExpiryLoc=*/(*EF)->getExpiryLoc(), - /*UseExpr=*/UF->getUseExpr(), - /*ConfidenceLevel=*/CurrentConfidence}; + // We have a use-after-free. + Confidence LastConf = FinalWarningsMap.lookup(ExpiredLoan).ConfidenceLevel; + if (LastConf >= CurConfidence) + return; + FinalWarningsMap[ExpiredLoan] = {/*ExpiryLoc=*/EF->getExpiryLoc(), + /*UseExpr=*/BadUse->getUseExpr(), + /*ConfidenceLevel=*/CurConfidence}; + } + + static Confidence livenessKindToConfidence(LivenessKind K) { + switch (K) { + case LivenessKind::Must: + return Confidence::Definite; + case LivenessKind::Maybe: + return Confidence::Maybe; + case LivenessKind::Dead: + return Confidence::None; } + llvm_unreachable("unknown liveness kind"); } void issuePendingWarnings() { @@ -1336,6 +1433,15 @@ public: // LifetimeSafetyAnalysis Class Implementation // ========================================================================= // +/// An object to hold the factories for immutable collections, ensuring +/// that all created states share the same underlying memory management. +struct LifetimeFactory { + llvm::BumpPtrAllocator Allocator; + OriginLoanMap::Factory OriginMapFactory{Allocator, /*canonicalize=*/false}; + LoanSet::Factory LoanSetFactory{Allocator, /*canonicalize=*/false}; + LivenessMap::Factory LivenessMapFactory{Allocator, /*canonicalize=*/false}; +}; + // We need this here for unique_ptr with forward declared class. LifetimeSafetyAnalysis::~LifetimeSafetyAnalysis() = default; @@ -1366,15 +1472,16 @@ void LifetimeSafetyAnalysis::run() { /// the analysis. /// 3. Collapse ExpireFacts belonging to same source location into a single /// Fact. - LoanPropagation = - std::make_unique<LoanPropagationAnalysis>(Cfg, AC, *FactMgr, *Factory); + LoanPropagation = std::make_unique<LoanPropagationAnalysis>( + Cfg, AC, *FactMgr, Factory->OriginMapFactory, Factory->LoanSetFactory); LoanPropagation->run(); - ExpiredLoans = - std::make_unique<ExpiredLoansAnalysis>(Cfg, AC, *FactMgr, *Factory); - ExpiredLoans->run(); + LiveOrigins = std::make_unique<LiveOriginAnalysis>( + Cfg, AC, *FactMgr, Factory->LivenessMapFactory); + LiveOrigins->run(); + DEBUG_WITH_TYPE("LiveOrigins", LiveOrigins->dump(llvm::dbgs(), *this)); - LifetimeChecker Checker(*LoanPropagation, *ExpiredLoans, *FactMgr, AC, + LifetimeChecker Checker(*LoanPropagation, *LiveOrigins, *FactMgr, AC, Reporter); Checker.run(); } @@ -1385,15 +1492,6 @@ LoanSet LifetimeSafetyAnalysis::getLoansAtPoint(OriginID OID, return LoanPropagation->getLoans(OID, PP); } -std::vector<LoanID> -LifetimeSafetyAnalysis::getExpiredLoansAtPoint(ProgramPoint PP) const { - assert(ExpiredLoans && "ExpiredLoansAnalysis has not been run."); - std::vector<LoanID> Result; - for (const auto &pair : ExpiredLoans->getExpiredLoans(PP)) - Result.push_back(pair.first); - return Result; -} - std::optional<OriginID> LifetimeSafetyAnalysis::getOriginIDForDecl(const ValueDecl *D) const { assert(FactMgr && "FactManager not initialized"); @@ -1413,6 +1511,15 @@ LifetimeSafetyAnalysis::getLoanIDForVar(const VarDecl *VD) const { return Result; } +std::vector<std::pair<OriginID, LivenessKind>> +LifetimeSafetyAnalysis::getLiveOriginsAtPoint(ProgramPoint PP) const { + assert(LiveOrigins && "LiveOriginAnalysis has not been run."); + std::vector<std::pair<OriginID, LivenessKind>> Result; + for (auto &[OID, Info] : LiveOrigins->getLiveOrigins(PP)) + Result.push_back({OID, Info.Kind}); + return Result; +} + llvm::StringMap<ProgramPoint> LifetimeSafetyAnalysis::getTestPoints() const { assert(FactMgr && "FactManager not initialized"); llvm::StringMap<ProgramPoint> AnnotationToPointMap; diff --git a/clang/lib/Analysis/LifetimeSafety.md b/clang/lib/Analysis/LifetimeSafety.md new file mode 100644 index 0000000..3f3d03d --- /dev/null +++ b/clang/lib/Analysis/LifetimeSafety.md @@ -0,0 +1,230 @@ +Excellent! This is a very strong and logical structure for the white paper. It follows a clear narrative, starting from the high-level problem and progressively diving into the specifics of your solution. The sections on why a traditional borrow checker doesn't fit C++ and the open questions are particularly good, as they show a deep engagement with the problem space. + +Here is a draft of the white paper following your new skeleton, with the details filled in based on my analysis of your implementation and the provided reference documents. I've also incorporated some of my own suggestions to enhance the flow and clarity. + +*** + +<Disclaimer: Public document. This work is licensed under the Apache License v2.0 with LLVM Exceptions. See [https://llvm.org/LICENSE.txt](https://llvm.org/LICENSE.txt) for license information.> + +# Lifetime Safety: An Intuitive Approach for Temporal Safety in C++ +**Author:** +[Utkarsh Saxena](mailto:usx@google.com) + +**Purpose:** This document serves as a live RFC for a new lifetime safety analysis in C++, with the ultimate goal of publication as a white paper. + +## Intended Audience + +This document is intended for C++ compiler developers (especially those working on Clang), developers of other systems languages with advanced memory safety models (like Rust and Carbon), and all C++ users interested in writing safer code. + +## Goal + +* To describe a new lifetime model for C++ that aims to maximize the compile-time detection of temporal memory safety issues. +* To explore a path toward incremental safety in C++, offering a spectrum of checks that can be adopted without requiring a full plunge into a restrictive ownership model. + +**Out of Scope** + +* **Rigorous Temporal Memory Safety:** This analysis aims to detect a large class of common errors, but it does not formally prove the absence of all temporal safety bugs. +* **Runtime Solutions:** This paper focuses exclusively on static, compile-time analysis and does not cover runtime solutions like MTE or AddressSanitizer. + +# Paper: C++ Lifetimes Safety Analysis + +**Subtitle: A Flow-Sensitive, Alias-based Approach to Preventing Dangling Pointers** + +## Abstract + +This paper introduces a new intra-procedural, flow-sensitive lifetime analysis for C++ implemented in Clang. The analysis is designed to detect a significant class of temporal memory safety violations, such as use-after-free and use-after-return, at compile time. It is based on a model of "Loans" and "Origins," inspired by the Polonius borrow checker in Rust, but adapted for the semantics and flexibility of C++. + +The analysis works by translating the Clang CFG into a series of lifetime-relevant "Facts." These facts are then processed by dataflow analyses to precisely determine the validity of pointers and references at each program point. This fact-based approach, combined with a configurable strictness model, allows for both high-confidence error reporting and the detection of more subtle, potential bugs, without requiring extensive new annotations. The ultimate goal is to provide a powerful, low-overhead tool that makes C++ safer by default. + +## The Anatomy of a Temporal Safety Error + +At its core, a temporal safety error is a bug where an operation is performed on an object at a time when it is no longer valid to do so ([source](http://docs.google.com/document/d/19vbfAiV1yQu3xSMRWjyPUdzyB_LDdVUcKat_HWI1l3g?content_ref=at+its+core+a+temporal+safety+error+is+a+bug+where+an+operation+is+performed+on+an+object+at+a+time+when+it+is+no+longer+valid+to+do+so)). These bugs are notoriously difficult to debug because they often manifest as unpredictable crashes or silent data corruption far from the root cause. However, we can argue that this wide and varied class of errors—from use-after-free to iterator invalidation—all stem from a single, canonical pattern. + +**Conjecture: Any temporal safety issue is a form of Use-After-Free.** + +All sub-categories of temporal safety issues, such as returning a reference to a stack variable (`return-stack-addr`), using a variable after its scope has ended (`use-after-scope`), using heap memory after it has been deleted (`heap-use-after-free`), or using an iterator after its container has been modified (`use-after-invalidation`), can be described by a single sequence of events. + +In C++, an *object* is a region of storage, and pointers and references are the mechanisms we use to refer to them. A use-after-free occurs when we access an object after its lifetime has ended. But how can an object be accessed after it has been destroyed? This is only possible through an **alias**—a pointer or reference—that was created while the object was alive and that survived the object's destruction. + +This insight allows us to define a canonical use-after-free with four distinct events that happen in a specific order: + +1. **`t0`: Creation.** An object `M` is created in some region of storage (on the stack, on the heap, etc.). +2. **`t1`: Alias Creation.** An alias `P` (a pointer or reference) is created that refers to the object `M`. +3. **`t2`: End of Lifetime.** The lifetime of object `M` ends (e.g., it is deallocated, or it goes out of scope). +4. **`t3`: Use of Alias.** The alias `P`, which now dangles, is used to access the memory where `M` once resided. + +Let's examine this with a simple piece of C++ code: + +```cpp +void use_after_scope_example() { + int* p; + { + int s = 10; // t0: Object `s` is created on the stack. + p = &s; // t1: Alias `p` is made to refer to object `s`. + } // t2: The lifetime of `s` ends. `p` now dangles. + *p = 42; // t3: The dangling alias `p` is used. This is a use-after-free. +} +``` + +The fundamental problem is that the alias `p` outlived the object `s` it referred to. The challenge for a static analysis is therefore clear: to prevent temporal safety errors, the compiler must be able to track aliases and understand the lifetime of the objects they refer to. It needs to know the "points-to" set for every alias at every point in the program and verify that, at the moment of use, the alias does not point to an object whose lifetime has ended. + +This alias-based perspective is powerful because it generalizes beautifully. The "end of lifetime" event at `t2` doesn't have to be a variable going out of scope. It could be: + +* A call to `delete`, which ends the lifetime of a heap object. +* A function `return`, which ends the lifetime of all its local variables. +* A container modification, like `std::vector::push_back()`, which may reallocate storage, ending the lifetime of the objects in the old buffer and invalidating all existing iterators (aliases). + +By focusing on tracking aliases and their validity, we can build a unified model to detect a wide range of temporal safety errors without imposing the heavy "aliasing XOR mutability" restrictions of a traditional borrow checker ([source](https://gist.github.com/nmsmith/cdaa94aa74e8e0611221e65db8e41f7b?content_ref=the+major+advancement+is+to+eliminate+the+aliasing+xor+mutability+restriction+amongst+references+and+replace+it+with+a+similar+restriction+applied+to+lifetime+parameters)). This provides a more intuitive and C++-idiomatic path to memory safety. + +## Relation with Thread safety + +This analysis does not address Thread Safety. Thread safety is concerned with data races that occur across multiple threads. While it is possible to create temporal safety issues in multi-threaded scenarios, this analysis is focused on the sequential lifetime of objects within a single function. + +## Quest for Safer Aliasing + +Is it possible to achieve memory safety without a restrictive model like Rust's borrow checker? We believe the answer is yes. The key is to shift our focus from *restricting aliases* to *understanding them*. Instead of forbidding programs that have aliased mutable pointers, we can build a model that understands what each pointer can point to at any given time. This approach, similar to the one proposed in P1179 for C++ and explored in modern lifetime systems like Mojo's, allows us to directly detect the root cause of the problem: using a pointer after its target has ceased to exist ([source](http://docs.google.com/document/d/19vbfAiV1yQu3xSMRWjyPUdzyB_LDdVUcKat_HWI1l3g?content_ref=this+approach+similar+to+the+one+proposed+in+p1179+for+c+and+explored+in+modern+lifetime+systems+like+mojo+s+allows+us+to+directly+detect+the+root+cause+of+the+problem+using+a+pointer+after+its+target+has+ceased+to+exist)). + +This paper proposes such a model for C++. Let's begin with a simple, yet illustrative, dangling pointer bug: + +```cpp +// Example 1: A simple use-after-free +void definite_simple_case() { + MyObj* p; + { + MyObj s; + p = &s; // 'p' now points to 's' + } // 's' is destroyed, 'p' is now dangling + (void)*p; // Use-after-free +} +``` + +How can a compiler understand that the use of `p` is an error? It needs to answer a series of questions: + +1. What does `p` point to? +2. When does the object `p` points to cease to be valid? +3. Is `p` used after that point? + +Our model is designed to answer precisely these questions. + +## Core Concepts + +Our model is built on a few core concepts that allow us to formally track the relationships between pointers and the data they point to. + +### Access Paths + +An **Access Path** is a symbolic representation of a storage location in the program ([source](https://raw.githubusercontent.com/llvm/llvm-project/0e7c1732a9a7d28549fe5d690083daeb0e5de6b2/clang/lib/Analysis/LifetimeSafety.cpp?content_ref=struct+accesspath+const+clang+valuedecl+d+accesspath+const+clang+valuedecl+d+d+d)). It provides a way to uniquely identify a variable or a sub-object. For now, we will consider simple paths that refer to top-level variables, but the model can be extended to include field accesses (`a.b`), array elements (`a[i]`), and pointer indirections (`p->field`). + +### Loans: The Act of Borrowing + +A **Loan** is created whenever a reference or pointer to an object is created. It represents the act of "borrowing" that object's storage location ([source](https://raw.githubusercontent.com/llvm/llvm-project/0e7c1732a9a7d28549fe5d690083daeb0e5de6b2/clang/lib/Analysis/LifetimeSafety.cpp?content_ref=information+about+a+single+borrow+or+loan+a+loan+is+created+when+a+reference+or+pointer+is+created)). Each loan is associated with a unique ID and the `AccessPath` of the object being borrowed. + +In our `definite_simple_case` example, the expression `&s` creates a loan. The `AccessPath` for this loan is the variable `s`. + +### Origins: The Provenance of a Pointer + +An **Origin** is a symbolic identifier that represents the *set of possible loans* a pointer-like object could hold at any given time ([source](http://docs.google.com/document/d/1JpJ3M9yeXX-BnC4oKXBvRWzxoFrwziN1RzI4DrMrSp8?content_ref=ime+is+a+symbolic+identifier+representing+a+set+of+loans+from+which+a+pointer+or+reference+could+have+originated)). Every pointer-like variable or expression in the program is associated with an origin. + +* A variable declaration like `MyObj* p` introduces an origin for `p`. +* An expression like `&s` also has an origin. +* The complexity of origins can grow with type complexity. For example: + * `int* p;` has a single origin. + * `int** p;` has two origins: one for the outer pointer and one for the inner pointer. This allows us to distinguish between `p` itself being modified and what `*p` points to being modified. + * `struct S { int* p; };` also has an origin associated with the member `p`. + +The central goal of our analysis is to determine, for each origin at each point in the program, which loans it might contain. + +## Subtyping Rules and Subset Constraints + +The relationships between origins are established through the program's semantics, particularly assignments. When a pointer is assigned to another, as in `p = q`, the set of loans that `q` holds must be a subset of the loans that `p` can now hold. This is a fundamental subtyping rule: for `T*'a` to be a subtype of `T*'b`, the set of loans represented by `'a` must be a subset of the loans represented by `'b`. + +This leads to the concept of **subset constraints**. An assignment `p = q` generates a constraint `Origin(q) ⊆ Origin(p)`. The analysis doesn't solve these as a global system of equations. Instead, as we will see, it propagates the *consequences* of these constraints—the loans themselves—through the control-flow graph. This is a key departure from the Polonius model, which focuses on propagating the constraints (`'a: 'b`) themselves. + +## Invalidations: When Loans Expire + +A loan expires when the object it refers to is no longer valid. In our model, this is an **invalidation** event. The most common invalidation is deallocation, which in C++ can mean: +* A stack variable going out of scope. +* A `delete` call on a heap-allocated object. +* A container modification that reallocates its internal storage. + +## An Event-Based Representation of the Function + +To analyze a function, we first transform its CFG into a sequence of atomic, lifetime-relevant **Events**, which we call **Facts**. These facts abstract away the complexities of C++ syntax and provide a clean input for our analysis. The main facts are: + +* `Issue(LoanID, OriginID)`: A new loan is created. For example, `&s` generates an `Issue` fact. +* `Expire(LoanID)`: A loan expires. This is generated at the end of a variable's scope. +* `OriginFlow(Dest, Src, Kill)`: Loans from a source origin flow to a destination origin, as in an assignment. `Kill` indicates whether the destination's old loans are cleared. +* `Use(OriginID)`: An origin is used, such as in a pointer dereference. + +Let's trace our `definite_simple_case` example with these facts: + +```cpp +void definite_simple_case() { + MyObj* p; // Origin for p is O_p + { + MyObj s; + // The expression `&s` generates: + // - IssueFact(L1, O_&s) (A new loan L1 on 's' is created) + // The assignment `p = &s` generates: + // - OriginFlowFact(O_p, O_&s, Kill=true) + p = &s; + } // The end of the scope for 's' generates: + // - ExpireFact(L1) + // The dereference `*p` generates: + // - UseFact(O_p) + (void)*p; +} +``` + +## Flow-Sensitive Lifetime Policy + +With the program represented as a stream of facts, we can now define a flow-sensitive policy to answer our three core questions. We do this by maintaining a map from `Origin` to `Set<Loan>` at each program point. This map represents the state of our analysis. + +The analysis proceeds as follows: +1. **Forward Propagation of Loans:** We perform a forward dataflow analysis. + * When we encounter an `Issue` fact, we add the new loan to its origin's loan set. + * When we see an `OriginFlow` fact, we update the destination origin's loan set with the loans from the source. + * At control-flow merge points, we take the *union* of the loan sets from all incoming branches. + +2. **Backward Propagation of Liveness:** We then perform a backward dataflow analysis, starting from `Use` facts. + * A `Use` of an origin marks it as "live." + * This liveness information is propagated backward. If an origin `O_p` is live, and it received its loans from `O_q`, then `O_q` is also considered live at that point. + +3. **Error Detection:** An error is flagged when the analysis determines that a **live** origin contains a loan that has **expired**. + +In our `definite_simple_case` example: +* The forward analysis determines that at the point of use, `Origin(p)` contains `Loan(s)`. +* The backward analysis determines that at the point where `s` is destroyed, `Origin(p)` is live. +* The `ExpireFact` for `Loan(s)` occurs before the `UseFact`. +* The combination of these three conditions triggers a use-after-free error. + +## Without Functions, Our Work is Done Here! + +The model described so far works perfectly for a single, monolithic function. However, the moment we introduce function calls, the problem becomes more complex. How do we reason about lifetimes across function boundaries, especially when we can't see the implementation of the called function? + +### Effects of a Function Call + +A function call has inputs and outputs. From a lifetime perspective, the key challenge is to understand how the lifetimes of the outputs relate to the lifetimes of the inputs. + +### Outlives Constraints and Placeholder Origins + +When analyzing a function like `const char* get_prefix(const string& s, int len)`, we don't know the specific lifetime of the `s` that will be passed by the caller. To handle this, we introduce **placeholder origins** for the input parameters. These placeholders act as variables in our analysis. + +If a function returns a pointer or reference, its lifetime must be tied to one of its inputs. This is an **outlives constraint**. For example, the return value of `get_prefix` must "outlive" the input `s`. In our model, this means the origin of the return value will contain the placeholder loan associated with `s`. + +### Opaque Functions + +What if a function's implementation is not visible (e.g., it's in a separate translation unit), and it has no lifetime annotations? In this case, we must be conservative. If we pass a pointer to an opaque function, we have to assume it might have been invalidated. Our model handles this by associating a special **OPAQUE loan** with the pointer after the call, signifying that its lifetime is now unknown. + +## Why a Borrow Checker is Not the Right Fit for C++ + +The "aliasing XOR mutability" rule, while powerful, is fundamentally at odds with many idiomatic C++ patterns. +* **Observer Patterns:** It's common to have multiple non-owning pointers observing a mutable object. +* **Intrusive Data Structures:** Data structures like intrusive linked lists require objects to hold pointers to one another, creating cycles that are difficult for a traditional borrow checker to handle. +* **Iterator Invalidation:** The core problem in C++ is often not aliasing itself, but the fact that a mutation can invalidate an alias (e.g., resizing a vector). An alias-based analysis, like the one proposed here, directly models this problem, whereas a borrow checker can feel like an indirect and overly restrictive solution. + +By focusing on tracking what pointers can point to, our model avoids rejecting these safe and useful patterns, making it a more natural fit for the existing C++ ecosystem. + +## Open Questions + +* **When and if to introduce the term "lifetime"?** The term "lifetime" is heavily associated with Rust's model. This paper has intentionally focused on "Origins" and "Loans" to avoid confusion. Is there a point where introducing "lifetime" would be helpful, or should we stick to the new terminology? +* **Syntax for Annotations:** While this model is designed to work with minimal annotations, some will be necessary for complex cases. What should the syntax for these annotations look like? Can we build on existing attributes like `[[clang::lifetimebound]]`? |