[analyzer] Support interestingness in ArrayBoundV2 (#78315)

This commit improves alpha.security.ArrayBoundV2 in two connected areas: (1) It calls `markInteresting()` on the symbolic values that are responsible for the out of bounds access. (2) Its index-is-in-bounds assumptions are reported in note tags if they provide information about the value of an interesting symbol. This commit is limited to "display" changes: it introduces new diagnostic pieces (potentially to bugs found by other checkers), but ArrayBoundV2 will make the same assumptions and detect the same bugs before and after this change. As a minor unrelated change, this commit also updates/removes some very old comments which became obsolete due to my previous changes.
author: NagyDonat <donat.nagy@ericsson.com> 2024-02-05 17:07:14 +0100
committer: GitHub <noreply@github.com> 2024-02-05 17:07:14 +0100
commit: fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0 (patch)
tree: 0ddddd40574332257980705b3204e448e6c46551
parent: f2c84211d2834c73ff874389c6bb47b1c76d391a (diff)
download: llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.zip
llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.tar.gz
llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.tar.bz2
3 files changed, 607 insertions, 125 deletions
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 6c7a160..05fc00a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -33,7 +33,82 @@ using namespace taint;
 using llvm::formatv;
 
 namespace {
-enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
+/// If `E` is a "clean" array subscript expression, return the type of the
+/// accessed element. If the base of the subscript expression is modified by
+/// pointer arithmetic (and not the beginning of a "full" memory region), this
+/// always returns nullopt because that's the right (or the least bad) thing to
+/// do for the diagnostic output that's relying on this.
+static std::optional<QualType> determineElementType(const Expr *E,
+                                                    const CheckerContext &C) {
+  const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
+  if (!ASE)
+    return std::nullopt;
+
+  const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
+  if (!SubscriptBaseReg)
+    return std::nullopt;
+
+  // The base of the subscript expression is affected by pointer arithmetics,
+  // so we want to report byte offsets instead of indices.
+  if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
+    return std::nullopt;
+
+  return ASE->getType();
+}
+
+static std::optional<int64_t>
+determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
+  if (!T)
+    return std::nullopt;
+  return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
+}
+
+class StateUpdateReporter {
+  const SubRegion *Reg;
+  const NonLoc ByteOffsetVal;
+  const std::optional<QualType> ElementType;
+  const std::optional<int64_t> ElementSize;
+  bool AssumedNonNegative = false;
+  std::optional<NonLoc> AssumedUpperBound = std::nullopt;
+
+public:
+  StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
+                      CheckerContext &C)
+      : Reg(R), ByteOffsetVal(ByteOffsVal),
+        ElementType(determineElementType(E, C)),
+        ElementSize(determineElementSize(ElementType, C)) {}
+
+  void recordNonNegativeAssumption() { AssumedNonNegative = true; }
+  void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
+    AssumedUpperBound = UpperBoundVal;
+  }
+
+  const NoteTag *createNoteTag(CheckerContext &C) const;
+
+private:
+  std::string getMessage(PathSensitiveBugReport &BR) const;
+
+  /// Return true if information about the value of `Sym` can put constraints
+  /// on some symbol which is interesting within the bug report `BR`.
+  /// In particular, this returns true when `Sym` is interesting within `BR`;
+  /// but it also returns true if `Sym` is an expression that contains integer
+  /// constants and a single symbolic operand which is interesting (in `BR`).
+  /// We need to use this instead of plain `BR.isInteresting()` because if we
+  /// are analyzing code like
+  ///   int array[10];
+  ///   int f(int arg) {
+  ///     return array[arg] && array[arg + 10];
+  ///   }
+  /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
+  /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
+  /// to detect this out of bounds access).
+  static bool providesInformationAboutInteresting(SymbolRef Sym,
+                                                  PathSensitiveBugReport &BR);
+  static bool providesInformationAboutInteresting(SVal SV,
+                                                  PathSensitiveBugReport &BR) {
+    return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
+  }
+};
 
 struct Messages {
   std::string Short, Full;
@@ -54,11 +129,19 @@ class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
 
   void performCheck(const Expr *E, CheckerContext &C) const;
 
-  void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
-                 NonLoc Offset, Messages Msgs) const;
+  void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
+                 NonLoc Offset, std::optional<NonLoc> Extent,
+                 bool IsTaintBug = false) const;
+
+  static void markPartsInteresting(PathSensitiveBugReport &BR,
+                                   ProgramStateRef ErrorState, NonLoc Val,
+                                   bool MarkTaint);
 
   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
 
+  static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
+                                       NonLoc Offset, NonLoc Limit,
+                                       CheckerContext &C);
   static bool isInAddressOf(const Stmt *S, ASTContext &AC);
 
 public:
@@ -133,12 +216,26 @@ computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
   return std::nullopt;
 }
 
-// TODO: once the constraint manager is smart enough to handle non simplified
-// symbolic expressions remove this function. Note that this can not be used in
-// the constraint manager as is, since this does not handle overflows. It is
-// safe to assume, however, that memory offsets will not overflow.
-// NOTE: callers of this function need to be aware of the effects of overflows
-// and signed<->unsigned conversions!
+// NOTE: This function is the "heart" of this checker. It simplifies
+// inequalities with transformations that are valid (and very elementary) in
+// pure mathematics, but become invalid if we use them in C++ number model
+// where the calculations may overflow.
+// Due to the overflow issues I think it's impossible (or at least not
+// practical) to integrate this kind of simplification into the resolution of
+// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
+// produces valid results when the calculations are handling memory offsets
+// and every value is well below SIZE_MAX.
+// TODO: This algorithm should be moved to a central location where it's
+// available for other checkers that need to compare memory offsets.
+// NOTE: the simplification preserves the order of the two operands in a
+// mathematical sense, but it may change the result produced by a C++
+// comparison operator (and the automatic type conversions).
+// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
+// size_t and the RHS is stored in an int. (As size_t is unsigned, this
+// comparison is false for all values of "X".) However, the simplification may
+// turn it into "X < -1", which is still always false in a mathematical sense,
+// but can produce a true result when evaluated by `evalBinOp` (which follows
+// the rules of C++ and casts -1 to SIZE_MAX).
 static std::pair<NonLoc, nonloc::ConcreteInt>
 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
                      SValBuilder &svalBuilder) {
@@ -239,13 +336,8 @@ static std::optional<int64_t> getConcreteValue(NonLoc SV) {
   return std::nullopt;
 }
 
-static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
-  static const char *ShortMsgTemplates[] = {
-      "Out of bound access to memory preceding {0}",
-      "Out of bound access to memory after the end of {0}",
-      "Potential out of bound access to {0} with tainted offset"};
-
-  return formatv(ShortMsgTemplates[Kind], RegName);
+static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
+  return SV ? getConcreteValue(*SV) : std::nullopt;
 }
 
 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
@@ -255,7 +347,28 @@ static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
   Out << "Access of " << RegName << " at negative byte offset";
   if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
     Out << ' ' << ConcreteIdx->getValue();
-  return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
+  return {formatv("Out of bound access to memory preceding {0}", RegName),
+          std::string(Buf)};
+}
+
+/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
+/// it can be performed (`Divisor` is nonzero and there is no remainder). The
+/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
+/// division is considered to be successful.
+static bool tryDividePair(std::optional<int64_t> &Val1,
+                          std::optional<int64_t> &Val2, int64_t Divisor) {
+  if (!Divisor)
+    return false;
+  const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
+  const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
+  if (!Val1HasRemainder && !Val2HasRemainder) {
+    if (Val1)
+      *Val1 /= Divisor;
+    if (Val2)
+      *Val2 /= Divisor;
+    return true;
+  }
+  return false;
 }
 
 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
@@ -268,18 +381,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
   std::optional<int64_t> OffsetN = getConcreteValue(Offset);
   std::optional<int64_t> ExtentN = getConcreteValue(Extent);
 
-  bool UseByteOffsets = true;
-  if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) {
-    const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize;
-    const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize;
-    if (!OffsetHasRemainder && !ExtentHasRemainder) {
-      UseByteOffsets = false;
-      if (OffsetN)
-        *OffsetN /= ElemSize;
-      if (ExtentN)
-        *ExtentN /= ElemSize;
-    }
-  }
+  int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
+
+  bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
 
   SmallString<256> Buf;
   llvm::raw_svector_ostream Out(Buf);
@@ -307,7 +411,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
       Out << "s";
   }
 
-  return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
+  return {
+      formatv("Out of bound access to memory after the end of {0}", RegName),
+      std::string(Buf)};
 }
 
 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
@@ -318,17 +424,91 @@ static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
                   RegName, OffsetName)};
 }
 
-void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
-  // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
-  // some new logic here that reasons directly about memory region extents.
-  // Once that logic is more mature, we can bring it back to assumeInBound()
-  // for all clients to use.
-  //
-  // The algorithm we are using here for bounds checking is to see if the
-  // memory access is within the extent of the base region.  Since we
-  // have some flexibility in defining the base region, we can achieve
-  // various levels of conservatism in our buffer overflow checking.
+const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
+  // Don't create a note tag if we didn't assume anything:
+  if (!AssumedNonNegative && !AssumedUpperBound)
+    return nullptr;
+
+  return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
+    return getMessage(BR);
+  });
+}
+
+std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
+  bool ShouldReportNonNegative = AssumedNonNegative;
+  if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
+    if (AssumedUpperBound &&
+        providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
+      // Even if the byte offset isn't interesting (e.g. it's a constant value),
+      // the assumption can still be interesting if it provides information
+      // about an interesting symbolic upper bound.
+      ShouldReportNonNegative = false;
+    } else {
+      // We don't have anything interesting, don't report the assumption.
+      return "";
+    }
+  }
+
+  std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
+  std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
+
+  const bool UseIndex =
+      ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
+
+  SmallString<256> Buf;
+  llvm::raw_svector_ostream Out(Buf);
+  Out << "Assuming ";
+  if (UseIndex) {
+    Out << "index ";
+    if (OffsetN)
+      Out << "'" << OffsetN << "' ";
+  } else if (AssumedUpperBound) {
+    Out << "byte offset ";
+    if (OffsetN)
+      Out << "'" << OffsetN << "' ";
+  } else {
+    Out << "offset ";
+  }
+
+  Out << "is";
+  if (ShouldReportNonNegative) {
+    Out << " non-negative";
+  }
+  if (AssumedUpperBound) {
+    if (ShouldReportNonNegative)
+      Out << " and";
+    Out << " less than ";
+    if (ExtentN)
+      Out << *ExtentN << ", ";
+    if (UseIndex && ElementType)
+      Out << "the number of '" << ElementType->getAsString()
+          << "' elements in ";
+    else
+      Out << "the extent of ";
+    Out << getRegionName(Reg);
+  }
+  return std::string(Out.str());
+}
+
+bool StateUpdateReporter::providesInformationAboutInteresting(
+    SymbolRef Sym, PathSensitiveBugReport &BR) {
+  if (!Sym)
+    return false;
+  for (SymbolRef PartSym : Sym->symbols()) {
+    // The interestingess mark may appear on any layer as we're stripping off
+    // the SymIntExpr, UnarySymExpr etc. layers...
+    if (BR.isInteresting(PartSym))
+      return true;
+    // ...but if both sides of the expression are symbolic, then there is no
+    // practical algorithm to produce separate constraints for the two
+    // operands (from the single combined result).
+    if (isa<SymSymExpr>(PartSym))
+      return false;
+  }
+  return false;
+}
 
+void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   const SVal Location = C.getSVal(E);
 
   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
@@ -350,6 +530,10 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
 
   auto [Reg, ByteOffset] = *RawOffset;
 
+  // The state updates will be reported as a single note tag, which will be
+  // composed by this helper class.
+  StateUpdateReporter SUR(Reg, ByteOffset, E, C);
+
   // CHECK LOWER BOUND
   const MemSpaceRegion *Space = Reg->getMemorySpace();
   if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
@@ -363,13 +547,22 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
     auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
 
-    if (PrecedesLowerBound && !WithinLowerBound) {
-      // We know that the index definitely precedes the lower bound.
-      Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
-      reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
-      return;
+    if (PrecedesLowerBound) {
+      // The offset may be invalid (negative)...
+      if (!WithinLowerBound) {
+        // ...and it cannot be valid (>= 0), so report an error.
+        Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
+        reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
+        return;
+      }
+      // ...but it can be valid as well, so the checker will (optimistically)
+      // assume that it's valid and mention this in the note tag.
+      SUR.recordNonNegativeAssumption();
     }
 
+    // Actually update the state. The "if" only fails in the extremely unlikely
+    // case when compareValueToThreshold returns {nullptr, nullptr} becasue
+    // evalBinOpNN fails to evaluate the less-than operator.
     if (WithinLowerBound)
       State = WithinLowerBound;
   }
@@ -381,32 +574,28 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
 
     if (ExceedsUpperBound) {
+      // The offset may be invalid (>= Size)...
       if (!WithinUpperBound) {
-        // We know that the index definitely exceeds the upper bound.
-        if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
-          // ...but this is within an addressof expression, so we need to check
-          // for the exceptional case that `&array[size]` is valid.
-          auto [EqualsToThreshold, NotEqualToThreshold] =
-              compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize,
-                                      SVB, /*CheckEquality=*/true);
-          if (EqualsToThreshold && !NotEqualToThreshold) {
-            // We are definitely in the exceptional case, so return early
-            // instead of reporting a bug.
-            C.addTransition(EqualsToThreshold);
-            return;
-          }
+        // ...and it cannot be within bounds, so report an error, unless we can
+        // definitely determine that this is an idiomatic `&array[size]`
+        // expression that calculates the past-the-end pointer.
+        if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
+                                     *KnownSize, C)) {
+          C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
+          return;
         }
+
         Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
                                        *KnownSize, Location);
-        reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
+        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
         return;
       }
+      // ...and it can be valid as well...
       if (isTainted(State, ByteOffset)) {
-        // Both cases are possible, but the offset is tainted, so report.
-        std::string RegName = getRegionName(Reg);
+        // ...but it's tainted, so report an error.
 
-        // Diagnostic detail: "tainted offset" is always correct, but the
-        // common case is that 'idx' is tainted in 'arr[idx]' and then it's
+        // Diagnostic detail: saying "tainted offset" is always correct, but
+        // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
         // nicer to say "tainted index".
         const char *OffsetName = "offset";
         if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
@@ -414,33 +603,77 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
             OffsetName = "index";
 
         Messages Msgs = getTaintMsgs(Reg, OffsetName);
-        reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
+        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
+                  /*IsTaintBug=*/true);
         return;
       }
+      // ...and it isn't tainted, so the checker will (optimistically) assume
+      // that the offset is in bounds and mention this in the note tag.
+      SUR.recordUpperBoundAssumption(*KnownSize);
     }
 
+    // Actually update the state. The "if" only fails in the extremely unlikely
+    // case when compareValueToThreshold returns {nullptr, nullptr} becasue
+    // evalBinOpNN fails to evaluate the less-than operator.
     if (WithinUpperBound)
       State = WithinUpperBound;
   }
 
-  C.addTransition(State);
+  // Add a transition, reporting the state updates that we accumulated.
+  C.addTransition(State, SUR.createNoteTag(C));
+}
+
+void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR,
+                                               ProgramStateRef ErrorState,
+                                               NonLoc Val, bool MarkTaint) {
+  if (SymbolRef Sym = Val.getAsSymbol()) {
+    // If the offset is a symbolic value, iterate over its "parts" with
+    // `SymExpr::symbols()` and mark each of them as interesting.
+    // For example, if the offset is `x*4 + y` then we put interestingness onto
+    // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
+    // `x` and `y`.
+    for (SymbolRef PartSym : Sym->symbols())
+      BR.markInteresting(PartSym);
+  }
+
+  if (MarkTaint) {
+    // If the issue that we're reporting depends on the taintedness of the
+    // offset, then put interestingness onto symbols that could be the origin
+    // of the taint. Note that this may find symbols that did not appear in
+    // `Sym->symbols()` (because they're only loosely connected to `Val`).
+    for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
+      BR.markInteresting(Sym);
+  }
 }
 
 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
-                                    ProgramStateRef ErrorState, OOB_Kind Kind,
-                                    NonLoc Offset, Messages Msgs) const {
+                                    ProgramStateRef ErrorState, Messages Msgs,
+                                    NonLoc Offset, std::optional<NonLoc> Extent,
+                                    bool IsTaintBug /*=false*/) const {
 
   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
   if (!ErrorNode)
     return;
 
   auto BR = std::make_unique<PathSensitiveBugReport>(
-      Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
-
-  // Track back the propagation of taintedness.
-  if (Kind == OOB_Taint)
-    for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset))
-      BR->markInteresting(Sym);
+      IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
+
+  // FIXME: ideally we would just call trackExpressionValue() and that would
+  // "do the right thing": mark the relevant symbols as interesting, track the
+  // control dependencies and statements storing the relevant values and add
+  // helpful diagnostic pieces. However, right now trackExpressionValue() is
+  // a heap of unreliable heuristics, so it would cause several issues:
+  // - Interestingness is not applied consistently, e.g. if `array[x+10]`
+  //   causes an overflow, then `x` is not marked as interesting.
+  // - We get irrelevant diagnostic pieces, e.g. in the code
+  //   `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
+  //   it places a "Storing uninitialized value" note on the `malloc` call
+  //   (which is technically true, but irrelevant).
+  // If trackExpressionValue() becomes reliable, it should be applied instead
+  // of this custom markPartsInteresting().
+  markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
+  if (Extent)
+    markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
 
   C.emitReport(std::move(BR));
 }
@@ -476,6 +709,18 @@ bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
   return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
 }
 
+bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
+                                                   ProgramStateRef State,
+                                                   NonLoc Offset, NonLoc Limit,
+                                                   CheckerContext &C) {
+  if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
+    auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
+        State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
+    return EqualsToThreshold && !NotEqualToThreshold;
+  }
+  return false;
+}
+
 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
   mgr.registerChecker<ArrayBoundCheckerV2>();
 }
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 769a895..0c3c67c 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -1,20 +1,20 @@
 // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text        \
 // RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s
 
-int array[10];
+int TenElements[10];
 
 void arrayUnderflow(void) {
-  array[-3] = 5;
-  // expected-warning@-1 {{Out of bound access to memory preceding 'array'}}
-  // expected-note@-2 {{Access of 'array' at negative byte offset -12}}
+  TenElements[-3] = 5;
+  // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at negative byte offset -12}}
 }
 
 int underflowWithDeref(void) {
-  int *p = array;
+  int *p = TenElements;
   --p;
   return *p;
-  // expected-warning@-1 {{Out of bound access to memory preceding 'array'}}
-  // expected-note@-2 {{Access of 'array' at negative byte offset -4}}
+  // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at negative byte offset -4}}
 }
 
 int scanf(const char *restrict fmt, ...);
@@ -24,30 +24,31 @@ void taintedIndex(void) {
   scanf("%d", &index);
   // expected-note@-1 {{Taint originated here}}
   // expected-note@-2 {{Taint propagated to the 2nd argument}}
-  array[index] = 5;
-  // expected-warning@-1 {{Potential out of bound access to 'array' with tainted index}}
-  // expected-note@-2 {{Access of 'array' with a tainted index that may be too large}}
+  TenElements[index] = 5;
+  // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be too large}}
 }
 
 int *taintedIndexAfterTheEndPtr(void) {
   // NOTE: Technically speaking, this testcase does not trigger any UB because
-  // &array[10] is the after-the-end pointer which is well-defined; but this is
-  // a bug-prone situation and far from the idiomatic use of `&array[size]`, so
-  // it's better to report an error. This report can be easily silenced by
-  // writing array+index instead of &array[index].
+  // &TenElements[10] is the after-the-end pointer which is well-defined; but
+  // this is a bug-prone situation and far from the idiomatic use of
+  // `&TenElements[size]`, so it's better to report an error. This report can
+  // be easily silenced by writing TenElements+index instead of
+  // &TenElements[index].
   int index;
   scanf("%d", &index);
   // expected-note@-1 {{Taint originated here}}
   // expected-note@-2 {{Taint propagated to the 2nd argument}}
   if (index < 0 || index > 10)
-    return array;
+    return TenElements;
   // expected-note@-2 {{Assuming 'index' is >= 0}}
   // expected-note@-3 {{Left side of '||' is false}}
   // expected-note@-4 {{Assuming 'index' is <= 10}}
   // expected-note@-5 {{Taking false branch}}
-  return &array[index];
-  // expected-warning@-1 {{Potential out of bound access to 'array' with tainted index}}
-  // expected-note@-2 {{Access of 'array' with a tainted index that may be too large}}
+  return &TenElements[index];
+  // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be too large}}
 }
 
 void taintedOffset(void) {
@@ -55,57 +56,57 @@ void taintedOffset(void) {
   scanf("%d", &index);
   // expected-note@-1 {{Taint originated here}}
   // expected-note@-2 {{Taint propagated to the 2nd argument}}
-  int *p = array + index;
+  int *p = TenElements + index;
   p[0] = 5;
-  // expected-warning@-1 {{Potential out of bound access to 'array' with tainted offset}}
-  // expected-note@-2 {{Access of 'array' with a tainted offset that may be too large}}
+  // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted offset}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted offset that may be too large}}
 }
 
 void arrayOverflow(void) {
-  array[12] = 5;
-  // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note@-2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
+  TenElements[12] = 5;
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at index 12, while it holds only 10 'int' elements}}
 }
 
 void flippedOverflow(void) {
-  12[array] = 5;
-  // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note@-2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
+  12[TenElements] = 5;
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at index 12, while it holds only 10 'int' elements}}
 }
 
 int *afterTheEndPtr(void) {
-  // This is an unusual but standard-compliant way of writing (array + 10).
-  return &array[10]; // no-warning
+  // This is an unusual but standard-compliant way of writing (TenElements + 10).
+  return &TenElements[10]; // no-warning
 }
 
 int useAfterTheEndPtr(void) {
   // ... but dereferencing the after-the-end pointer is still invalid.
   return *afterTheEndPtr();
-  // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note@-2 {{Access of 'array' at index 10, while it holds only 10 'int' elements}}
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at index 10, while it holds only 10 'int' elements}}
 }
 
 int *afterAfterTheEndPtr(void) {
   // This is UB, it's invalid to form an after-after-the-end pointer.
-  return &array[11];
-  // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note@-2 {{Access of 'array' at index 11, while it holds only 10 'int' elements}}
+  return &TenElements[11];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at index 11, while it holds only 10 'int' elements}}
 }
 
 int *potentialAfterTheEndPtr(int idx) {
   if (idx < 10) { /* ...do something... */ }
   // expected-note@-1 {{Assuming 'idx' is >= 10}}
   // expected-note@-2 {{Taking false branch}}
-  return &array[idx];
-  // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note@-2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return &TenElements[idx];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
   // NOTE: On the idx >= 10 branch the normal "optimistic" behavior would've
   // been continuing with the assumption that idx == 10 and the return value is
   // a legitimate after-the-end pointer. The checker deviates from this by
   // reporting an error because this situation is very suspicious and far from
-  // the idiomatic `&array[size]` expressions. If the report is FP, the
-  // developer can easily silence it by writing array+idx instead of
-  // &array[idx].
+  // the idiomatic `&TenElements[size]` expressions. If the report is FP, the
+  // developer can easily silence it by writing TenElements+idx instead of
+  // &TenElements[idx].
 }
 
 int scalar;
@@ -156,9 +157,9 @@ int arrayOfStructsArrow(void) {
 }
 
 short convertedArray(void) {
-  return ((short*)array)[47];
-  // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note@-2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}}
+  return ((short*)TenElements)[47];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at index 47, while it holds only 20 'short' elements}}
 }
 
 struct two_bytes {
@@ -195,12 +196,22 @@ void *malloc(size_t size);
 
 int *mallocRegion(void) {
   int *mem = (int*)malloc(2*sizeof(int));
+
   mem[3] = -2;
   // expected-warning@-1 {{Out of bound access to memory after the end of the heap area}}
   // expected-note@-2 {{Access of the heap area at index 3, while it holds only 2 'int' elements}}
   return mem;
 }
 
+int *mallocRegionDeref(void) {
+  int *mem = (int*)malloc(2*sizeof(int));
+
+  *(mem + 3) = -2;
+  // expected-warning@-1 {{Out of bound access to memory after the end of the heap area}}
+  // expected-note@-2 {{Access of the heap area at index 3, while it holds only 2 'int' elements}}
+  return mem;
+}
+
 void *alloca(size_t size);
 
 int allocaRegion(void) {
@@ -211,34 +222,61 @@ int allocaRegion(void) {
   return *mem;
 }
 
-int *unknownExtent(int arg) {
-  if (arg >= 2)
+int *symbolicExtent(int arg) {
+  // expected-note@+2 {{Assuming 'arg' is < 5}}
+  // expected-note@+1 {{Taking false branch}}
+  if (arg >= 5)
     return 0;
   int *mem = (int*)malloc(arg);
+
+  // TODO: without the following reference to 'arg', the analyzer would discard
+  // the range information about (the symbolic value of) 'arg'. This is
+  // incorrect because while the variable itself is inaccessible, it becomes
+  // the symbolic extent of 'mem', so we still want to reason about its
+  // potential values.
+  (void)arg;
+
   mem[8] = -2;
-  // FIXME: this should produce
-  //   {{Out of bound access to memory after the end of the heap area}}
-  //   {{Access of 'int' element in the heap area at index 8}}
+  // expected-warning@-1 {{Out of bound access to memory after the end of the heap area}}
+  // expected-note@-2 {{Access of 'int' element in the heap area at index 8}}
   return mem;
 }
 
-void unknownIndex(int arg) {
+int *symbolicExtentDiscardedRangeInfo(int arg) {
+  // This is a copy of the case 'symbolicExtent' without the '(void)arg' hack.
+  // TODO: if the analyzer can detect the out-of-bounds access within this
+  // testcase, then remove this and the `(void)arg` hack from `symbolicExtent`.
+  if (arg >= 5)
+    return 0;
+  int *mem = (int*)malloc(arg);
+  mem[8] = -2;
+  return mem;
+}
+
+void symbolicIndex(int arg) {
   // expected-note@+2 {{Assuming 'arg' is >= 12}}
   // expected-note@+1 {{Taking true branch}}
   if (arg >= 12)
-    array[arg] = -2;
-  // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note@-2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+    TenElements[arg] = -2;
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
 }
 
 int *nothingIsCertain(int x, int y) {
   if (x >= 2)
     return 0;
   int *mem = (int*)malloc(x);
+
   if (y >= 8)
     mem[y] = -2;
   // FIXME: this should produce
   //   {{Out of bound access to memory after the end of the heap area}}
   //   {{Access of 'int' element in the heap area at an overflowing index}}
+  // but apparently the analyzer isn't smart enough to deduce this.
+
+  // Keep constraints alive. (Without this, the overeager garbage collection of
+  // constraints would _also_ prevent the intended behavior in this testcase.)
+  (void)x;
+
   return mem;
 }
diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
new file mode 100644
index 0000000..c29b6f8
--- /dev/null
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -0,0 +1,199 @@
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text        \
+// RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s
+
+int TenElements[10];
+
+int irrelevantAssumptions(int arg) {
+  int a = TenElements[arg];
+  // Here the analyzer assumes that `arg` is in bounds, but doesn't report this
+  // because `arg` is not interesting for the bug.
+  int b = TenElements[13];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at index 13, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+
+int assumingBoth(int arg) {
+  int a = TenElements[arg];
+  // expected-note@-1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'TenElements'}}
+  int b = TenElements[arg]; // no additional note, we already assumed that 'arg' is in bounds
+  int c = TenElements[arg + 10];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b + c;
+}
+
+int assumingBothPointerToMiddle(int arg) {
+  // If we're accessing an TenElements through a pointer pointing to its middle, the checker
+  // will speak about the "byte offset" measured from the beginning of the TenElements.
+  int *p = TenElements + 2;
+  int a = p[arg];
+  // FIXME: The following note does not appear:
+  //  {{Assuming byte offset is non-negative and less than 40, the extent of 'TenElements'}}
+  // It seems that the analyzer "gives up" modeling this pointer arithmetics
+  // and says that `p[arg]` is just an UnknownVal (instead of calculating that
+  // it's equivalent to `TenElements[2+arg]`).
+
+  int b = TenElements[arg]; // This is normal access, and only the lower bound is new.
+  // expected-note@-1 {{Assuming index is non-negative}}
+  int c = TenElements[arg + 10];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b + c;
+}
+
+int assumingLower(int arg) {
+  // expected-note@+2 {{Assuming 'arg' is < 10}}
+  // expected-note@+1 {{Taking false branch}}
+  if (arg >= 10)
+    return 0;
+  int a = TenElements[arg];
+  // expected-note@-1 {{Assuming index is non-negative}}
+  int b = TenElements[arg + 10];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+int assumingUpper(int arg) {
+  // expected-note@+2 {{Assuming 'arg' is >= 0}}
+  // expected-note@+1 {{Taking false branch}}
+  if (arg < 0)
+    return 0;
+  int a = TenElements[arg];
+  // expected-note@-1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}}
+  int b = TenElements[arg - 10];
+  // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at negative byte offset}}
+  return a + b;
+}
+
+int assumingUpperIrrelevant(int arg) {
+  // FIXME: The assumption "assuming index is less than 10" is printed because
+  // it's assuming something about the interesting variable `arg`; however,
+  // it's irrelevant because in this testcase the out of bound access is
+  // deduced from the _lower_ bound on `arg`. Currently the analyzer cannot
+  // filter out assumptions that are logically irrelevant but "touch"
+  // interesting symbols; eventually it would be good to add support for this.
+
+  // expected-note@+2 {{Assuming 'arg' is >= 0}}
+  // expected-note@+1 {{Taking false branch}}
+  if (arg < 0)
+    return 0;
+  int a = TenElements[arg];
+  // expected-note@-1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}}
+  int b = TenElements[arg + 10];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+int assumingUpperUnsigned(unsigned arg) {
+  int a = TenElements[arg];
+  // expected-note@-1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}}
+  int b = TenElements[(int)arg - 10];
+  // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at negative byte offset}}
+  return a + b;
+}
+
+int assumingNothing(unsigned arg) {
+  // expected-note@+2 {{Assuming 'arg' is < 10}}
+  // expected-note@+1 {{Taking false branch}}
+  if (arg >= 10)
+    return 0;
+  int a = TenElements[arg]; // no note here, we already know that 'arg' is in bounds
+  int b = TenElements[(int)arg - 10];
+  // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at negative byte offset}}
+  return a + b;
+}
+
+short assumingConvertedToCharP(int arg) {
+  // When indices are reported, the note will use the element type that's the
+  // result type of the subscript operator.
+  char *cp = (char*)TenElements;
+  char a = cp[arg];
+  // expected-note@-1 {{Assuming index is non-negative and less than 40, the number of 'char' elements in 'TenElements'}}
+  char b = cp[arg]; // no additional note, we already assumed that 'arg' is in bounds
+  char c = cp[arg + 40];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 40 'char' elements}}
+  return a + b + c;
+}
+
+struct foo {
+  int num;
+  char a[8];
+  char b[5];
+};
+
+int assumingConvertedToIntP(struct foo f, int arg) {
+  // When indices are reported, the note will use the element type that's the
+  // result type of the subscript operator.
+  int a = ((int*)(f.a))[arg];
+  // expected-note@-1 {{Assuming index is non-negative and less than 2, the number of 'int' elements in 'f.a'}}
+  // However, if the extent of the memory region is not divisible by the
+  // element size, the checker measures the offset and extent in bytes.
+  int b = ((int*)(f.b))[arg];
+  // expected-note@-1 {{Assuming byte offset is less than 5, the extent of 'f.b'}}
+  int c = TenElements[arg-2];
+  // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at negative byte offset}}
+  return a + b + c;
+}
+
+int assumingPlainOffset(struct foo f, int arg) {
+  // This TC is intended to check the corner case that the checker prints the
+  // shorter "offset" instead of "byte offset" when it's irrelevant that the
+  // offset is measured in bytes.
+
+  // expected-note@+2 {{Assuming 'arg' is < 2}}
+  // expected-note@+1 {{Taking false branch}}
+  if (arg >= 2)
+    return 0;
+
+  int b = ((int*)(f.b))[arg];
+  // expected-note@-1 {{Assuming byte offset is non-negative and less than 5, the extent of 'f.b'}}
+  // FIXME: this should be {{Assuming offset is non-negative}}
+  // but the current simplification algorithm doesn't realize that arg <= 1
+  // implies that the byte offset arg*4 will be less than 5.
+
+  int c = TenElements[arg+10];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
+  return b + c;
+}
+
+typedef __typeof(sizeof(int)) size_t;
+void *malloc(size_t size);
+void free(void *ptr);
+
+int assumingExtent(int arg) {
+  // Verify that the assumption note is printed when the extent is interesting
+  // (even if the index isn't interesting).
+  int *mem = (int*)malloc(arg);
+
+  mem[12] = 123;
+  // expected-note@-1 {{Assuming index '12' is less than the number of 'int' elements in the heap area}}
+
+  free(mem);
+
+  return TenElements[arg];
+  // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
+}
+
+int *extentInterestingness(int arg) {
+  // Verify that in an out-of-bounds access issue the extent is marked as
+  // interesting (so assumptions about its value are printed).
+  int *mem = (int*)malloc(arg);
+
+  TenElements[arg] = 123;
+  // expected-note@-1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'TenElements'}}
+
+  return &mem[12];
+  // expected-warning@-1 {{Out of bound access to memory after the end of the heap area}}
+  // expected-note@-2 {{Access of 'int' element in the heap area at index 12}}
+}
author	NagyDonat <donat.nagy@ericsson.com>	2024-02-05 17:07:14 +0100
committer	GitHub <noreply@github.com>	2024-02-05 17:07:14 +0100
commit	fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0 (patch)
tree	0ddddd40574332257980705b3204e448e6c46551
parent	f2c84211d2834c73ff874389c6bb47b1c76d391a (diff)
download	llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.zip llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.tar.gz llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.tar.bz2