diff options
author | NagyDonat <donat.nagy@ericsson.com> | 2024-02-05 17:07:14 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-05 17:07:14 +0100 |
commit | fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0 (patch) | |
tree | 0ddddd40574332257980705b3204e448e6c46551 | |
parent | f2c84211d2834c73ff874389c6bb47b1c76d391a (diff) | |
download | llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.zip llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.tar.gz llvm-fee204f0c9b3b77898c1faa2a7415b0f64f5e7f0.tar.bz2 |
[analyzer] Support interestingness in ArrayBoundV2 (#78315)
This commit improves alpha.security.ArrayBoundV2 in two connected areas:
(1) It calls `markInteresting()` on the symbolic values that are
responsible for the out of bounds access.
(2) Its index-is-in-bounds assumptions are reported in note tags if they
provide information about the value of an interesting symbol.
This commit is limited to "display" changes: it introduces new
diagnostic pieces (potentially to bugs found by other checkers), but
ArrayBoundV2 will make the same assumptions and detect the same bugs
before and after this change.
As a minor unrelated change, this commit also updates/removes some very
old comments which became obsolete due to my previous changes.
-rw-r--r-- | clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp | 391 | ||||
-rw-r--r-- | clang/test/Analysis/out-of-bounds-diagnostics.c | 142 | ||||
-rw-r--r-- | clang/test/Analysis/out-of-bounds-notes.c | 199 |
3 files changed, 607 insertions, 125 deletions
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp index 6c7a160..05fc00a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp @@ -33,7 +33,82 @@ using namespace taint; using llvm::formatv; namespace { -enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint }; +/// If `E` is a "clean" array subscript expression, return the type of the +/// accessed element. If the base of the subscript expression is modified by +/// pointer arithmetic (and not the beginning of a "full" memory region), this +/// always returns nullopt because that's the right (or the least bad) thing to +/// do for the diagnostic output that's relying on this. +static std::optional<QualType> determineElementType(const Expr *E, + const CheckerContext &C) { + const auto *ASE = dyn_cast<ArraySubscriptExpr>(E); + if (!ASE) + return std::nullopt; + + const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion(); + if (!SubscriptBaseReg) + return std::nullopt; + + // The base of the subscript expression is affected by pointer arithmetics, + // so we want to report byte offsets instead of indices. + if (isa<ElementRegion>(SubscriptBaseReg->StripCasts())) + return std::nullopt; + + return ASE->getType(); +} + +static std::optional<int64_t> +determineElementSize(const std::optional<QualType> T, const CheckerContext &C) { + if (!T) + return std::nullopt; + return C.getASTContext().getTypeSizeInChars(*T).getQuantity(); +} + +class StateUpdateReporter { + const SubRegion *Reg; + const NonLoc ByteOffsetVal; + const std::optional<QualType> ElementType; + const std::optional<int64_t> ElementSize; + bool AssumedNonNegative = false; + std::optional<NonLoc> AssumedUpperBound = std::nullopt; + +public: + StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E, + CheckerContext &C) + : Reg(R), ByteOffsetVal(ByteOffsVal), + ElementType(determineElementType(E, C)), + ElementSize(determineElementSize(ElementType, C)) {} + + void recordNonNegativeAssumption() { AssumedNonNegative = true; } + void recordUpperBoundAssumption(NonLoc UpperBoundVal) { + AssumedUpperBound = UpperBoundVal; + } + + const NoteTag *createNoteTag(CheckerContext &C) const; + +private: + std::string getMessage(PathSensitiveBugReport &BR) const; + + /// Return true if information about the value of `Sym` can put constraints + /// on some symbol which is interesting within the bug report `BR`. + /// In particular, this returns true when `Sym` is interesting within `BR`; + /// but it also returns true if `Sym` is an expression that contains integer + /// constants and a single symbolic operand which is interesting (in `BR`). + /// We need to use this instead of plain `BR.isInteresting()` because if we + /// are analyzing code like + /// int array[10]; + /// int f(int arg) { + /// return array[arg] && array[arg + 10]; + /// } + /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not + /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough + /// to detect this out of bounds access). + static bool providesInformationAboutInteresting(SymbolRef Sym, + PathSensitiveBugReport &BR); + static bool providesInformationAboutInteresting(SVal SV, + PathSensitiveBugReport &BR) { + return providesInformationAboutInteresting(SV.getAsSymbol(), BR); + } +}; struct Messages { std::string Short, Full; @@ -54,11 +129,19 @@ class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>, void performCheck(const Expr *E, CheckerContext &C) const; - void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind, - NonLoc Offset, Messages Msgs) const; + void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs, + NonLoc Offset, std::optional<NonLoc> Extent, + bool IsTaintBug = false) const; + + static void markPartsInteresting(PathSensitiveBugReport &BR, + ProgramStateRef ErrorState, NonLoc Val, + bool MarkTaint); static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC); + static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State, + NonLoc Offset, NonLoc Limit, + CheckerContext &C); static bool isInAddressOf(const Stmt *S, ASTContext &AC); public: @@ -133,12 +216,26 @@ computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) { return std::nullopt; } -// TODO: once the constraint manager is smart enough to handle non simplified -// symbolic expressions remove this function. Note that this can not be used in -// the constraint manager as is, since this does not handle overflows. It is -// safe to assume, however, that memory offsets will not overflow. -// NOTE: callers of this function need to be aware of the effects of overflows -// and signed<->unsigned conversions! +// NOTE: This function is the "heart" of this checker. It simplifies +// inequalities with transformations that are valid (and very elementary) in +// pure mathematics, but become invalid if we use them in C++ number model +// where the calculations may overflow. +// Due to the overflow issues I think it's impossible (or at least not +// practical) to integrate this kind of simplification into the resolution of +// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function +// produces valid results when the calculations are handling memory offsets +// and every value is well below SIZE_MAX. +// TODO: This algorithm should be moved to a central location where it's +// available for other checkers that need to compare memory offsets. +// NOTE: the simplification preserves the order of the two operands in a +// mathematical sense, but it may change the result produced by a C++ +// comparison operator (and the automatic type conversions). +// For example, consider a comparison "X+1 < 0", where the LHS is stored as a +// size_t and the RHS is stored in an int. (As size_t is unsigned, this +// comparison is false for all values of "X".) However, the simplification may +// turn it into "X < -1", which is still always false in a mathematical sense, +// but can produce a true result when evaluated by `evalBinOp` (which follows +// the rules of C++ and casts -1 to SIZE_MAX). static std::pair<NonLoc, nonloc::ConcreteInt> getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, SValBuilder &svalBuilder) { @@ -239,13 +336,8 @@ static std::optional<int64_t> getConcreteValue(NonLoc SV) { return std::nullopt; } -static std::string getShortMsg(OOB_Kind Kind, std::string RegName) { - static const char *ShortMsgTemplates[] = { - "Out of bound access to memory preceding {0}", - "Out of bound access to memory after the end of {0}", - "Potential out of bound access to {0} with tainted offset"}; - - return formatv(ShortMsgTemplates[Kind], RegName); +static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) { + return SV ? getConcreteValue(*SV) : std::nullopt; } static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) { @@ -255,7 +347,28 @@ static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) { Out << "Access of " << RegName << " at negative byte offset"; if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>()) Out << ' ' << ConcreteIdx->getValue(); - return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)}; + return {formatv("Out of bound access to memory preceding {0}", RegName), + std::string(Buf)}; +} + +/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if +/// it can be performed (`Divisor` is nonzero and there is no remainder). The +/// values `Val1` and `Val2` may be nullopt and in that case the corresponding +/// division is considered to be successful. +static bool tryDividePair(std::optional<int64_t> &Val1, + std::optional<int64_t> &Val2, int64_t Divisor) { + if (!Divisor) + return false; + const bool Val1HasRemainder = Val1 && *Val1 % Divisor; + const bool Val2HasRemainder = Val2 && *Val2 % Divisor; + if (!Val1HasRemainder && !Val2HasRemainder) { + if (Val1) + *Val1 /= Divisor; + if (Val2) + *Val2 /= Divisor; + return true; + } + return false; } static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, @@ -268,18 +381,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, std::optional<int64_t> OffsetN = getConcreteValue(Offset); std::optional<int64_t> ExtentN = getConcreteValue(Extent); - bool UseByteOffsets = true; - if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) { - const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize; - const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize; - if (!OffsetHasRemainder && !ExtentHasRemainder) { - UseByteOffsets = false; - if (OffsetN) - *OffsetN /= ElemSize; - if (ExtentN) - *ExtentN /= ElemSize; - } - } + int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity(); + + bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize); SmallString<256> Buf; llvm::raw_svector_ostream Out(Buf); @@ -307,7 +411,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, Out << "s"; } - return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)}; + return { + formatv("Out of bound access to memory after the end of {0}", RegName), + std::string(Buf)}; } static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) { @@ -318,17 +424,91 @@ static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) { RegName, OffsetName)}; } -void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { - // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping - // some new logic here that reasons directly about memory region extents. - // Once that logic is more mature, we can bring it back to assumeInBound() - // for all clients to use. - // - // The algorithm we are using here for bounds checking is to see if the - // memory access is within the extent of the base region. Since we - // have some flexibility in defining the base region, we can achieve - // various levels of conservatism in our buffer overflow checking. +const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const { + // Don't create a note tag if we didn't assume anything: + if (!AssumedNonNegative && !AssumedUpperBound) + return nullptr; + + return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string { + return getMessage(BR); + }); +} + +std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const { + bool ShouldReportNonNegative = AssumedNonNegative; + if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) { + if (AssumedUpperBound && + providesInformationAboutInteresting(*AssumedUpperBound, BR)) { + // Even if the byte offset isn't interesting (e.g. it's a constant value), + // the assumption can still be interesting if it provides information + // about an interesting symbolic upper bound. + ShouldReportNonNegative = false; + } else { + // We don't have anything interesting, don't report the assumption. + return ""; + } + } + + std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal); + std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound); + + const bool UseIndex = + ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize); + + SmallString<256> Buf; + llvm::raw_svector_ostream Out(Buf); + Out << "Assuming "; + if (UseIndex) { + Out << "index "; + if (OffsetN) + Out << "'" << OffsetN << "' "; + } else if (AssumedUpperBound) { + Out << "byte offset "; + if (OffsetN) + Out << "'" << OffsetN << "' "; + } else { + Out << "offset "; + } + + Out << "is"; + if (ShouldReportNonNegative) { + Out << " non-negative"; + } + if (AssumedUpperBound) { + if (ShouldReportNonNegative) + Out << " and"; + Out << " less than "; + if (ExtentN) + Out << *ExtentN << ", "; + if (UseIndex && ElementType) + Out << "the number of '" << ElementType->getAsString() + << "' elements in "; + else + Out << "the extent of "; + Out << getRegionName(Reg); + } + return std::string(Out.str()); +} + +bool StateUpdateReporter::providesInformationAboutInteresting( + SymbolRef Sym, PathSensitiveBugReport &BR) { + if (!Sym) + return false; + for (SymbolRef PartSym : Sym->symbols()) { + // The interestingess mark may appear on any layer as we're stripping off + // the SymIntExpr, UnarySymExpr etc. layers... + if (BR.isInteresting(PartSym)) + return true; + // ...but if both sides of the expression are symbolic, then there is no + // practical algorithm to produce separate constraints for the two + // operands (from the single combined result). + if (isa<SymSymExpr>(PartSym)) + return false; + } + return false; +} +void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { const SVal Location = C.getSVal(E); // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as @@ -350,6 +530,10 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { auto [Reg, ByteOffset] = *RawOffset; + // The state updates will be reported as a single note tag, which will be + // composed by this helper class. + StateUpdateReporter SUR(Reg, ByteOffset, E, C); + // CHECK LOWER BOUND const MemSpaceRegion *Space = Reg->getMemorySpace(); if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) { @@ -363,13 +547,22 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold( State, ByteOffset, SVB.makeZeroArrayIndex(), SVB); - if (PrecedesLowerBound && !WithinLowerBound) { - // We know that the index definitely precedes the lower bound. - Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); - reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs); - return; + if (PrecedesLowerBound) { + // The offset may be invalid (negative)... + if (!WithinLowerBound) { + // ...and it cannot be valid (>= 0), so report an error. + Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); + reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt); + return; + } + // ...but it can be valid as well, so the checker will (optimistically) + // assume that it's valid and mention this in the note tag. + SUR.recordNonNegativeAssumption(); } + // Actually update the state. The "if" only fails in the extremely unlikely + // case when compareValueToThreshold returns {nullptr, nullptr} becasue + // evalBinOpNN fails to evaluate the less-than operator. if (WithinLowerBound) State = WithinLowerBound; } @@ -381,32 +574,28 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { compareValueToThreshold(State, ByteOffset, *KnownSize, SVB); if (ExceedsUpperBound) { + // The offset may be invalid (>= Size)... if (!WithinUpperBound) { - // We know that the index definitely exceeds the upper bound. - if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) { - // ...but this is within an addressof expression, so we need to check - // for the exceptional case that `&array[size]` is valid. - auto [EqualsToThreshold, NotEqualToThreshold] = - compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize, - SVB, /*CheckEquality=*/true); - if (EqualsToThreshold && !NotEqualToThreshold) { - // We are definitely in the exceptional case, so return early - // instead of reporting a bug. - C.addTransition(EqualsToThreshold); - return; - } + // ...and it cannot be within bounds, so report an error, unless we can + // definitely determine that this is an idiomatic `&array[size]` + // expression that calculates the past-the-end pointer. + if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset, + *KnownSize, C)) { + C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C)); + return; } + Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize, Location); - reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs); + reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize); return; } + // ...and it can be valid as well... if (isTainted(State, ByteOffset)) { - // Both cases are possible, but the offset is tainted, so report. - std::string RegName = getRegionName(Reg); + // ...but it's tainted, so report an error. - // Diagnostic detail: "tainted offset" is always correct, but the - // common case is that 'idx' is tainted in 'arr[idx]' and then it's + // Diagnostic detail: saying "tainted offset" is always correct, but + // the common case is that 'idx' is tainted in 'arr[idx]' and then it's // nicer to say "tainted index". const char *OffsetName = "offset"; if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) @@ -414,33 +603,77 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { OffsetName = "index"; Messages Msgs = getTaintMsgs(Reg, OffsetName); - reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs); + reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize, + /*IsTaintBug=*/true); return; } + // ...and it isn't tainted, so the checker will (optimistically) assume + // that the offset is in bounds and mention this in the note tag. + SUR.recordUpperBoundAssumption(*KnownSize); } + // Actually update the state. The "if" only fails in the extremely unlikely + // case when compareValueToThreshold returns {nullptr, nullptr} becasue + // evalBinOpNN fails to evaluate the less-than operator. if (WithinUpperBound) State = WithinUpperBound; } - C.addTransition(State); + // Add a transition, reporting the state updates that we accumulated. + C.addTransition(State, SUR.createNoteTag(C)); +} + +void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR, + ProgramStateRef ErrorState, + NonLoc Val, bool MarkTaint) { + if (SymbolRef Sym = Val.getAsSymbol()) { + // If the offset is a symbolic value, iterate over its "parts" with + // `SymExpr::symbols()` and mark each of them as interesting. + // For example, if the offset is `x*4 + y` then we put interestingness onto + // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols + // `x` and `y`. + for (SymbolRef PartSym : Sym->symbols()) + BR.markInteresting(PartSym); + } + + if (MarkTaint) { + // If the issue that we're reporting depends on the taintedness of the + // offset, then put interestingness onto symbols that could be the origin + // of the taint. Note that this may find symbols that did not appear in + // `Sym->symbols()` (because they're only loosely connected to `Val`). + for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val)) + BR.markInteresting(Sym); + } } void ArrayBoundCheckerV2::reportOOB(CheckerContext &C, - ProgramStateRef ErrorState, OOB_Kind Kind, - NonLoc Offset, Messages Msgs) const { + ProgramStateRef ErrorState, Messages Msgs, + NonLoc Offset, std::optional<NonLoc> Extent, + bool IsTaintBug /*=false*/) const { ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState); if (!ErrorNode) return; auto BR = std::make_unique<PathSensitiveBugReport>( - Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode); - - // Track back the propagation of taintedness. - if (Kind == OOB_Taint) - for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset)) - BR->markInteresting(Sym); + IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode); + + // FIXME: ideally we would just call trackExpressionValue() and that would + // "do the right thing": mark the relevant symbols as interesting, track the + // control dependencies and statements storing the relevant values and add + // helpful diagnostic pieces. However, right now trackExpressionValue() is + // a heap of unreliable heuristics, so it would cause several issues: + // - Interestingness is not applied consistently, e.g. if `array[x+10]` + // causes an overflow, then `x` is not marked as interesting. + // - We get irrelevant diagnostic pieces, e.g. in the code + // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;` + // it places a "Storing uninitialized value" note on the `malloc` call + // (which is technically true, but irrelevant). + // If trackExpressionValue() becomes reliable, it should be applied instead + // of this custom markPartsInteresting(). + markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug); + if (Extent) + markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug); C.emitReport(std::move(BR)); } @@ -476,6 +709,18 @@ bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) { return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf; } +bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E, + ProgramStateRef State, + NonLoc Offset, NonLoc Limit, + CheckerContext &C) { + if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) { + auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold( + State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true); + return EqualsToThreshold && !NotEqualToThreshold; + } + return false; +} + void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) { mgr.registerChecker<ArrayBoundCheckerV2>(); } diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c index 769a895..0c3c67c 100644 --- a/clang/test/Analysis/out-of-bounds-diagnostics.c +++ b/clang/test/Analysis/out-of-bounds-diagnostics.c @@ -1,20 +1,20 @@ // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text \ // RUN: -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s -int array[10]; +int TenElements[10]; void arrayUnderflow(void) { - array[-3] = 5; - // expected-warning@-1 {{Out of bound access to memory preceding 'array'}} - // expected-note@-2 {{Access of 'array' at negative byte offset -12}} + TenElements[-3] = 5; + // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at negative byte offset -12}} } int underflowWithDeref(void) { - int *p = array; + int *p = TenElements; --p; return *p; - // expected-warning@-1 {{Out of bound access to memory preceding 'array'}} - // expected-note@-2 {{Access of 'array' at negative byte offset -4}} + // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at negative byte offset -4}} } int scanf(const char *restrict fmt, ...); @@ -24,30 +24,31 @@ void taintedIndex(void) { scanf("%d", &index); // expected-note@-1 {{Taint originated here}} // expected-note@-2 {{Taint propagated to the 2nd argument}} - array[index] = 5; - // expected-warning@-1 {{Potential out of bound access to 'array' with tainted index}} - // expected-note@-2 {{Access of 'array' with a tainted index that may be too large}} + TenElements[index] = 5; + // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}} + // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be too large}} } int *taintedIndexAfterTheEndPtr(void) { // NOTE: Technically speaking, this testcase does not trigger any UB because - // &array[10] is the after-the-end pointer which is well-defined; but this is - // a bug-prone situation and far from the idiomatic use of `&array[size]`, so - // it's better to report an error. This report can be easily silenced by - // writing array+index instead of &array[index]. + // &TenElements[10] is the after-the-end pointer which is well-defined; but + // this is a bug-prone situation and far from the idiomatic use of + // `&TenElements[size]`, so it's better to report an error. This report can + // be easily silenced by writing TenElements+index instead of + // &TenElements[index]. int index; scanf("%d", &index); // expected-note@-1 {{Taint originated here}} // expected-note@-2 {{Taint propagated to the 2nd argument}} if (index < 0 || index > 10) - return array; + return TenElements; // expected-note@-2 {{Assuming 'index' is >= 0}} // expected-note@-3 {{Left side of '||' is false}} // expected-note@-4 {{Assuming 'index' is <= 10}} // expected-note@-5 {{Taking false branch}} - return &array[index]; - // expected-warning@-1 {{Potential out of bound access to 'array' with tainted index}} - // expected-note@-2 {{Access of 'array' with a tainted index that may be too large}} + return &TenElements[index]; + // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}} + // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be too large}} } void taintedOffset(void) { @@ -55,57 +56,57 @@ void taintedOffset(void) { scanf("%d", &index); // expected-note@-1 {{Taint originated here}} // expected-note@-2 {{Taint propagated to the 2nd argument}} - int *p = array + index; + int *p = TenElements + index; p[0] = 5; - // expected-warning@-1 {{Potential out of bound access to 'array' with tainted offset}} - // expected-note@-2 {{Access of 'array' with a tainted offset that may be too large}} + // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted offset}} + // expected-note@-2 {{Access of 'TenElements' with a tainted offset that may be too large}} } void arrayOverflow(void) { - array[12] = 5; - // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}} - // expected-note@-2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}} + TenElements[12] = 5; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at index 12, while it holds only 10 'int' elements}} } void flippedOverflow(void) { - 12[array] = 5; - // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}} - // expected-note@-2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}} + 12[TenElements] = 5; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at index 12, while it holds only 10 'int' elements}} } int *afterTheEndPtr(void) { - // This is an unusual but standard-compliant way of writing (array + 10). - return &array[10]; // no-warning + // This is an unusual but standard-compliant way of writing (TenElements + 10). + return &TenElements[10]; // no-warning } int useAfterTheEndPtr(void) { // ... but dereferencing the after-the-end pointer is still invalid. return *afterTheEndPtr(); - // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}} - // expected-note@-2 {{Access of 'array' at index 10, while it holds only 10 'int' elements}} + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at index 10, while it holds only 10 'int' elements}} } int *afterAfterTheEndPtr(void) { // This is UB, it's invalid to form an after-after-the-end pointer. - return &array[11]; - // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}} - // expected-note@-2 {{Access of 'array' at index 11, while it holds only 10 'int' elements}} + return &TenElements[11]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at index 11, while it holds only 10 'int' elements}} } int *potentialAfterTheEndPtr(int idx) { if (idx < 10) { /* ...do something... */ } // expected-note@-1 {{Assuming 'idx' is >= 10}} // expected-note@-2 {{Taking false branch}} - return &array[idx]; - // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}} - // expected-note@-2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}} + return &TenElements[idx]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} // NOTE: On the idx >= 10 branch the normal "optimistic" behavior would've // been continuing with the assumption that idx == 10 and the return value is // a legitimate after-the-end pointer. The checker deviates from this by // reporting an error because this situation is very suspicious and far from - // the idiomatic `&array[size]` expressions. If the report is FP, the - // developer can easily silence it by writing array+idx instead of - // &array[idx]. + // the idiomatic `&TenElements[size]` expressions. If the report is FP, the + // developer can easily silence it by writing TenElements+idx instead of + // &TenElements[idx]. } int scalar; @@ -156,9 +157,9 @@ int arrayOfStructsArrow(void) { } short convertedArray(void) { - return ((short*)array)[47]; - // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}} - // expected-note@-2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}} + return ((short*)TenElements)[47]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at index 47, while it holds only 20 'short' elements}} } struct two_bytes { @@ -195,12 +196,22 @@ void *malloc(size_t size); int *mallocRegion(void) { int *mem = (int*)malloc(2*sizeof(int)); + mem[3] = -2; // expected-warning@-1 {{Out of bound access to memory after the end of the heap area}} // expected-note@-2 {{Access of the heap area at index 3, while it holds only 2 'int' elements}} return mem; } +int *mallocRegionDeref(void) { + int *mem = (int*)malloc(2*sizeof(int)); + + *(mem + 3) = -2; + // expected-warning@-1 {{Out of bound access to memory after the end of the heap area}} + // expected-note@-2 {{Access of the heap area at index 3, while it holds only 2 'int' elements}} + return mem; +} + void *alloca(size_t size); int allocaRegion(void) { @@ -211,34 +222,61 @@ int allocaRegion(void) { return *mem; } -int *unknownExtent(int arg) { - if (arg >= 2) +int *symbolicExtent(int arg) { + // expected-note@+2 {{Assuming 'arg' is < 5}} + // expected-note@+1 {{Taking false branch}} + if (arg >= 5) return 0; int *mem = (int*)malloc(arg); + + // TODO: without the following reference to 'arg', the analyzer would discard + // the range information about (the symbolic value of) 'arg'. This is + // incorrect because while the variable itself is inaccessible, it becomes + // the symbolic extent of 'mem', so we still want to reason about its + // potential values. + (void)arg; + mem[8] = -2; - // FIXME: this should produce - // {{Out of bound access to memory after the end of the heap area}} - // {{Access of 'int' element in the heap area at index 8}} + // expected-warning@-1 {{Out of bound access to memory after the end of the heap area}} + // expected-note@-2 {{Access of 'int' element in the heap area at index 8}} return mem; } -void unknownIndex(int arg) { +int *symbolicExtentDiscardedRangeInfo(int arg) { + // This is a copy of the case 'symbolicExtent' without the '(void)arg' hack. + // TODO: if the analyzer can detect the out-of-bounds access within this + // testcase, then remove this and the `(void)arg` hack from `symbolicExtent`. + if (arg >= 5) + return 0; + int *mem = (int*)malloc(arg); + mem[8] = -2; + return mem; +} + +void symbolicIndex(int arg) { // expected-note@+2 {{Assuming 'arg' is >= 12}} // expected-note@+1 {{Taking true branch}} if (arg >= 12) - array[arg] = -2; - // expected-warning@-1 {{Out of bound access to memory after the end of 'array'}} - // expected-note@-2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}} + TenElements[arg] = -2; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} } int *nothingIsCertain(int x, int y) { if (x >= 2) return 0; int *mem = (int*)malloc(x); + if (y >= 8) mem[y] = -2; // FIXME: this should produce // {{Out of bound access to memory after the end of the heap area}} // {{Access of 'int' element in the heap area at an overflowing index}} + // but apparently the analyzer isn't smart enough to deduce this. + + // Keep constraints alive. (Without this, the overeager garbage collection of + // constraints would _also_ prevent the intended behavior in this testcase.) + (void)x; + return mem; } diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c new file mode 100644 index 0000000..c29b6f8 --- /dev/null +++ b/clang/test/Analysis/out-of-bounds-notes.c @@ -0,0 +1,199 @@ +// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text \ +// RUN: -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s + +int TenElements[10]; + +int irrelevantAssumptions(int arg) { + int a = TenElements[arg]; + // Here the analyzer assumes that `arg` is in bounds, but doesn't report this + // because `arg` is not interesting for the bug. + int b = TenElements[13]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at index 13, while it holds only 10 'int' elements}} + return a + b; +} + + +int assumingBoth(int arg) { + int a = TenElements[arg]; + // expected-note@-1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'TenElements'}} + int b = TenElements[arg]; // no additional note, we already assumed that 'arg' is in bounds + int c = TenElements[arg + 10]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} + return a + b + c; +} + +int assumingBothPointerToMiddle(int arg) { + // If we're accessing an TenElements through a pointer pointing to its middle, the checker + // will speak about the "byte offset" measured from the beginning of the TenElements. + int *p = TenElements + 2; + int a = p[arg]; + // FIXME: The following note does not appear: + // {{Assuming byte offset is non-negative and less than 40, the extent of 'TenElements'}} + // It seems that the analyzer "gives up" modeling this pointer arithmetics + // and says that `p[arg]` is just an UnknownVal (instead of calculating that + // it's equivalent to `TenElements[2+arg]`). + + int b = TenElements[arg]; // This is normal access, and only the lower bound is new. + // expected-note@-1 {{Assuming index is non-negative}} + int c = TenElements[arg + 10]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} + return a + b + c; +} + +int assumingLower(int arg) { + // expected-note@+2 {{Assuming 'arg' is < 10}} + // expected-note@+1 {{Taking false branch}} + if (arg >= 10) + return 0; + int a = TenElements[arg]; + // expected-note@-1 {{Assuming index is non-negative}} + int b = TenElements[arg + 10]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} + return a + b; +} + +int assumingUpper(int arg) { + // expected-note@+2 {{Assuming 'arg' is >= 0}} + // expected-note@+1 {{Taking false branch}} + if (arg < 0) + return 0; + int a = TenElements[arg]; + // expected-note@-1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}} + int b = TenElements[arg - 10]; + // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at negative byte offset}} + return a + b; +} + +int assumingUpperIrrelevant(int arg) { + // FIXME: The assumption "assuming index is less than 10" is printed because + // it's assuming something about the interesting variable `arg`; however, + // it's irrelevant because in this testcase the out of bound access is + // deduced from the _lower_ bound on `arg`. Currently the analyzer cannot + // filter out assumptions that are logically irrelevant but "touch" + // interesting symbols; eventually it would be good to add support for this. + + // expected-note@+2 {{Assuming 'arg' is >= 0}} + // expected-note@+1 {{Taking false branch}} + if (arg < 0) + return 0; + int a = TenElements[arg]; + // expected-note@-1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}} + int b = TenElements[arg + 10]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} + return a + b; +} + +int assumingUpperUnsigned(unsigned arg) { + int a = TenElements[arg]; + // expected-note@-1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}} + int b = TenElements[(int)arg - 10]; + // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at negative byte offset}} + return a + b; +} + +int assumingNothing(unsigned arg) { + // expected-note@+2 {{Assuming 'arg' is < 10}} + // expected-note@+1 {{Taking false branch}} + if (arg >= 10) + return 0; + int a = TenElements[arg]; // no note here, we already know that 'arg' is in bounds + int b = TenElements[(int)arg - 10]; + // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at negative byte offset}} + return a + b; +} + +short assumingConvertedToCharP(int arg) { + // When indices are reported, the note will use the element type that's the + // result type of the subscript operator. + char *cp = (char*)TenElements; + char a = cp[arg]; + // expected-note@-1 {{Assuming index is non-negative and less than 40, the number of 'char' elements in 'TenElements'}} + char b = cp[arg]; // no additional note, we already assumed that 'arg' is in bounds + char c = cp[arg + 40]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 40 'char' elements}} + return a + b + c; +} + +struct foo { + int num; + char a[8]; + char b[5]; +}; + +int assumingConvertedToIntP(struct foo f, int arg) { + // When indices are reported, the note will use the element type that's the + // result type of the subscript operator. + int a = ((int*)(f.a))[arg]; + // expected-note@-1 {{Assuming index is non-negative and less than 2, the number of 'int' elements in 'f.a'}} + // However, if the extent of the memory region is not divisible by the + // element size, the checker measures the offset and extent in bytes. + int b = ((int*)(f.b))[arg]; + // expected-note@-1 {{Assuming byte offset is less than 5, the extent of 'f.b'}} + int c = TenElements[arg-2]; + // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at negative byte offset}} + return a + b + c; +} + +int assumingPlainOffset(struct foo f, int arg) { + // This TC is intended to check the corner case that the checker prints the + // shorter "offset" instead of "byte offset" when it's irrelevant that the + // offset is measured in bytes. + + // expected-note@+2 {{Assuming 'arg' is < 2}} + // expected-note@+1 {{Taking false branch}} + if (arg >= 2) + return 0; + + int b = ((int*)(f.b))[arg]; + // expected-note@-1 {{Assuming byte offset is non-negative and less than 5, the extent of 'f.b'}} + // FIXME: this should be {{Assuming offset is non-negative}} + // but the current simplification algorithm doesn't realize that arg <= 1 + // implies that the byte offset arg*4 will be less than 5. + + int c = TenElements[arg+10]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} + return b + c; +} + +typedef __typeof(sizeof(int)) size_t; +void *malloc(size_t size); +void free(void *ptr); + +int assumingExtent(int arg) { + // Verify that the assumption note is printed when the extent is interesting + // (even if the index isn't interesting). + int *mem = (int*)malloc(arg); + + mem[12] = 123; + // expected-note@-1 {{Assuming index '12' is less than the number of 'int' elements in the heap area}} + + free(mem); + + return TenElements[arg]; + // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} + // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} +} + +int *extentInterestingness(int arg) { + // Verify that in an out-of-bounds access issue the extent is marked as + // interesting (so assumptions about its value are printed). + int *mem = (int*)malloc(arg); + + TenElements[arg] = 123; + // expected-note@-1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'TenElements'}} + + return &mem[12]; + // expected-warning@-1 {{Out of bound access to memory after the end of the heap area}} + // expected-note@-2 {{Access of 'int' element in the heap area at index 12}} +} |