[analyzer] Check the correct first and last elements in cstring.UninitializedRead (#95408)

I intend to fix this checker up so that we can move it out of alpha. I made a bunch of analyses, and found many similar false positives: ```c++ int t[] = {1,2,3}; memcpy(dst, t, sizeof(t) / sizeof(t[0])); // warn ``` The problem here is the way CStringChecker checks whether the destination and source buffers are initialized: heuristically, it only checks the first and last element. This is fine, however, it retrieves these elements as characters, even if the underlaying object is not a character array. Reading the last byte of an integer is undefined, so the checker emits a bug here. A quick search tells you the rationale: "Both objects are reinterpreted as arrays of unsigned char.". But the static analyzer right now can't check byte-by-byte if a memory region is _initialized_, it can only check if its a well-defined character or not. In this patch, I pry the original array out of the arguments to memcpy (and similar functions), and retrieve the actual first and last elements according to the array's actual element type. Currently, my improvements reduced the number of reports to 29 on these projects: memcached,tmux,curl,twin,vim,openssl,sqlite,ffmpeg,postgres https://codechecker-demo.eastus.cloudapp.azure.com/Default/reports?detection-status=New&detection-status=Reopened&detection-status=Unresolved&is-unique=on&run=%2acstring_uninit_upper_bound_patched&newcheck=%2acstring_uninit_upper_bounds_patched&diff-type=New&checker-name=alpha.unix.cstring.UninitializedRead&items-per-page=100 Before my patch, there were 87. https://codechecker-demo.eastus.cloudapp.azure.com/Default/reports?detection-status=New&detection-status=Reopened&detection-status=Unresolved&is-unique=on&run=%2acstring_uninit_baseline&newcheck=%2acstring_uninit_upper_bounds_patched&diff-type=New&checker-name=alpha.unix.cstring.UninitializedRead&items-per-page=100
author: Kristóf Umann <dkszelethus@gmail.com> 2024-07-04 13:46:22 +0200
committer: GitHub <noreply@github.com> 2024-07-04 13:46:22 +0200
commit: 483557224b8d36761f39d5847e17ef7361757f1b (patch)
tree: cb4ce70453fc892ad86ea3bd9b06cab4faf4ae62 /clang/test
parent: d6af73e9fbc84315100499a096f17ec5eeeeea23 (diff)
download: llvm-483557224b8d36761f39d5847e17ef7361757f1b.zip
llvm-483557224b8d36761f39d5847e17ef7361757f1b.tar.gz
llvm-483557224b8d36761f39d5847e17ef7361757f1b.tar.bz2
1 files changed, 96 insertions, 23 deletions
diff --git a/clang/test/Analysis/bstring_UninitRead.c b/clang/test/Analysis/bstring_UninitRead.c
index c535e01..45e38dd 100644
--- a/clang/test/Analysis/bstring_UninitRead.c
+++ b/clang/test/Analysis/bstring_UninitRead.c
@@ -1,12 +1,11 @@
 // RUN: %clang_analyze_cc1 -verify %s \
 // RUN: -analyzer-checker=core,alpha.unix.cstring
 
-
-// This file is generally for the alpha.unix.cstring.UninitializedRead Checker, the reason for putting it into
-// the separate file because the checker is break the some existing test cases in bstring.c file , so we don't 
-// wanna mess up with some existing test case so it's better to create separate file for it, this file also include 
-// the broken test for the reference in future about the broken tests.
-
+//===----------------------------------------------------------------------===//
+// mempcpy() using character array. This is the easiest case, as memcpy
+// intepretrs the dst and src buffers as character arrays (regardless of their
+// actual type).
+//===----------------------------------------------------------------------===//
 
 typedef typeof(sizeof(int)) size_t;
 
@@ -14,46 +13,120 @@ void clang_analyzer_eval(int);
 
 void *memcpy(void *restrict s1, const void *restrict s2, size_t n);
 
-void top(char *dst) {
+void memcpy_array_fully_uninit(char *dst) {
+  char buf[10];
+  memcpy(dst, buf, 10); // expected-warning{{The first element of the 2nd argument is undefined}}
+                        // expected-note@-1{{Other elements might also be undefined}}
+  (void)buf;
+}
+
+void memcpy_array_partially_uninit(char *dst) {
   char buf[10];
-  memcpy(dst, buf, 10); // expected-warning{{Bytes string function accesses uninitialized/garbage values}}
+  buf[0] = 'i';
+  memcpy(dst, buf, 10); // expected-warning{{The last accessed element (at index 9) in the 2nd argument is undefined}}
+                        // expected-note@-1{{Other elements might also be undefined}}
+  (void)buf;
+}
+
+void memcpy_array_only_init_portion(char *dst) {
+  char buf[10];
+  buf[0] = 'i';
+  memcpy(dst, buf, 1);
+  (void)buf;
+}
+
+void memcpy_array_partially_init_error(char *dst) {
+  char buf[10];
+  buf[0] = 'i';
+  memcpy(dst, buf, 2); // expected-warning{{The last accessed element (at index 1) in the 2nd argument is undefined}}
+                      // expected-note@-1{{Other elements might also be undefined}}
+  (void)buf;
+}
+
+// The interesting case here is that the portion we're copying is initialized,
+// but not the whole matrix. We need to be careful to extract buf[1], and not
+// buf when trying to peel region layers off from the source argument.
+void memcpy_array_from_matrix(char *dst) {
+  char buf[2][2];
+  buf[1][0] = 'i';
+  buf[1][1] = 'j';
+  // FIXME: This is a FP -- we mistakenly retrieve the first element of buf,
+  // instead of the first element of buf[1]. getLValueElement simply peels off
+  // another ElementRegion layer, when in this case it really shouldn't.
+  memcpy(dst, buf[1], 2); // expected-warning{{The first element of the 2nd argument is undefined}}
+                          // expected-note@-1{{Other elements might also be undefined}}
   (void)buf;
 }
 
-//===----------------------------------------------------------------------===
-// mempcpy()
-//===----------------------------------------------------------------------===
+//===----------------------------------------------------------------------===//
+// mempcpy() using non-character arrays.
+//===----------------------------------------------------------------------===//
 
 void *mempcpy(void *restrict s1, const void *restrict s2, size_t n);
 
-void mempcpy14() {
+void memcpy_int_array_fully_init() {
   int src[] = {1, 2, 3, 4};
   int dst[5] = {0};
   int *p;
 
-  p = mempcpy(dst, src, 4 * sizeof(int)); // expected-warning{{Bytes string function accesses uninitialized/garbage values}}
-   // FIXME: This behaviour is actually surprising and needs to be fixed, 
-   // mempcpy seems to consider the very last byte of the src buffer uninitialized
-   // and returning undef unfortunately. It should have returned unknown or a conjured value instead.
+  p = mempcpy(dst, src, 4 * sizeof(int));
+  clang_analyzer_eval(p == &dst[4]);
+}
 
-  clang_analyzer_eval(p == &dst[4]); // no-warning (above is fatal)
+void memcpy_int_array_fully_init2(int *dest) {
+  int t[] = {1, 2, 3};
+  memcpy(dest, t, sizeof(t));
 }
 
+//===----------------------------------------------------------------------===//
+// mempcpy() using nonarrays.
+//===----------------------------------------------------------------------===//
+
 struct st {
   int i;
   int j;
 };
 
-
-void mempcpy15() {
+void mempcpy_struct_partially_uninit() {
   struct st s1 = {0};
   struct st s2;
   struct st *p1;
   struct st *p2;
 
   p1 = (&s2) + 1;
-  p2 = mempcpy(&s2, &s1, sizeof(struct st)); // expected-warning{{Bytes string function accesses uninitialized/garbage values}}
-  // FIXME: It seems same as mempcpy14() case.
-  
-  clang_analyzer_eval(p1 == p2); // no-warning (above is fatal)
+
+  // FIXME: Maybe ask UninitializedObjectChecker whether s1 is fully
+  // initialized?
+  p2 = mempcpy(&s2, &s1, sizeof(struct st));
+
+  clang_analyzer_eval(p1 == p2);
+}
+
+void mempcpy_struct_fully_uninit() {
+  struct st s1;
+  struct st s2;
+
+  // FIXME: Maybe ask UninitializedObjectChecker whether s1 is fully
+  // initialized?
+  mempcpy(&s2, &s1, sizeof(struct st));
+}
+
+// Creduced crash. In this case, an symbolicregion is wrapped in an
+// elementregion for the src argument.
+void *ga_copy_strings_from_0;
+void *memmove();
+void alloc();
+void ga_copy_strings() {
+  int i = 0;
+  for (;; ++i)
+    memmove(alloc, ((char **)ga_copy_strings_from_0)[i], 1);
+}
+
+// Creduced crash. In this case, retrieving the Loc for the first element failed.
+char mov_mdhd_language_map[][4] = {};
+int ff_mov_lang_to_iso639_code;
+char *ff_mov_lang_to_iso639_to;
+void ff_mov_lang_to_iso639() {
+  memcpy(ff_mov_lang_to_iso639_to,
+         mov_mdhd_language_map[ff_mov_lang_to_iso639_code], 4);
 }
author	Kristóf Umann <dkszelethus@gmail.com>	2024-07-04 13:46:22 +0200
committer	GitHub <noreply@github.com>	2024-07-04 13:46:22 +0200
commit	483557224b8d36761f39d5847e17ef7361757f1b (patch)
tree	cb4ce70453fc892ad86ea3bd9b06cab4faf4ae62 /clang/test
parent	d6af73e9fbc84315100499a096f17ec5eeeeea23 (diff)
download	llvm-483557224b8d36761f39d5847e17ef7361757f1b.zip llvm-483557224b8d36761f39d5847e17ef7361757f1b.tar.gz llvm-483557224b8d36761f39d5847e17ef7361757f1b.tar.bz2