libcpp: Add -Winvalid-utf8 warning [PR106655]

The following patch introduces a new warning - -Winvalid-utf8 similarly to what clang now has - to diagnose invalid UTF-8 byte sequences in comments, but not just in those, but also in string/character literals and outside of them. The warning is on by default when explicit -finput-charset=UTF-8 is used and C++23 compilation is requested and if -{,W}pedantic or -pedantic-errors it is actually a pedwarn. The reason it is on by default only for -finput-charset=UTF-8 is that the sources often are UTF-8, but sometimes could be some ASCII compatible single byte encoding where non-ASCII characters only appear in comments. So having the warning off by default is IMO desirable. The C++23 pedantic mode for when the source code is UTF-8 is -std=c++23 -pedantic-errors -finput-charset=UTF-8. 2022-09-01 Jakub Jelinek <jakub@redhat.com> PR c++/106655 libcpp/ * include/cpplib.h (struct cpp_options): Implement C++23 P2295R6 - Support for UTF-8 as a portable source file encoding. Add cpp_warn_invalid_utf8 and cpp_input_charset_explicit fields. (enum cpp_warning_reason): Add CPP_W_INVALID_UTF8 enumerator. * init.cc (cpp_create_reader): Initialize cpp_warn_invalid_utf8 and cpp_input_charset_explicit. * charset.cc (_cpp_valid_utf8): Adjust function comment. * lex.cc (UCS_LIMIT): Define. (utf8_continuation): New const variable. (utf8_signifier): Move earlier in the file. (_cpp_warn_invalid_utf8, _cpp_handle_multibyte_utf8): New functions. (_cpp_skip_block_comment): Handle -Winvalid-utf8 warning. (skip_line_comment): Likewise. (lex_raw_string, lex_string): Likewise. (_cpp_lex_direct): Likewise. gcc/ * doc/invoke.texi (-Winvalid-utf8): Document it. gcc/c-family/ * c.opt (-Winvalid-utf8): New warning. * c-opts.cc (c_common_handle_option) <case OPT_finput_charset_>: Set cpp_opts->cpp_input_charset_explicit. (c_common_post_options): If -finput-charset=UTF-8 is explicit in C++23, enable -Winvalid-utf8 by default and if -pedantic or -pedantic-errors, make it a pedwarn. gcc/testsuite/ * c-c++-common/cpp/Winvalid-utf8-1.c: New test. * c-c++-common/cpp/Winvalid-utf8-2.c: New test. * c-c++-common/cpp/Winvalid-utf8-3.c: New test. * g++.dg/cpp23/Winvalid-utf8-1.C: New test. * g++.dg/cpp23/Winvalid-utf8-2.C: New test. * g++.dg/cpp23/Winvalid-utf8-3.C: New test. * g++.dg/cpp23/Winvalid-utf8-4.C: New test. * g++.dg/cpp23/Winvalid-utf8-5.C: New test. * g++.dg/cpp23/Winvalid-utf8-6.C: New test. * g++.dg/cpp23/Winvalid-utf8-7.C: New test. * g++.dg/cpp23/Winvalid-utf8-8.C: New test. * g++.dg/cpp23/Winvalid-utf8-9.C: New test. * g++.dg/cpp23/Winvalid-utf8-10.C: New test. * g++.dg/cpp23/Winvalid-utf8-11.C: New test. * g++.dg/cpp23/Winvalid-utf8-12.C: New test.
author: Jakub Jelinek <jakub@redhat.com> 2022-09-01 09:48:01 +0200
committer: Jakub Jelinek <jakub@redhat.com> 2022-09-01 09:56:44 +0200
commit: 0b8c57ed40f19086e30ce54faec3222ac21cc0df (patch)
tree: 1ce3aa0f19ef45a7d2c03e272d1d8f835bb7f0b6 /libcpp/lex.cc
parent: bdfe0d1ce0aebdb68b77e2c04a0f45956c56b449 (diff)
download: gcc-0b8c57ed40f19086e30ce54faec3222ac21cc0df.zip
gcc-0b8c57ed40f19086e30ce54faec3222ac21cc0df.tar.gz
gcc-0b8c57ed40f19086e30ce54faec3222ac21cc0df.tar.bz2
1 files changed, 183 insertions, 26 deletions
diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index 528d598..41f905de 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -50,6 +50,9 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
 
+/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
+#define UCS_LIMIT 0x10FFFF
+
 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
 static int skip_line_comment (cpp_reader *);
 static void skip_whitespace (cpp_reader *, cppchar_t);
@@ -1704,6 +1707,120 @@ maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
   bidi::on_char (kind, ucn_p, loc);
 }
 
+static const cppchar_t utf8_continuation = 0x80;
+static const cppchar_t utf8_signifier = 0xC0;
+
+/* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
+   at PFILE->buffer->cur.  Return a pointer after the diagnosed
+   invalid character.  */
+
+static const uchar *
+_cpp_warn_invalid_utf8 (cpp_reader *pfile)
+{
+  cpp_buffer *buffer = pfile->buffer;
+  const uchar *cur = buffer->cur;
+  bool pedantic = (CPP_PEDANTIC (pfile)
+		   && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
+
+  if (cur[0] < utf8_signifier
+      || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
+    {
+      if (pedantic)
+	cpp_error_with_line (pfile, CPP_DL_PEDWARN,
+			     pfile->line_table->highest_line,
+			     CPP_BUF_COL (buffer),
+			     "invalid UTF-8 character <%x>",
+			     cur[0]);
+      else
+	cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
+			       pfile->line_table->highest_line,
+			       CPP_BUF_COL (buffer),
+			       "invalid UTF-8 character <%x>",
+			       cur[0]);
+      return cur + 1;
+    }
+  else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
+    {
+      if (pedantic)
+	cpp_error_with_line (pfile, CPP_DL_PEDWARN,
+			     pfile->line_table->highest_line,
+			     CPP_BUF_COL (buffer),
+			     "invalid UTF-8 character <%x><%x>",
+			     cur[0], cur[1]);
+      else
+	cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
+			       pfile->line_table->highest_line,
+			       CPP_BUF_COL (buffer),
+			       "invalid UTF-8 character <%x><%x>",
+			       cur[0], cur[1]);
+      return cur + 2;
+    }
+  else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
+    {
+      if (pedantic)
+	cpp_error_with_line (pfile, CPP_DL_PEDWARN,
+			     pfile->line_table->highest_line,
+			     CPP_BUF_COL (buffer),
+			     "invalid UTF-8 character <%x><%x><%x>",
+			     cur[0], cur[1], cur[2]);
+      else
+	cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
+			       pfile->line_table->highest_line,
+			       CPP_BUF_COL (buffer),
+			       "invalid UTF-8 character <%x><%x><%x>",
+			       cur[0], cur[1], cur[2]);
+      return cur + 3;
+    }
+  else
+    {
+      if (pedantic)
+	cpp_error_with_line (pfile, CPP_DL_PEDWARN,
+			     pfile->line_table->highest_line,
+			     CPP_BUF_COL (buffer),
+			     "invalid UTF-8 character <%x><%x><%x><%x>",
+			     cur[0], cur[1], cur[2], cur[3]);
+      else
+	cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
+			       pfile->line_table->highest_line,
+			       CPP_BUF_COL (buffer),
+			       "invalid UTF-8 character <%x><%x><%x><%x>",
+			       cur[0], cur[1], cur[2], cur[3]);
+      return cur + 4;
+    }
+}
+
+/* Helper function of *skip_*_comment and lex*_string.  For C,
+   character at CUR[-1] with MSB set handle -Wbidi-chars* and
+   -Winvalid-utf8 diagnostics and return pointer to first character
+   that should be processed next.  */
+
+static inline const uchar *
+_cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
+			    const uchar *cur, bool warn_bidi_p,
+			    bool warn_invalid_utf8_p)
+{
+  /* If this is a beginning of a UTF-8 encoding, it might be
+     a bidirectional control character.  */
+  if (c == bidi::utf8_start && warn_bidi_p)
+    {
+      location_t loc;
+      bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
+      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
+    }
+  if (!warn_invalid_utf8_p)
+    return cur;
+  if (c >= utf8_signifier)
+    {
+      cppchar_t s;
+      const uchar *pstr = cur - 1;
+      if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
+	  && s <= UCS_LIMIT)
+	return pstr;
+    }
+  pfile->buffer->cur = cur - 1;
+  return _cpp_warn_invalid_utf8 (pfile);
+}
+
 /* Skip a C-style block comment.  We find the end of the comment by
    seeing if an asterisk is before every '/' we encounter.  Returns
    nonzero if comment terminated by EOF, zero otherwise.
@@ -1716,6 +1833,8 @@ _cpp_skip_block_comment (cpp_reader *pfile)
   const uchar *cur = buffer->cur;
   uchar c;
   const bool warn_bidi_p = pfile->warn_bidi_p ();
+  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
+  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
 
   cur++;
   if (*cur == '/')
@@ -1765,14 +1884,10 @@ _cpp_skip_block_comment (cpp_reader *pfile)
 
 	  cur = buffer->cur;
 	}
-      /* If this is a beginning of a UTF-8 encoding, it might be
-	 a bidirectional control character.  */
-      else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
-	{
-	  location_t loc;
-	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
-	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
-	}
+      else if (__builtin_expect (c >= utf8_continuation, 0)
+	       && warn_bidi_or_invalid_utf8_p)
+	cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
+					  warn_invalid_utf8_p);
     }
 
   buffer->cur = cur;
@@ -1789,11 +1904,13 @@ skip_line_comment (cpp_reader *pfile)
   cpp_buffer *buffer = pfile->buffer;
   location_t orig_line = pfile->line_table->highest_line;
   const bool warn_bidi_p = pfile->warn_bidi_p ();
+  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
+  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
 
-  if (!warn_bidi_p)
+  if (!warn_bidi_or_invalid_utf8_p)
     while (*buffer->cur != '\n')
       buffer->cur++;
-  else
+  else if (!warn_invalid_utf8_p)
     {
       while (*buffer->cur != '\n'
 	     && *buffer->cur != bidi::utf8_start)
@@ -1813,6 +1930,22 @@ skip_line_comment (cpp_reader *pfile)
 	  maybe_warn_bidi_on_close (pfile, buffer->cur);
 	}
     }
+  else
+    {
+      while (*buffer->cur != '\n')
+	{
+	  if (*buffer->cur < utf8_continuation)
+	    {
+	      buffer->cur++;
+	      continue;
+	    }
+	  buffer->cur
+	    = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
+					  warn_bidi_p, warn_invalid_utf8_p);
+	}
+      if (warn_bidi_p)
+	maybe_warn_bidi_on_close (pfile, buffer->cur);
+    }
 
   _cpp_process_line_notes (pfile, true);
   return orig_line != pfile->line_table->highest_line;
@@ -1919,8 +2052,6 @@ warn_about_normalization (cpp_reader *pfile,
     }
 }
 
-static const cppchar_t utf8_signifier = 0xC0;
-
 /* Returns TRUE if the sequence starting at buffer->cur is valid in
    an identifier.  FIRST is TRUE if this starts an identifier.  */
 
@@ -2361,6 +2492,8 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 {
   const uchar *pos = base;
   const bool warn_bidi_p = pfile->warn_bidi_p ();
+  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
+  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
 
   /* 'tis a pity this information isn't passed down from the lexer's
      initial categorization of the token.  */
@@ -2597,13 +2730,10 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 	  pos = base = pfile->buffer->cur;
 	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
 	}
-      else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
-	       && warn_bidi_p)
-	{
-	  location_t loc;
-	  bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
-	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
-	}
+      else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
+	       && warn_bidi_or_invalid_utf8_p)
+	pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
+					  warn_invalid_utf8_p);
     }
 
   if (warn_bidi_p)
@@ -2704,6 +2834,8 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
     terminator = '>', type = CPP_HEADER_NAME;
 
   const bool warn_bidi_p = pfile->warn_bidi_p ();
+  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
+  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
   for (;;)
     {
       cppchar_t c = *cur++;
@@ -2745,12 +2877,10 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 	}
       else if (c == '\0')
 	saw_NUL = true;
-      else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
-	{
-	  location_t loc;
-	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
-	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
-	}
+      else if (__builtin_expect (c >= utf8_continuation, 0)
+	       && warn_bidi_or_invalid_utf8_p)
+	cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
+					  warn_invalid_utf8_p);
     }
 
   if (saw_NUL && !pfile->state.skipping)
@@ -4052,6 +4182,7 @@ _cpp_lex_direct (cpp_reader *pfile)
     default:
       {
 	const uchar *base = --buffer->cur;
+	static int no_warn_cnt;
 
 	/* Check for an extended identifier ($ or UCN or UTF-8).  */
 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
@@ -4072,7 +4203,33 @@ _cpp_lex_direct (cpp_reader *pfile)
 	    const uchar *pstr = base;
 	    cppchar_t s;
 	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
-	      buffer->cur = pstr;
+	      {
+		if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
+		  {
+		    buffer->cur = base;
+		    _cpp_warn_invalid_utf8 (pfile);
+		  }
+		buffer->cur = pstr;
+	      }
+	    else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
+	      {
+		buffer->cur = base;
+		const uchar *end = _cpp_warn_invalid_utf8 (pfile);
+		buffer->cur = base + 1;
+		no_warn_cnt = end - buffer->cur;
+	      }
+	  }
+	else if (c >= utf8_continuation
+		 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
+	  {
+	    if (no_warn_cnt)
+	      --no_warn_cnt;
+	    else
+	      {
+		buffer->cur = base;
+		_cpp_warn_invalid_utf8 (pfile);
+		buffer->cur = base + 1;
+	      }
 	  }
 	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
 	break;
author	Jakub Jelinek <jakub@redhat.com>	2022-09-01 09:48:01 +0200
committer	Jakub Jelinek <jakub@redhat.com>	2022-09-01 09:56:44 +0200
commit	0b8c57ed40f19086e30ce54faec3222ac21cc0df (patch)
tree	1ce3aa0f19ef45a7d2c03e272d1d8f835bb7f0b6 /libcpp/lex.cc
parent	bdfe0d1ce0aebdb68b77e2c04a0f45956c56b449 (diff)
download	gcc-0b8c57ed40f19086e30ce54faec3222ac21cc0df.zip gcc-0b8c57ed40f19086e30ce54faec3222ac21cc0df.tar.gz gcc-0b8c57ed40f19086e30ce54faec3222ac21cc0df.tar.bz2