Fix lexer to not produce bad unicode escape values

There were a couple of issues in the lexer unicode escape code. Unicode escape sequences must always start with an opening curly bracket (and end with a closing one). Underscores are not allowed as starting character. And the produced values must be unicode scalar values, which excludes surrogate values (D800 to DFFF) or values larger than 10FFFF. Also try to recover more gracefully from errors by trying to skip past any bad characters to the end of the escape sequence. Test all of the above in a new testcase unicode_escape.rs.
author: Mark Wielaard <mark@klomp.org> 2021-10-02 22:50:52 +0200
committer: Mark Wielaard <mark@klomp.org> 2021-10-02 23:03:54 +0200
commit: 23475a8131d2bd284758bcab85fa1c5bb98f1010 (patch)
tree: 49aa28db4ebc157b521ea49547de2a5b21ec6af5 /gcc/rust/lex/rust-lex.cc
parent: fdcad086e134b889ba542fadc1150bb2fcef8aea (diff)
download: gcc-23475a8131d2bd284758bcab85fa1c5bb98f1010.zip
gcc-23475a8131d2bd284758bcab85fa1c5bb98f1010.tar.gz
gcc-23475a8131d2bd284758bcab85fa1c5bb98f1010.tar.bz2
1 files changed, 72 insertions, 16 deletions
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index bbddea0..2b3c89b 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -1273,6 +1273,8 @@ Lexer::parse_escape (char opening_char)
       rust_error_at (get_current_location (),
 		     "cannot have a unicode escape \\u in a byte %s",
 		     opening_char == '\'' ? "character" : "string");
+      // Try to parse it anyway, just to skip it
+      parse_partial_unicode_escape ();
       return std::make_tuple (output_char, additional_length_offset, false);
     case '\r':
     case '\n':
@@ -1461,16 +1463,34 @@ Lexer::parse_partial_unicode_escape ()
 {
   skip_input ();
   current_char = peek_input ();
-  int additional_length_offset = 1;
+  int additional_length_offset = 0;
 
-  bool need_close_brace = false;
-  if (current_char == '{')
+  if (current_char != '{')
     {
-      need_close_brace = true;
+      rust_error_at (get_current_location (),
+		     "unicode escape should start with %<{%>");
+      /* Skip what should probaby have been between brackets.  */
+      while (is_x_digit (current_char) || current_char == '_')
+	{
+	  skip_input ();
+	  current_char = peek_input ();
+	  additional_length_offset++;
+	}
+      return std::make_pair (Codepoint (0), additional_length_offset);
+    }
 
+  skip_input ();
+  current_char = peek_input ();
+  additional_length_offset++;
+
+  if (current_char == '_')
+    {
+      rust_error_at (get_current_location (),
+		     "unicode escape cannot start with %<_%>");
       skip_input ();
       current_char = peek_input ();
       additional_length_offset++;
+      // fallthrough and try to parse the rest anyway
     }
 
   // parse unicode escape - 1-6 hex digits
@@ -1500,21 +1520,45 @@ Lexer::parse_partial_unicode_escape ()
       current_char = peek_input ();
     }
 
-  // ensure closing brace if required
-  if (need_close_brace)
+  if (current_char == '}')
     {
-      if (current_char == '}')
+      skip_input ();
+      current_char = peek_input ();
+      additional_length_offset++;
+    }
+  else
+    {
+      // actually an error, but allow propagation anyway Assume that
+      // wrong bracketm whitespace or single/double quotes are wrong
+      // termination, otherwise it is a wrong character, then skip to the actual
+      // terminator.
+      if (current_char == '{' || is_whitespace (current_char)
+	  || current_char == '\'' || current_char == '"')
 	{
-	  skip_input ();
-	  current_char = peek_input ();
-	  additional_length_offset++;
+	  rust_error_at (get_current_location (),
+			 "expected terminating %<}%> in unicode escape");
+	  return std::make_pair (Codepoint (0), additional_length_offset);
 	}
       else
 	{
-	  // actually an error, but allow propagation anyway
 	  rust_error_at (get_current_location (),
-			 "expected terminating %<}%> in unicode escape");
-	  // return false;
+			 "invalid character %<%c%> in unicode escape",
+			 current_char);
+	  while (current_char != '}' && current_char != '{'
+		 && !is_whitespace (current_char) && current_char != '\''
+		 && current_char != '"')
+	    {
+	      skip_input ();
+	      current_char = peek_input ();
+	      additional_length_offset++;
+	    }
+	  // Consume the actual closing bracket if found
+	  if (current_char == '}')
+	    {
+	      skip_input ();
+	      current_char = peek_input ();
+	      additional_length_offset++;
+	    }
 	  return std::make_pair (Codepoint (0), additional_length_offset);
 	}
     }
@@ -1530,10 +1574,22 @@ Lexer::parse_partial_unicode_escape ()
       return std::make_pair (Codepoint (0), additional_length_offset);
     }
 
-  long hex_num = std::strtol (num_str.c_str (), nullptr, 16);
+  unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
 
-  // assert fits a uint32_t
-  gcc_assert (hex_num < 4294967296);
+  if (hex_num > 0xd7ff && hex_num < 0xe000)
+    {
+      rust_error_at (
+	get_current_location (),
+	"unicode escape cannot be a surrogate value (D800 to DFFF)");
+      return std::make_pair (Codepoint (0), additional_length_offset);
+    }
+
+  if (hex_num > 0x10ffff)
+    {
+      rust_error_at (get_current_location (),
+		     "unicode escape cannot be larger than 10FFFF");
+      return std::make_pair (Codepoint (0), additional_length_offset);
+    }
 
   // return true;
   return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
author	Mark Wielaard <mark@klomp.org>	2021-10-02 22:50:52 +0200
committer	Mark Wielaard <mark@klomp.org>	2021-10-02 23:03:54 +0200
commit	23475a8131d2bd284758bcab85fa1c5bb98f1010 (patch)
tree	49aa28db4ebc157b521ea49547de2a5b21ec6af5 /gcc/rust/lex/rust-lex.cc
parent	fdcad086e134b889ba542fadc1150bb2fcef8aea (diff)
download	gcc-23475a8131d2bd284758bcab85fa1c5bb98f1010.zip gcc-23475a8131d2bd284758bcab85fa1c5bb98f1010.tar.gz gcc-23475a8131d2bd284758bcab85fa1c5bb98f1010.tar.bz2