ast: Add new AttributeChecker visitor

This commit adds a new attribute checker visitor, whose role is to validate builtin attributes and their inputs. To validate doc(alias) strings properly, and to handle multiline strings and byte strings, it also fixes the lexer so that it handles EOF correctly when reading bytes and codepoints.
author: Arthur Cohen <arthur.cohen@embecosm.com> 2022-07-20 15:28:23 +0200
committer: Arthur Cohen <arthur.cohen@embecosm.com> 2022-07-21 11:16:19 +0200
commit: 8dc692afc2c86cbf3b5124484dd2596514a5acf3 (patch)
tree: 9bc01a3484c3786743ab0d4a65cfca4dae95321e /gcc/rust/lex
parent: 137cd3bbaa05038b01c46f7ac7472da7ca662ed7 (diff)
download: gcc-8dc692afc2c86cbf3b5124484dd2596514a5acf3.zip
gcc-8dc692afc2c86cbf3b5124484dd2596514a5acf3.tar.gz
gcc-8dc692afc2c86cbf3b5124484dd2596514a5acf3.tar.bz2
2 files changed, 33 insertions, 20 deletions
diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h
index cdadfcd..22da080 100644
--- a/gcc/rust/lex/rust-codepoint.h
+++ b/gcc/rust/lex/rust-codepoint.h
@@ -32,11 +32,13 @@ struct Codepoint
   // Creates a codepoint from an encoded UTF-8 value.
   Codepoint (uint32_t value) : value (value) {}
 
+  static Codepoint eof () { return Codepoint (UINT32_MAX); }
+  bool is_eof () const { return value == UINT32_MAX; }
+
   // Returns a C++ string containing string value of codepoint.
   std::string as_string ();
 
   bool operator== (Codepoint other) const { return value == other.value; }
-
   bool operator!= (Codepoint other) const { return !operator== (other); }
 };
 } // namespace Rust
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index ecf151d..70e6b50 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -1696,7 +1696,7 @@ Lexer::parse_byte_string (Location loc)
   int length = 1;
   current_char = peek_input ();
 
-  while (current_char != '"' && current_char != '\n')
+  while (current_char != '"' && current_char != EOF)
     {
       if (current_char == '\\')
 	{
@@ -1723,17 +1723,18 @@ Lexer::parse_byte_string (Location loc)
 
   current_column += length;
 
-  if (current_char == '\n')
-    {
-      rust_error_at (get_current_location (), "unended byte string literal");
-    }
-  else if (current_char == '"')
+  if (current_char == '"')
     {
       current_column++;
 
       skip_input ();
       current_char = peek_input ();
     }
+  else if (current_char == EOF)
+    {
+      rust_error_at (get_current_location (), "unended byte string literal");
+      return Token::make (END_OF_FILE, get_current_location ());
+    }
   else
     {
       gcc_unreachable ();
@@ -1917,7 +1918,8 @@ Lexer::parse_string (Location loc)
   int length = 1;
   current_char32 = peek_codepoint_input ();
 
-  while (current_char32.value != '\n' && current_char32.value != '"')
+  // FIXME: This fails if the input ends. How do we check for EOF?
+  while (current_char32.value != '"' && !current_char32.is_eof ())
     {
       if (current_char32.value == '\\')
 	{
@@ -1949,20 +1951,18 @@ Lexer::parse_string (Location loc)
 
   current_column += length;
 
-  if (current_char32.value == '\n')
-    {
-      rust_error_at (get_current_location (), "unended string literal");
-      // by this point, the parser will stuck at this position due to
-      // undetermined string termination. we now need to unstuck the parser
-      skip_broken_string_input (current_char32.value);
-    }
-  else if (current_char32.value == '"')
+  if (current_char32.value == '"')
     {
       current_column++;
 
       skip_input ();
       current_char = peek_input ();
     }
+  else if (current_char32.is_eof ())
+    {
+      rust_error_at (get_current_location (), "unended string literal");
+      return Token::make (END_OF_FILE, get_current_location ());
+    }
   else
     {
       gcc_unreachable ();
@@ -2046,7 +2046,7 @@ Lexer::parse_raw_string (Location loc, int initial_hash_count)
   skip_input ();
   Codepoint current_char32 = peek_codepoint_input ();
 
-  while (true)
+  while (!current_char32.is_eof ())
     {
       if (current_char32.value == '"')
 	{
@@ -2318,6 +2318,8 @@ Lexer::parse_char_or_lifetime (Location loc)
   int length = 1;
 
   current_char32 = peek_codepoint_input ();
+  if (current_char32.is_eof ())
+    return nullptr;
 
   // parse escaped char literal
   if (current_char32.value == '\\')
@@ -2398,6 +2400,9 @@ Lexer::get_input_codepoint_length ()
 {
   uint8_t input = peek_input ();
 
+  if ((int8_t) input == EOF)
+    return 0;
+
   if (input < 128)
     {
       // ascii -- 1 byte
@@ -2467,7 +2472,8 @@ Lexer::get_input_codepoint_length ()
     }
   else
     {
-      rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+      rust_error_at (get_current_location (),
+		     "invalid UTF-8 [FIRST] (too long)");
       return 0;
     }
 }
@@ -2478,6 +2484,9 @@ Lexer::peek_codepoint_input ()
 {
   uint8_t input = peek_input ();
 
+  if ((int8_t) input == EOF)
+    return Codepoint::eof ();
+
   if (input < 128)
     {
       // ascii -- 1 byte
@@ -2534,7 +2543,8 @@ Lexer::peek_codepoint_input ()
     }
   else
     {
-      rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+      rust_error_at (get_current_location (),
+		     "invalid UTF-8 [SECND] (too long)");
       return {0xFFFE};
     }
 }
@@ -2620,7 +2630,8 @@ Lexer::test_get_input_codepoint_n_length (int n_start_offset)
     }
   else
     {
-      rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+      rust_error_at (get_current_location (),
+		     "invalid UTF-8 [THIRD] (too long)");
       return 0;
     }
 }
author	Arthur Cohen <arthur.cohen@embecosm.com>	2022-07-20 15:28:23 +0200
committer	Arthur Cohen <arthur.cohen@embecosm.com>	2022-07-21 11:16:19 +0200
commit	8dc692afc2c86cbf3b5124484dd2596514a5acf3 (patch)
tree	9bc01a3484c3786743ab0d4a65cfca4dae95321e /gcc/rust/lex
parent	137cd3bbaa05038b01c46f7ac7472da7ca662ed7 (diff)
download	gcc-8dc692afc2c86cbf3b5124484dd2596514a5acf3.zip gcc-8dc692afc2c86cbf3b5124484dd2596514a5acf3.tar.gz gcc-8dc692afc2c86cbf3b5124484dd2596514a5acf3.tar.bz2