Handle shebang line, plus any whitespace and comment skipping in lexer

The lexer tried to handle the shebang line but used loc directly, instead of the current_column. And it assumed a '/' should immediately follow the "#!". But if the "#!" is followed by whitespace and/or comments and a '[' character, then the first line isn't seen as a shebang line (even if the kernel or shell would) but as the start of an inner attribute. Add various tests for when the first line starting with "#!" is seen as a shebang line (and should be skipped). And some tests where there is a '[' character following some whitespace and/or comments and the "#!" is seen as part of an inner attribute.
author: Mark Wielaard <mark@klomp.org> 2021-07-04 23:22:32 +0200
committer: Mark Wielaard <mark@klomp.org> 2021-07-04 23:56:06 +0200
commit: ff35f162daebd1ac6538aa74c0f270f8e19902de (patch)
tree: 487d1b338b5a4060bbcb605d0c9e298475d1587a /gcc/rust
parent: 210ae4f7b0fea9671482b8f01354fd5b9274f878 (diff)
download: gcc-ff35f162daebd1ac6538aa74c0f270f8e19902de.zip
gcc-ff35f162daebd1ac6538aa74c0f270f8e19902de.tar.gz
gcc-ff35f162daebd1ac6538aa74c0f270f8e19902de.tar.bz2
1 files changed, 60 insertions, 19 deletions
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index d138416..ebd69de 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -237,28 +237,63 @@ Lexer::build_token ()
       current_char = peek_input ();
       skip_input ();
 
-      // return end of file token if end of file
-      if (current_char == EOF)
-	return Token::make (END_OF_FILE, loc);
-
       // detect shebang
-      if (loc == 1 && current_line == 1 && current_char == '#')
+      // Must be the first thing on the first line, starting with #!
+      // But since an attribute can also start with an #! we don't count it as a
+      // shebang line when after any whitespace or comments there is a [. If it
+      // is a shebang line we simple drop the line. Otherwise we don't consume
+      // any characters and fall through to the real tokenizer.
+      if (current_line == 1 && current_column == 1 && current_char == '#'
+	  && peek_input () == '!')
 	{
-	  current_char = peek_input ();
-
-	  if (current_char == '!')
+	  int n = 1;
+	  while (true)
 	    {
-	      skip_input ();
-	      current_char = peek_input ();
-
-	      if (current_char == '/')
+	      int next_char = peek_input (n);
+	      if (is_whitespace (next_char))
+		n++;
+	      else if (next_char == '/' && peek_input (n + 1) == '/')
 		{
-		  // definitely shebang
-
-		  skip_input ();
-
-		  // ignore rest of line
-		  while (current_char != '\n')
+		  // A single line comment
+		  n += 2;
+		  next_char = peek_input (n);
+		  while (next_char != '\n' && next_char != EOF)
+		    {
+		      n++;
+		      next_char = peek_input (n);
+		    }
+		  if (next_char == '\n')
+		    n++;
+		}
+	      else if (next_char == '/' && peek_input (n + 1) == '*')
+		{
+		  // Start of a block comment
+		  n += 2;
+		  int level = 1;
+		  while (level > 0)
+		    {
+		      if (peek_input (n) == EOF)
+			break;
+		      else if (peek_input (n) == '/'
+			       && peek_input (n + 1) == '*')
+			{
+			  n += 2;
+			  level += 1;
+			}
+		      else if (peek_input (n) == '*'
+			       && peek_input (n + 1) == '/')
+			{
+			  n += 2;
+			  level -= 1;
+			}
+		      else
+			n++;
+		    }
+		}
+	      else if (next_char != '[')
+		{
+		  // definitely shebang, ignore the first line
+		  while (current_char != '\n' && current_char != EOF)
 		    {
 		      current_char = peek_input ();
 		      skip_input ();
@@ -269,11 +304,17 @@ Lexer::build_token ()
 		  current_column = 1;
 		  // tell line_table that new line starts
 		  line_map->start_line (current_line, max_column_hint);
-		  continue;
+		  break;
 		}
+	      else
+		break; /* Definitely not a shebang line. */
 	    }
 	}
 
+      // return end of file token if end of file
+      if (current_char == EOF)
+	return Token::make (END_OF_FILE, loc);
+
       // if not end of file, start tokenising
       switch (current_char)
 	{
author	Mark Wielaard <mark@klomp.org>	2021-07-04 23:22:32 +0200
committer	Mark Wielaard <mark@klomp.org>	2021-07-04 23:56:06 +0200
commit	ff35f162daebd1ac6538aa74c0f270f8e19902de (patch)
tree	487d1b338b5a4060bbcb605d0c9e298475d1587a /gcc/rust
parent	210ae4f7b0fea9671482b8f01354fd5b9274f878 (diff)
download	gcc-ff35f162daebd1ac6538aa74c0f270f8e19902de.zip gcc-ff35f162daebd1ac6538aa74c0f270f8e19902de.tar.gz gcc-ff35f162daebd1ac6538aa74c0f270f8e19902de.tar.bz2