aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorMark Wielaard <mark@klomp.org>2021-07-05 21:11:03 +0200
committerMark Wielaard <mark@klomp.org>2021-07-05 21:11:03 +0200
commit29192293fdce8b3cf4888bbe0d5f504655e26cd5 (patch)
treec79b390d50c717ad4e1abd2c262a7b53de69a467 /gcc
parent27b3d34428801397e562b7fcc5ca10b13961f3e1 (diff)
downloadgcc-29192293fdce8b3cf4888bbe0d5f504655e26cd5.zip
gcc-29192293fdce8b3cf4888bbe0d5f504655e26cd5.tar.gz
gcc-29192293fdce8b3cf4888bbe0d5f504655e26cd5.tar.bz2
Handle UTF-8 BOM in lexer
The very first thing in a Rust source file might be the optional UTF-8 BOM (Byte Order Mark), which is the 3 bytes 0xEF 0xBB 0xBF. These bytes can simply be skipped; they just mark the file as UTF-8. Add some testcases to show we now handle such files.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/rust/lex/rust-lex.cc13
-rw-r--r--gcc/testsuite/rust/compile/torture/bom.rs1
-rw-r--r--gcc/testsuite/rust/compile/torture/bom_comment.rs2
-rw-r--r--gcc/testsuite/rust/compile/torture/bom_shebang.rs2
-rw-r--r--gcc/testsuite/rust/compile/torture/bom_whitespace.rs2
5 files changed, 20 insertions, 0 deletions
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index ebd69de..617dd69 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -237,6 +237,19 @@ Lexer::build_token ()
current_char = peek_input ();
skip_input ();
+ // detect UTF8 bom
+ //
+ // Must be the first thing on the first line.
+ // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
+ // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
+ if (current_line == 1 && current_column == 1 && current_char == 0xef
+ && peek_input () == 0xbb && peek_input (1) == 0xbf)
+ {
+ skip_input (1);
+ current_char = peek_input ();
+ skip_input ();
+ }
+
// detect shebang
// Must be the first thing on the first line, starting with #!
// But since an attribute can also start with an #! we don't count it as a
diff --git a/gcc/testsuite/rust/compile/torture/bom.rs b/gcc/testsuite/rust/compile/torture/bom.rs
new file mode 100644
index 0000000..5edcab2
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom.rs
@@ -0,0 +1 @@
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_comment.rs b/gcc/testsuite/rust/compile/torture/bom_comment.rs
new file mode 100644
index 0000000..020e170
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_comment.rs
@@ -0,0 +1,2 @@
+// UTF8 BOM
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_shebang.rs b/gcc/testsuite/rust/compile/torture/bom_shebang.rs
new file mode 100644
index 0000000..4c552e8
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_shebang.rs
@@ -0,0 +1,2 @@
+#!/usr/bin/cat
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_whitespace.rs b/gcc/testsuite/rust/compile/torture/bom_whitespace.rs
new file mode 100644
index 0000000..b10d565
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_whitespace.rs
@@ -0,0 +1,2 @@
+
+pub fn main () { }