aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Raiki Tamura <tamaron1203@gmail.com> 2023-07-04 18:21:48 +0900
committer Philip Herron <philip.herron@embecosm.com> 2023-07-06 16:12:19 +0000
commit 46a61f02f12d697c6f25eb03371bbb22a5b8504a (patch)
tree f067431b0d38e15aeb6716209838557c8387a84b
parent 5e735e9aaa7d9c5fdab1796ace3045c6f2f7badf (diff)
download gcc-46a61f02f12d697c6f25eb03371bbb22a5b8504a.zip
gcc-46a61f02f12d697c6f25eb03371bbb22a5b8504a.tar.gz
gcc-46a61f02f12d697c6f25eb03371bbb22a5b8504a.tar.bz2
gccrs: add utf-8 validation for input source
gcc/rust/ChangeLog:

	* lex/rust-lex.cc (Lexer::input_source_is_valid_utf8): New method of `Lexer`.
	* lex/rust-lex.h: Likewise.
	* rust-session-manager.cc (Session::compile_crate): Add error.

gcc/testsuite/ChangeLog:

	* rust/compile/broken_utf8.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
-rw-r--r-- gcc/rust/lex/rust-lex.cc 6
-rw-r--r-- gcc/rust/lex/rust-lex.h 15
-rw-r--r-- gcc/rust/rust-session-manager.cc 8
-rw-r--r-- gcc/testsuite/rust/compile/broken_utf8.rs 2
4 files changed, 24 insertions, 7 deletions
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index 3593ee0..19bf5c4 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -167,6 +167,12 @@ Lexer::~Lexer ()
// line_map->stop();
}
+bool
+Lexer::input_source_is_valid_utf8 ()
+{
+ return raw_input_source->is_valid ();
+}
+
/* TODO: need to optimise somehow to avoid the virtual function call in the
* tight loop. Best idea at the moment is CRTP, but that might make lexer
* implementation annoying when storing the "base class" (i.e. would need
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index 4355394..2034fd7 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -175,6 +175,8 @@ public:
Lexer (Lexer &&other) = default;
Lexer &operator= (Lexer &&other) = default;
+ bool input_source_is_valid_utf8 ();
+
// Returns token n tokens ahead of current position.
const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
// Peeks the current token.
@@ -217,9 +219,9 @@ public:
Codepoint next_codepoint ()
{
- uint8_t input = next_byte ();
+ uint32_t input = next_byte ();
- if ((int8_t) input == EOF)
+ if ((int32_t) input == EOF)
return Codepoint::eof ();
else if (input < 128)
{
@@ -246,11 +248,13 @@ public:
// 3 bytes or UTF-8 BOM
uint8_t input2 = next_byte ();
// If the second byte is equal to 0xBB then the input is no longer a
- // valid UTF-8 char.
+ // valid UTF-8 char. Then, we check if the third byte makes up a UTF
+ // BOM.
if (input == 0xEF && input2 == 0xBB)
{
uint8_t input3 = next_byte ();
if (input3 == 0xBF)
+ // found BOM
return next_codepoint ();
else
return {0xFFFE};
@@ -289,8 +293,6 @@ public:
}
else
{
- // rust_error_at (get_current_location (),
- // "invalid UTF-8 [SECND] (too long)");
return {0xFFFE};
}
}
@@ -362,8 +364,7 @@ public:
{
if (offs >= buffer.size ())
return EOF;
-
- return buffer.at (offs++);
+ return (uint8_t) buffer.at (offs++);
}
public:
diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc
index 08e8cdb..0fe6aa5 100644
--- a/gcc/rust/rust-session-manager.cc
+++ b/gcc/rust/rust-session-manager.cc
@@ -497,6 +497,14 @@ Session::compile_crate (const char *filename)
Lexer lex (filename, std::move (file_wrap), linemap, dump_lex_opt);
+ if (!lex.input_source_is_valid_utf8 ())
+ {
+ rust_error_at (Linemap::unknown_location (),
+ "cannot read %s; stream did not contain valid UTF-8",
+ filename);
+ return;
+ }
+
Parser<Lexer> parser (lex);
// generate crate from parser
diff --git a/gcc/testsuite/rust/compile/broken_utf8.rs b/gcc/testsuite/rust/compile/broken_utf8.rs
new file mode 100644
index 0000000..8053b83
--- /dev/null
+++ b/gcc/testsuite/rust/compile/broken_utf8.rs
@@ -0,0 +1,2 @@
+// { dg-excess-errors "stream did not contain valid UTF-8" }
+ÿ \ No newline at end of file