diff options
author | Raiki Tamura <tamaron1203@gmail.com> | 2023-07-04 18:21:48 +0900 |
---|---|---|
committer | Philip Herron <philip.herron@embecosm.com> | 2023-07-06 16:12:19 +0000 |
commit | 46a61f02f12d697c6f25eb03371bbb22a5b8504a (patch) | |
tree | f067431b0d38e15aeb6716209838557c8387a84b | |
parent | 5e735e9aaa7d9c5fdab1796ace3045c6f2f7badf (diff) | |
download | gcc-46a61f02f12d697c6f25eb03371bbb22a5b8504a.zip gcc-46a61f02f12d697c6f25eb03371bbb22a5b8504a.tar.gz gcc-46a61f02f12d697c6f25eb03371bbb22a5b8504a.tar.bz2 |
gccrs: add utf-8 validation for input source
gcc/rust/ChangeLog:
* lex/rust-lex.cc (Lexer::input_source_is_valid_utf8): New method of `Lexer`.
* lex/rust-lex.h: Likewise.
* rust-session-manager.cc (Session::compile_crate): Add error.
gcc/testsuite/ChangeLog:
* rust/compile/broken_utf8.rs: New test.
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
-rw-r--r-- | gcc/rust/lex/rust-lex.cc | 6 |
-rw-r--r-- | gcc/rust/lex/rust-lex.h | 15 |
-rw-r--r-- | gcc/rust/rust-session-manager.cc | 8 |
-rw-r--r-- | gcc/testsuite/rust/compile/broken_utf8.rs | 2 |
4 files changed, 24 insertions, 7 deletions
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc index 3593ee0..19bf5c4 100644 --- a/gcc/rust/lex/rust-lex.cc +++ b/gcc/rust/lex/rust-lex.cc @@ -167,6 +167,12 @@ Lexer::~Lexer () // line_map->stop(); } +bool +Lexer::input_source_is_valid_utf8 () +{ + return raw_input_source->is_valid (); +} + /* TODO: need to optimise somehow to avoid the virtual function call in the * tight loop. Best idea at the moment is CRTP, but that might make lexer * implementation annoying when storing the "base class" (i.e. would need diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h index 4355394..2034fd7 100644 --- a/gcc/rust/lex/rust-lex.h +++ b/gcc/rust/lex/rust-lex.h @@ -175,6 +175,8 @@ public: Lexer (Lexer &&other) = default; Lexer &operator= (Lexer &&other) = default; + bool input_source_is_valid_utf8 (); + // Returns token n tokens ahead of current position. const_TokenPtr peek_token (int n) { return token_queue.peek (n); } // Peeks the current token. @@ -217,9 +219,9 @@ public: Codepoint next_codepoint () { - uint8_t input = next_byte (); + uint32_t input = next_byte (); - if ((int8_t) input == EOF) + if ((int32_t) input == EOF) return Codepoint::eof (); else if (input < 128) { @@ -246,11 +248,13 @@ public: // 3 bytes or UTF-8 BOM uint8_t input2 = next_byte (); // If the second byte is equal to 0xBB then the input is no longer a - // valid UTF-8 char. + // valid UTF-8 char. Then, we check if the third byte makes up a UTF + // BOM. 
if (input == 0xEF && input2 == 0xBB) { uint8_t input3 = next_byte (); if (input3 == 0xBF) + // found BOM return next_codepoint (); else return {0xFFFE}; @@ -289,8 +293,6 @@ public: } else { - // rust_error_at (get_current_location (), - // "invalid UTF-8 [SECND] (too long)"); return {0xFFFE}; } } @@ -362,8 +364,7 @@ public: { if (offs >= buffer.size ()) return EOF; - - return buffer.at (offs++); + return (uint8_t) buffer.at (offs++); } public: diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc index 08e8cdb..0fe6aa5 100644 --- a/gcc/rust/rust-session-manager.cc +++ b/gcc/rust/rust-session-manager.cc @@ -497,6 +497,14 @@ Session::compile_crate (const char *filename) Lexer lex (filename, std::move (file_wrap), linemap, dump_lex_opt); + if (!lex.input_source_is_valid_utf8 ()) + { + rust_error_at (Linemap::unknown_location (), + "cannot read %s; stream did not contain valid UTF-8", + filename); + return; + } + Parser<Lexer> parser (lex); // generate crate from parser diff --git a/gcc/testsuite/rust/compile/broken_utf8.rs b/gcc/testsuite/rust/compile/broken_utf8.rs new file mode 100644 index 0000000..8053b83 --- /dev/null +++ b/gcc/testsuite/rust/compile/broken_utf8.rs @@ -0,0 +1,2 @@ +// { dg-excess-errors "stream did not contain valid UTF-8" } +ÿ
\ No newline at end of file