diff options
author | bors[bot] <26634292+bors[bot]@users.noreply.github.com> | 2021-07-06 07:04:19 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-07-06 07:04:19 +0000 |
commit | 630efc56d1059bac464b38807dab58cd322f7a7f (patch) | |
tree | 6e3a1c42920a2dd59c1509c92dba376e51a1ffa4 | |
parent | 27b3d34428801397e562b7fcc5ca10b13961f3e1 (diff) | |
parent | 94ada647bf35c727c17ade60d06af6803a7e5668 (diff) | |
download | gcc-630efc56d1059bac464b38807dab58cd322f7a7f.zip gcc-630efc56d1059bac464b38807dab58cd322f7a7f.tar.gz gcc-630efc56d1059bac464b38807dab58cd322f7a7f.tar.bz2 |
Merge #552
552: UTF-8 BOM handling r=dkm a=dkm
Mark Wielaard (https://gcc.gnu.org/pipermail/gcc-rust/2021-July/000072.html):
> A rust source file can start with a UTF-8 BOM sequence (EF BB
> BF). This simply indicates that the file is encoded as UTF-8 (all rust
> input is interpreted as a sequence of Unicode code points encoded in
> UTF-8) so can be skipped before starting real lexing.
>
> It isn't necessary to keep track of the BOM in the AST or HIR Crate
> classes. So I removed the has_utf8bom flag.
>
> Also included are a couple of simple tests to show we handle the BOM
> correctly now.
Co-authored-by: Mark Wielaard <mark@klomp.org>
-rw-r--r-- | gcc/rust/ast/rust-ast-full-test.cc | 3 | ||||
-rw-r--r-- | gcc/rust/ast/rust-ast.h | 11 | ||||
-rw-r--r-- | gcc/rust/hir/rust-ast-lower.cc | 4 | ||||
-rw-r--r-- | gcc/rust/hir/tree/rust-hir-full-test.cc | 5 | ||||
-rw-r--r-- | gcc/rust/hir/tree/rust-hir.h | 12 | ||||
-rw-r--r-- | gcc/rust/lex/rust-lex.cc | 13 | ||||
-rw-r--r-- | gcc/rust/parse/rust-parse-impl.h | 8 | ||||
-rw-r--r-- | gcc/testsuite/rust/compile/torture/bom.rs | 1 | ||||
-rw-r--r-- | gcc/testsuite/rust/compile/torture/bom_comment.rs | 2 | ||||
-rw-r--r-- | gcc/testsuite/rust/compile/torture/bom_shebang.rs | 2 | ||||
-rw-r--r-- | gcc/testsuite/rust/compile/torture/bom_whitespace.rs | 2 |
11 files changed, 29 insertions, 34 deletions
diff --git a/gcc/rust/ast/rust-ast-full-test.cc b/gcc/rust/ast/rust-ast-full-test.cc index 12ef255..dd55e1d 100644 --- a/gcc/rust/ast/rust-ast-full-test.cc +++ b/gcc/rust/ast/rust-ast-full-test.cc @@ -172,9 +172,6 @@ Crate::as_string () const rust_debug ("beginning crate recursive as-string"); std::string str ("Crate: "); - // add utf8bom - if (has_utf8bom) - str += "\n has utf8bom"; // inner attributes str += append_attributes (inner_attrs, INNER); diff --git a/gcc/rust/ast/rust-ast.h b/gcc/rust/ast/rust-ast.h index ce55e1b..75b08f8 100644 --- a/gcc/rust/ast/rust-ast.h +++ b/gcc/rust/ast/rust-ast.h @@ -1550,8 +1550,6 @@ protected: // A crate AST object - holds all the data for a single compilation unit struct Crate { - bool has_utf8bom; - std::vector<Attribute> inner_attrs; // dodgy spacing required here /* TODO: is it better to have a vector of items here or a module (implicit @@ -1563,16 +1561,14 @@ struct Crate public: // Constructor Crate (std::vector<std::unique_ptr<Item> > items, - std::vector<Attribute> inner_attrs, bool has_utf8bom = false) - : has_utf8bom (has_utf8bom), inner_attrs (std::move (inner_attrs)), - items (std::move (items)), + std::vector<Attribute> inner_attrs) + : inner_attrs (std::move (inner_attrs)), items (std::move (items)), node_id (Analysis::Mappings::get ()->get_next_node_id ()) {} // Copy constructor with vector clone Crate (Crate const &other) - : has_utf8bom (other.has_utf8bom), inner_attrs (other.inner_attrs), - node_id (other.node_id) + : inner_attrs (other.inner_attrs), node_id (other.node_id) { items.reserve (other.items.size ()); for (const auto &e : other.items) @@ -1585,7 +1581,6 @@ public: Crate &operator= (Crate const &other) { inner_attrs = other.inner_attrs; - has_utf8bom = other.has_utf8bom; node_id = other.node_id; items.reserve (other.items.size ()); diff --git a/gcc/rust/hir/rust-ast-lower.cc b/gcc/rust/hir/rust-ast-lower.cc index 0f3c86d..01abd84 100644 --- a/gcc/rust/hir/rust-ast-lower.cc +++ 
b/gcc/rust/hir/rust-ast-lower.cc @@ -40,7 +40,6 @@ HIR::Crate ASTLowering::go () { std::vector<std::unique_ptr<HIR::Item> > items; - bool has_utf8bom = false; for (auto it = astCrate.items.begin (); it != astCrate.items.end (); it++) { @@ -55,8 +54,7 @@ ASTLowering::go () mappings->get_next_hir_id (crate_num), UNKNOWN_LOCAL_DEFID); - return HIR::Crate (std::move (items), astCrate.get_inner_attrs (), mapping, - has_utf8bom); + return HIR::Crate (std::move (items), astCrate.get_inner_attrs (), mapping); } // rust-ast-lower-block.h diff --git a/gcc/rust/hir/tree/rust-hir-full-test.cc b/gcc/rust/hir/tree/rust-hir-full-test.cc index 051ba87..05c75e0 100644 --- a/gcc/rust/hir/tree/rust-hir-full-test.cc +++ b/gcc/rust/hir/tree/rust-hir-full-test.cc @@ -73,11 +73,6 @@ std::string Crate::as_string () const { std::string str ("HIR::Crate: "); - // add utf8bom - if (has_utf8bom) - { - str += "\n has utf8bom"; - } // inner attributes str += "\n inner attributes: "; diff --git a/gcc/rust/hir/tree/rust-hir.h b/gcc/rust/hir/tree/rust-hir.h index f918f2d..1819d17 100644 --- a/gcc/rust/hir/tree/rust-hir.h +++ b/gcc/rust/hir/tree/rust-hir.h @@ -678,8 +678,6 @@ public: // A crate HIR object - holds all the data for a single compilation unit struct Crate { - bool has_utf8bom; - AST::AttrVec inner_attrs; // dodgy spacing required here /* TODO: is it better to have a vector of items here or a module (implicit @@ -691,15 +689,14 @@ struct Crate public: // Constructor Crate (std::vector<std::unique_ptr<Item> > items, AST::AttrVec inner_attrs, - Analysis::NodeMapping mappings, bool has_utf8bom = false) - : has_utf8bom (has_utf8bom), inner_attrs (std::move (inner_attrs)), - items (std::move (items)), mappings (mappings) + Analysis::NodeMapping mappings) + : inner_attrs (std::move (inner_attrs)), items (std::move (items)), + mappings (mappings) {} // Copy constructor with vector clone Crate (Crate const &other) - : has_utf8bom (other.has_utf8bom), inner_attrs (other.inner_attrs), - mappings 
(other.mappings) + : inner_attrs (other.inner_attrs), mappings (other.mappings) { items.reserve (other.items.size ()); for (const auto &e : other.items) @@ -712,7 +709,6 @@ public: Crate &operator= (Crate const &other) { inner_attrs = other.inner_attrs; - has_utf8bom = other.has_utf8bom; mappings = other.mappings; items.reserve (other.items.size ()); diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc index ebd69de..617dd69 100644 --- a/gcc/rust/lex/rust-lex.cc +++ b/gcc/rust/lex/rust-lex.cc @@ -237,6 +237,19 @@ Lexer::build_token () current_char = peek_input (); skip_input (); + // detect UTF8 bom + // + // Must be the first thing on the first line. + // There might be an optional BOM (Byte Order Mark), which for UTF-8 is + // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped. + if (current_line == 1 && current_column == 1 && current_char == 0xef + && peek_input () == 0xbb && peek_input (1) == 0xbf) + { + skip_input (1); + current_char = peek_input (); + skip_input (); + } + // detect shebang // Must be the first thing on the first line, starting with #! // But since an attribute can also start with an #! we don't count it as a diff --git a/gcc/rust/parse/rust-parse-impl.h b/gcc/rust/parse/rust-parse-impl.h index 136b343..a8597fa 100644 --- a/gcc/rust/parse/rust-parse-impl.h +++ b/gcc/rust/parse/rust-parse-impl.h @@ -393,12 +393,6 @@ template <typename ManagedTokenSource> AST::Crate Parser<ManagedTokenSource>::parse_crate () { - /* TODO: determine if has utf8bom. Currently, is eliminated - * by the lexing phase. Not useful for the compiler anyway, so maybe a - * better idea would be to eliminate - * the has_utf8bom variable from the crate data structure. 
*/ - bool has_utf8bom = false; - // parse inner attributes AST::AttrVec inner_attrs = parse_inner_attributes (); @@ -429,7 +423,7 @@ Parser<ManagedTokenSource>::parse_crate () for (const auto &error : error_table) error.emit_error (); - return AST::Crate (std::move (items), std::move (inner_attrs), has_utf8bom); + return AST::Crate (std::move (items), std::move (inner_attrs)); } // Parse a contiguous block of inner attributes. diff --git a/gcc/testsuite/rust/compile/torture/bom.rs b/gcc/testsuite/rust/compile/torture/bom.rs new file mode 100644 index 0000000..5edcab2 --- /dev/null +++ b/gcc/testsuite/rust/compile/torture/bom.rs @@ -0,0 +1 @@ +pub fn main () { } diff --git a/gcc/testsuite/rust/compile/torture/bom_comment.rs b/gcc/testsuite/rust/compile/torture/bom_comment.rs new file mode 100644 index 0000000..020e170 --- /dev/null +++ b/gcc/testsuite/rust/compile/torture/bom_comment.rs @@ -0,0 +1,2 @@ +// UTF8 BOM +pub fn main () { } diff --git a/gcc/testsuite/rust/compile/torture/bom_shebang.rs b/gcc/testsuite/rust/compile/torture/bom_shebang.rs new file mode 100644 index 0000000..4c552e8 --- /dev/null +++ b/gcc/testsuite/rust/compile/torture/bom_shebang.rs @@ -0,0 +1,2 @@ +#!/usr/bin/cat +pub fn main () { } diff --git a/gcc/testsuite/rust/compile/torture/bom_whitespace.rs b/gcc/testsuite/rust/compile/torture/bom_whitespace.rs new file mode 100644 index 0000000..b10d565 --- /dev/null +++ b/gcc/testsuite/rust/compile/torture/bom_whitespace.rs @@ -0,0 +1,2 @@ + +pub fn main () { } |