aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorbors[bot] <26634292+bors[bot]@users.noreply.github.com>2021-07-06 07:04:19 +0000
committerGitHub <noreply@github.com>2021-07-06 07:04:19 +0000
commit630efc56d1059bac464b38807dab58cd322f7a7f (patch)
tree6e3a1c42920a2dd59c1509c92dba376e51a1ffa4 /gcc
parent27b3d34428801397e562b7fcc5ca10b13961f3e1 (diff)
parent94ada647bf35c727c17ade60d06af6803a7e5668 (diff)
downloadgcc-630efc56d1059bac464b38807dab58cd322f7a7f.zip
gcc-630efc56d1059bac464b38807dab58cd322f7a7f.tar.gz
gcc-630efc56d1059bac464b38807dab58cd322f7a7f.tar.bz2
Merge #552
552: UTF-8 BOM handling r=dkm a=dkm Mark Wielaard (https://gcc.gnu.org/pipermail/gcc-rust/2021-July/000072.html): > A rust source file can start with a UTF-8 BOM sequence (EF BB > BF). This simply indicates that the file is encoded as UTF-8 (all rust > input is interpreted as a sequence of Unicode code points encoded in > UTF-8) so can be skipped before starting real lexing. > > It isn't necessary to keep track of the BOM in the AST or HIR Crate > classes. So I removed the has_utf8bom flag. > > Also included are a couple of simple tests to show we handle the BOM > correctly now. Co-authored-by: Mark Wielaard <mark@klomp.org>
Diffstat (limited to 'gcc')
-rw-r--r--gcc/rust/ast/rust-ast-full-test.cc3
-rw-r--r--gcc/rust/ast/rust-ast.h11
-rw-r--r--gcc/rust/hir/rust-ast-lower.cc4
-rw-r--r--gcc/rust/hir/tree/rust-hir-full-test.cc5
-rw-r--r--gcc/rust/hir/tree/rust-hir.h12
-rw-r--r--gcc/rust/lex/rust-lex.cc13
-rw-r--r--gcc/rust/parse/rust-parse-impl.h8
-rw-r--r--gcc/testsuite/rust/compile/torture/bom.rs1
-rw-r--r--gcc/testsuite/rust/compile/torture/bom_comment.rs2
-rw-r--r--gcc/testsuite/rust/compile/torture/bom_shebang.rs2
-rw-r--r--gcc/testsuite/rust/compile/torture/bom_whitespace.rs2
11 files changed, 29 insertions, 34 deletions
diff --git a/gcc/rust/ast/rust-ast-full-test.cc b/gcc/rust/ast/rust-ast-full-test.cc
index 12ef255..dd55e1d 100644
--- a/gcc/rust/ast/rust-ast-full-test.cc
+++ b/gcc/rust/ast/rust-ast-full-test.cc
@@ -172,9 +172,6 @@ Crate::as_string () const
rust_debug ("beginning crate recursive as-string");
std::string str ("Crate: ");
- // add utf8bom
- if (has_utf8bom)
- str += "\n has utf8bom";
// inner attributes
str += append_attributes (inner_attrs, INNER);
diff --git a/gcc/rust/ast/rust-ast.h b/gcc/rust/ast/rust-ast.h
index ce55e1b..75b08f8 100644
--- a/gcc/rust/ast/rust-ast.h
+++ b/gcc/rust/ast/rust-ast.h
@@ -1550,8 +1550,6 @@ protected:
// A crate AST object - holds all the data for a single compilation unit
struct Crate
{
- bool has_utf8bom;
-
std::vector<Attribute> inner_attrs;
// dodgy spacing required here
/* TODO: is it better to have a vector of items here or a module (implicit
@@ -1563,16 +1561,14 @@ struct Crate
public:
// Constructor
Crate (std::vector<std::unique_ptr<Item> > items,
- std::vector<Attribute> inner_attrs, bool has_utf8bom = false)
- : has_utf8bom (has_utf8bom), inner_attrs (std::move (inner_attrs)),
- items (std::move (items)),
+ std::vector<Attribute> inner_attrs)
+ : inner_attrs (std::move (inner_attrs)), items (std::move (items)),
node_id (Analysis::Mappings::get ()->get_next_node_id ())
{}
// Copy constructor with vector clone
Crate (Crate const &other)
- : has_utf8bom (other.has_utf8bom), inner_attrs (other.inner_attrs),
- node_id (other.node_id)
+ : inner_attrs (other.inner_attrs), node_id (other.node_id)
{
items.reserve (other.items.size ());
for (const auto &e : other.items)
@@ -1585,7 +1581,6 @@ public:
Crate &operator= (Crate const &other)
{
inner_attrs = other.inner_attrs;
- has_utf8bom = other.has_utf8bom;
node_id = other.node_id;
items.reserve (other.items.size ());
diff --git a/gcc/rust/hir/rust-ast-lower.cc b/gcc/rust/hir/rust-ast-lower.cc
index 0f3c86d..01abd84 100644
--- a/gcc/rust/hir/rust-ast-lower.cc
+++ b/gcc/rust/hir/rust-ast-lower.cc
@@ -40,7 +40,6 @@ HIR::Crate
ASTLowering::go ()
{
std::vector<std::unique_ptr<HIR::Item> > items;
- bool has_utf8bom = false;
for (auto it = astCrate.items.begin (); it != astCrate.items.end (); it++)
{
@@ -55,8 +54,7 @@ ASTLowering::go ()
mappings->get_next_hir_id (crate_num),
UNKNOWN_LOCAL_DEFID);
- return HIR::Crate (std::move (items), astCrate.get_inner_attrs (), mapping,
- has_utf8bom);
+ return HIR::Crate (std::move (items), astCrate.get_inner_attrs (), mapping);
}
// rust-ast-lower-block.h
diff --git a/gcc/rust/hir/tree/rust-hir-full-test.cc b/gcc/rust/hir/tree/rust-hir-full-test.cc
index 051ba87..05c75e0 100644
--- a/gcc/rust/hir/tree/rust-hir-full-test.cc
+++ b/gcc/rust/hir/tree/rust-hir-full-test.cc
@@ -73,11 +73,6 @@ std::string
Crate::as_string () const
{
std::string str ("HIR::Crate: ");
- // add utf8bom
- if (has_utf8bom)
- {
- str += "\n has utf8bom";
- }
// inner attributes
str += "\n inner attributes: ";
diff --git a/gcc/rust/hir/tree/rust-hir.h b/gcc/rust/hir/tree/rust-hir.h
index f918f2d..1819d17 100644
--- a/gcc/rust/hir/tree/rust-hir.h
+++ b/gcc/rust/hir/tree/rust-hir.h
@@ -678,8 +678,6 @@ public:
// A crate HIR object - holds all the data for a single compilation unit
struct Crate
{
- bool has_utf8bom;
-
AST::AttrVec inner_attrs;
// dodgy spacing required here
/* TODO: is it better to have a vector of items here or a module (implicit
@@ -691,15 +689,14 @@ struct Crate
public:
// Constructor
Crate (std::vector<std::unique_ptr<Item> > items, AST::AttrVec inner_attrs,
- Analysis::NodeMapping mappings, bool has_utf8bom = false)
- : has_utf8bom (has_utf8bom), inner_attrs (std::move (inner_attrs)),
- items (std::move (items)), mappings (mappings)
+ Analysis::NodeMapping mappings)
+ : inner_attrs (std::move (inner_attrs)), items (std::move (items)),
+ mappings (mappings)
{}
// Copy constructor with vector clone
Crate (Crate const &other)
- : has_utf8bom (other.has_utf8bom), inner_attrs (other.inner_attrs),
- mappings (other.mappings)
+ : inner_attrs (other.inner_attrs), mappings (other.mappings)
{
items.reserve (other.items.size ());
for (const auto &e : other.items)
@@ -712,7 +709,6 @@ public:
Crate &operator= (Crate const &other)
{
inner_attrs = other.inner_attrs;
- has_utf8bom = other.has_utf8bom;
mappings = other.mappings;
items.reserve (other.items.size ());
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index ebd69de..617dd69 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -237,6 +237,19 @@ Lexer::build_token ()
current_char = peek_input ();
skip_input ();
+ // detect UTF8 bom
+ //
+ // Must be the first thing on the first line.
+ // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
+ // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
+ if (current_line == 1 && current_column == 1 && current_char == 0xef
+ && peek_input () == 0xbb && peek_input (1) == 0xbf)
+ {
+ skip_input (1);
+ current_char = peek_input ();
+ skip_input ();
+ }
+
// detect shebang
// Must be the first thing on the first line, starting with #!
// But since an attribute can also start with an #! we don't count it as a
diff --git a/gcc/rust/parse/rust-parse-impl.h b/gcc/rust/parse/rust-parse-impl.h
index 136b343..a8597fa 100644
--- a/gcc/rust/parse/rust-parse-impl.h
+++ b/gcc/rust/parse/rust-parse-impl.h
@@ -393,12 +393,6 @@ template <typename ManagedTokenSource>
AST::Crate
Parser<ManagedTokenSource>::parse_crate ()
{
- /* TODO: determine if has utf8bom. Currently, is eliminated
- * by the lexing phase. Not useful for the compiler anyway, so maybe a
- * better idea would be to eliminate
- * the has_utf8bom variable from the crate data structure. */
- bool has_utf8bom = false;
-
// parse inner attributes
AST::AttrVec inner_attrs = parse_inner_attributes ();
@@ -429,7 +423,7 @@ Parser<ManagedTokenSource>::parse_crate ()
for (const auto &error : error_table)
error.emit_error ();
- return AST::Crate (std::move (items), std::move (inner_attrs), has_utf8bom);
+ return AST::Crate (std::move (items), std::move (inner_attrs));
}
// Parse a contiguous block of inner attributes.
diff --git a/gcc/testsuite/rust/compile/torture/bom.rs b/gcc/testsuite/rust/compile/torture/bom.rs
new file mode 100644
index 0000000..5edcab2
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom.rs
@@ -0,0 +1 @@
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_comment.rs b/gcc/testsuite/rust/compile/torture/bom_comment.rs
new file mode 100644
index 0000000..020e170
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_comment.rs
@@ -0,0 +1,2 @@
+// UTF8 BOM
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_shebang.rs b/gcc/testsuite/rust/compile/torture/bom_shebang.rs
new file mode 100644
index 0000000..4c552e8
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_shebang.rs
@@ -0,0 +1,2 @@
+#!/usr/bin/cat
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_whitespace.rs b/gcc/testsuite/rust/compile/torture/bom_whitespace.rs
new file mode 100644
index 0000000..b10d565
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_whitespace.rs
@@ -0,0 +1,2 @@
+
+pub fn main () { }