diff options
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/rust/lex/rust-input-source.h | 193 | ||||
-rw-r--r-- | gcc/rust/lex/rust-lex.cc | 7 | ||||
-rw-r--r-- | gcc/rust/lex/rust-lex.h | 181 | ||||
-rw-r--r-- | gcc/rust/lex/rust-token.cc | 18 | ||||
-rw-r--r-- | gcc/rust/lex/rust-token.h | 40 | ||||
-rw-r--r-- | gcc/rust/rust-lang.cc | 2 | ||||
-rw-r--r-- | gcc/rust/rust-session-manager.cc | 9 | ||||
-rw-r--r-- | gcc/rust/util/rust-unicode.cc | 41 | ||||
-rw-r--r-- | gcc/rust/util/rust-unicode.h | 35 | ||||
-rw-r--r-- | gcc/testsuite/rust/compile/unicode_norm1.rs | 6 |
10 files changed, 304 insertions, 228 deletions
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h new file mode 100644 index 0000000..07137de --- /dev/null +++ b/gcc/rust/lex/rust-input-source.h @@ -0,0 +1,193 @@ +#ifndef RUST_INPUT_SOURCE_H +#define RUST_INPUT_SOURCE_H + +#include "rust-codepoint.h" +#include "optional.h" + +namespace Rust { +// Input source wrapper thing. +class InputSource +{ +private: + // position of current character + unsigned int pos; + std::vector<Codepoint> chars; + bool is_valid_utf8; + + // Overload operator () to return next char from input stream. + virtual int next_byte () = 0; + + Codepoint next_codepoint () + { + uint32_t input = next_byte (); + + if ((int32_t) input == EOF) + return Codepoint::eof (); + else if (input < 128) + { + // ascii -- 1 byte + return {input}; + } + else if ((input & 0xC0) == 0x80) + { + // invalid (continuation; can't be first char) + return {0xFFFE}; + } + else if ((input & 0xE0) == 0xC0) + { + // 2 bytes + uint8_t input2 = next_byte (); + if ((input2 & 0xC0) != 0x80) + return {0xFFFE}; + + uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); + return output; + } + else if ((input & 0xF0) == 0xE0) + { + // 3 bytes or UTF-8 BOM + uint8_t input2 = next_byte (); + // If the second byte is equal to 0xBB then the input is no longer a + // valid UTF-8 char. Then, we check if the third byte makes up a UTF + // BOM. 
+ if (input == 0xEF && input2 == 0xBB) + { + uint8_t input3 = next_byte (); + if (input3 == 0xBF) + // found BOM + return next_codepoint (); + else + return {0xFFFE}; + } + + if ((input2 & 0xC0) != 0x80) + return {0xFFFE}; + + uint8_t input3 = next_byte (); + + if ((input3 & 0xC0) != 0x80) + return {0xFFFE}; + + uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) + | ((input3 & 0x3F) << 0); + return {output}; + } + else if ((input & 0xF8) == 0xF0) + { + // 4 bytes + uint8_t input2 = next_byte (); + if ((input2 & 0xC0) != 0x80) + return {0xFFFE}; + + uint8_t input3 = next_byte (); + if ((input3 & 0xC0) != 0x80) + return {0xFFFE}; + + uint8_t input4 = next_byte (); + if ((input4 & 0xC0) != 0x80) + return {0xFFFE}; + + uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); + return {output}; + } + else + { + return {0xFFFE}; + } + } + +protected: + // Check if the input source is valid as utf-8 and copy all characters to + // `chars`. + void init () + { + Codepoint char32 = next_codepoint (); + while (!char32.is_eof () && char32 != 0xFFFE) + { + chars.push_back (char32); + char32 = next_codepoint (); + } + + if (char32 == 0xFFFE) + { + // Input source is not valid as utf-8. + is_valid_utf8 = false; + } + } + +public: + InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {} + + virtual ~InputSource () {} + + // Checks if input source is a valid UTF-8 string + bool is_valid () { return is_valid_utf8; } + + // get the next UTF-8 character + Codepoint next () + { + if (pos >= chars.size ()) + return Codepoint::eof (); + else + { + Codepoint c = chars[pos]; + pos++; + return c; + } + } + + // Returns codepoint if input source is a valid UTF-8 string. Returns + // nullopt otherwise. 
+ tl::optional<std::vector<Codepoint>> get_chars () + { + if (is_valid ()) + return {chars}; + else + return tl::nullopt; + } +}; + +class FileInputSource : public InputSource +{ +private: + // Input source file. + FILE *input; + + int next_byte () override { return fgetc (input); } + +public: + // Create new input source from file. + FileInputSource (FILE *input) : InputSource (), input (input) + { + // TODO make this better? + init (); + } +}; + +class BufferInputSource : public InputSource +{ +private: + const std::string &buffer; + size_t offs; + + int next_byte () override + { + if (offs >= buffer.size ()) + return EOF; + return (uint8_t) buffer.at (offs++); + } + +public: + // Create new input source from file. + BufferInputSource (const std::string &b, size_t offset) + : InputSource (), buffer (b), offs (offset) + { + // TODO make this better? + init (); + } +}; + +} // namespace Rust + +#endif diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc index c40e700..2a92465 100644 --- a/gcc/rust/lex/rust-lex.cc +++ b/gcc/rust/lex/rust-lex.cc @@ -2534,8 +2534,7 @@ namespace selftest { // Checks if `src` has the same contents as the given characters void -assert_source_content (Rust::Lexer::InputSource &src, - std::vector<uint32_t> expected) +assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected) { Rust::Codepoint src_char = src.next (); for (auto expected_char : expected) @@ -2553,7 +2552,7 @@ assert_source_content (Rust::Lexer::InputSource &src, void test_buffer_input_source (std::string str, std::vector<uint32_t> expected) { - Rust::Lexer::BufferInputSource source (str, 0); + Rust::BufferInputSource source (str, 0); assert_source_content (source, expected); } @@ -2564,7 +2563,7 @@ test_file_input_source (std::string str, std::vector<uint32_t> expected) // Moves to the first character fputs (str.c_str (), tmpf); std::rewind (tmpf); - Rust::Lexer::FileInputSource source (tmpf); + Rust::FileInputSource source (tmpf); 
assert_source_content (source, expected); } diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h index 273b8c7..6a29c0a 100644 --- a/gcc/rust/lex/rust-lex.h +++ b/gcc/rust/lex/rust-lex.h @@ -24,6 +24,7 @@ #include "rust-token.h" #include "optional.h" #include "selftest.h" +#include "rust-input-source.h" namespace Rust { // Simple wrapper for FILE* that simplifies destruction. @@ -204,186 +205,6 @@ public: Linemap *get_line_map () { return line_map; } std::string get_filename () { return std::string (input.get_filename ()); } - // Input source wrapper thing. - class InputSource - { - private: - // position of current character - unsigned int pos; - std::vector<Codepoint> chars; - bool is_valid_utf8; - - // Overload operator () to return next char from input stream. - virtual int next_byte () = 0; - - Codepoint next_codepoint () - { - uint32_t input = next_byte (); - - if ((int32_t) input == EOF) - return Codepoint::eof (); - else if (input < 128) - { - // ascii -- 1 byte - return {input}; - } - else if ((input & 0xC0) == 0x80) - { - // invalid (continuation; can't be first char) - return {0xFFFE}; - } - else if ((input & 0xE0) == 0xC0) - { - // 2 bytes - uint8_t input2 = next_byte (); - if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; - - uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); - return output; - } - else if ((input & 0xF0) == 0xE0) - { - // 3 bytes or UTF-8 BOM - uint8_t input2 = next_byte (); - // If the second byte is equal to 0xBB then the input is no longer a - // valid UTF-8 char. Then, we check if the third byte makes up a UTF - // BOM. 
- if (input == 0xEF && input2 == 0xBB) - { - uint8_t input3 = next_byte (); - if (input3 == 0xBF) - // found BOM - return next_codepoint (); - else - return {0xFFFE}; - } - - if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; - - uint8_t input3 = next_byte (); - - if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; - - uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) - | ((input3 & 0x3F) << 0); - return {output}; - } - else if ((input & 0xF8) == 0xF0) - { - // 4 bytes - uint8_t input2 = next_byte (); - if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; - - uint8_t input3 = next_byte (); - if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; - - uint8_t input4 = next_byte (); - if ((input4 & 0xC0) != 0x80) - return {0xFFFE}; - - uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) - | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); - return {output}; - } - else - { - return {0xFFFE}; - } - } - - protected: - // Check if the input source is valid as utf-8 and copy all characters to - // `chars`. - void init () - { - Codepoint char32 = next_codepoint (); - while (!char32.is_eof () && char32 != 0xFFFE) - { - chars.push_back (char32); - char32 = next_codepoint (); - } - - if (char32 == 0xFFFE) - { - // Input source is not valid as utf-8. - is_valid_utf8 = false; - } - } - - public: - InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {} - - virtual ~InputSource () {} - - bool is_valid () { return is_valid_utf8; } - - // get the next UTF-8 character - Codepoint next () - { - if (pos >= chars.size ()) - return Codepoint::eof (); - else - { - Codepoint c = chars[pos]; - pos++; - return c; - } - } - - tl::optional<std::vector<Codepoint>> get_chars () - { - if (is_valid ()) - return {chars}; - else - return tl::nullopt; - } - }; - - class FileInputSource : public InputSource - { - private: - // Input source file. - FILE *input; - - int next_byte () override { return fgetc (input); } - - public: - // Create new input source from file. 
- FileInputSource (FILE *input) : InputSource (), input (input) - { - // TODO make this better? - init (); - } - }; - - class BufferInputSource : public InputSource - { - private: - const std::string &buffer; - size_t offs; - - int next_byte () override - { - if (offs >= buffer.size ()) - return EOF; - return (uint8_t) buffer.at (offs++); - } - - public: - // Create new input source from file. - BufferInputSource (const std::string &b, size_t offset) - : InputSource (), buffer (b), offs (offset) - { - // TODO make this better? - init (); - } - }; - private: void start_line (int current_line, int current_column); diff --git a/gcc/rust/lex/rust-token.cc b/gcc/rust/lex/rust-token.cc index 77ec6cf..9a1132f 100644 --- a/gcc/rust/lex/rust-token.cc +++ b/gcc/rust/lex/rust-token.cc @@ -19,6 +19,7 @@ #include "rust-system.h" #include "rust-token.h" #include "rust-diagnostics.h" +#include "rust-unicode.h" namespace Rust { // Hackily defined way to get token description for enum value using x-macros @@ -150,6 +151,23 @@ Token::get_type_hint_str () const return get_type_hint_string (type_hint); } +std::string +nfc_normalize_token_string (location_t loc, TokenId id, const std::string &str) +{ + if (id == IDENTIFIER || id == LIFETIME) + { + tl::optional<Utf8String> ustring = Utf8String::make_utf8_string (str); + if (ustring.has_value ()) + return ustring.value ().nfc_normalize ().as_string (); + else + rust_internal_error_at (loc, + "identifier '%s' is not a valid UTF-8 string", + str.c_str ()); + } + else + return str; +} + const std::string & Token::get_str () const { diff --git a/gcc/rust/lex/rust-token.h b/gcc/rust/lex/rust-token.h index 5675351..c4ea176 100644 --- a/gcc/rust/lex/rust-token.h +++ b/gcc/rust/lex/rust-token.h @@ -21,7 +21,8 @@ #include "rust-system.h" #include "rust-linemap.h" -#include "rust-codepoint.h" +#include "rust-make-unique.h" +#include "rust-unicode.h" namespace Rust { // "Primitive core types" in Rust - the different int and float types, as well @@ 
-236,6 +237,10 @@ token_id_keyword_string (TokenId id); const char * get_type_hint_string (PrimitiveCoreType type); +/* Normalize string if a token is a identifier */ +std::string +nfc_normalize_token_string (location_t loc, TokenId id, const std::string &str); + // Represents a single token. Create using factory static methods. class Token { @@ -259,29 +264,40 @@ private: // Token constructor from token id, location, and a string. Token (TokenId token_id, location_t location, std::string &&paramStr) - : token_id (token_id), locus (location), - str (new std::string (std::move (paramStr))), type_hint (CORETYPE_UNKNOWN) - {} + : token_id (token_id), locus (location), type_hint (CORETYPE_UNKNOWN) + { + // Normalize identifier tokens + str = Rust::make_unique<std::string> ( + nfc_normalize_token_string (location, token_id, paramStr)); + } // Token constructor from token id, location, and a char. Token (TokenId token_id, location_t location, char paramChar) : token_id (token_id), locus (location), str (new std::string (1, paramChar)), type_hint (CORETYPE_UNKNOWN) - {} + { + // Do not need to normalize 1byte char + } // Token constructor from token id, location, and a "codepoint". Token (TokenId token_id, location_t location, Codepoint paramCodepoint) - : token_id (token_id), locus (location), - str (new std::string (paramCodepoint.as_string ())), - type_hint (CORETYPE_UNKNOWN) - {} + : token_id (token_id), locus (location), type_hint (CORETYPE_UNKNOWN) + { + // Normalize identifier tokens + str = Rust::make_unique<std::string> ( + nfc_normalize_token_string (location, token_id, + paramCodepoint.as_string ())); + } // Token constructor from token id, location, a string, and type hint. 
Token (TokenId token_id, location_t location, std::string &&paramStr, PrimitiveCoreType parType) - : token_id (token_id), locus (location), - str (new std::string (std::move (paramStr))), type_hint (parType) - {} + : token_id (token_id), locus (location), type_hint (parType) + { + // Normalize identifier tokens + str = Rust::make_unique<std::string> ( + nfc_normalize_token_string (location, token_id, paramStr)); + } public: // No default constructor. diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc index f07d18f..44dc3fc 100644 --- a/gcc/rust/rust-lang.cc +++ b/gcc/rust/rust-lang.cc @@ -455,11 +455,11 @@ run_rust_tests () { // Call tests for the rust frontend here rust_input_source_test (); + rust_utf8_normalize_test (); rust_cfg_parser_test (); rust_privacy_ctx_test (); rust_crate_name_validation_test (); rust_simple_path_resolve_test (); - rust_utf8_normalize_test (); } } // namespace selftest diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc index 3c00ccb..b860a32 100644 --- a/gcc/rust/rust-session-manager.cc +++ b/gcc/rust/rust-session-manager.cc @@ -115,16 +115,15 @@ infer_crate_name (const std::string &filename) static bool validate_crate_name (const std::string &crate_name, Error &error) { - Utf8String utf8_name = {crate_name}; - tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars (); - - if (!uchars_opt.has_value ()) + tl::optional<Utf8String> utf8_name_opt + = Utf8String::make_utf8_string (crate_name); + if (!utf8_name_opt.has_value ()) { error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string"); return false; } - std::vector<Codepoint> uchars = uchars_opt.value (); + std::vector<Codepoint> uchars = utf8_name_opt->get_chars (); if (uchars.empty ()) { error = Error (UNDEF_LOCATION, "crate name cannot be empty"); diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc index c6aa063..b2ddaf0 100644 --- a/gcc/rust/util/rust-unicode.cc +++ b/gcc/rust/util/rust-unicode.cc @@ 
-1,12 +1,14 @@ #include "rust-system.h" #include "optional.h" #include "selftest.h" +#include "rust-lex.h" +#include "rust-unicode.h" #include "rust-unicode-data.h" namespace Rust { -typedef uint32_t codepoint_t; +typedef Codepoint codepoint_t; typedef std::vector<codepoint_t> string_t; // These constants are used to compose and decompose of Hangul syllables. @@ -85,7 +87,7 @@ binary_search_sorted_array (const std::array<uint32_t, SIZE> &array, int lookup_cc (codepoint_t c) { - auto it = Rust::CCC_TABLE.find (c); + auto it = Rust::CCC_TABLE.find (c.value); if (it != Rust::CCC_TABLE.end ()) return it->second; else @@ -96,11 +98,11 @@ lookup_cc (codepoint_t c) tl::optional<codepoint_t> lookup_recomp (codepoint_t starter, codepoint_t c) { - auto it = Rust::RECOMPOSITION_MAP.find ({starter, c}); + auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value}); if (it != Rust::RECOMPOSITION_MAP.end ()) return {it->second}; - it = Rust::RECOMPOSITION_MAP.find ({starter, 0}); + it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0}); if (it != Rust::RECOMPOSITION_MAP.end ()) return {it->second}; @@ -110,11 +112,11 @@ lookup_recomp (codepoint_t starter, codepoint_t c) void recursive_decomp_cano (codepoint_t c, string_t &buf) { - auto it = Rust::DECOMPOSITION_MAP.find (c); + auto it = Rust::DECOMPOSITION_MAP.find (c.value); if (it != Rust::DECOMPOSITION_MAP.end ()) { - string_t decomped = it->second; - for (codepoint_t cp : decomped) + std::vector<uint32_t> decomped = it->second; + for (uint32_t cp : decomped) recursive_decomp_cano (cp, buf); } else @@ -127,7 +129,7 @@ decomp_cano (string_t s) string_t buf; for (codepoint_t c : s) { - int64_t s_index = c - S_BASE; + int64_t s_index = c.value - S_BASE; if (0 <= s_index && s_index < S_COUNT) { // decompose Hangul argorithmically @@ -160,7 +162,7 @@ sort_cano (string_t &s) if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here) { // swap - int tmp = s[i]; + codepoint_t tmp = s[i]; s[i] = s[i - 1]; s[i - 1] = tmp; if (i > 1) @@ 
-183,10 +185,10 @@ compose_hangul (string_t s) codepoint_t ch = s[src_pos]; // L V => LV - int64_t l_index = last - L_BASE; + int64_t l_index = last.value - L_BASE; if (0 <= l_index && l_index < L_COUNT) { - int64_t v_index = ch - V_BASE; + int64_t v_index = ch.value - V_BASE; if (0 <= v_index && v_index < V_COUNT) { last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT; @@ -198,13 +200,13 @@ compose_hangul (string_t s) } // LV T => LVT - int64_t s_index = last - S_BASE; + int64_t s_index = last.value - S_BASE; if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0) { - int64_t t_index = ch - T_BASE; + int64_t t_index = ch.value - T_BASE; if (0 < t_index && t_index < T_COUNT) { - last += t_index; + last.value += t_index; // pop LV buf.pop_back (); buf.push_back (last); @@ -282,6 +284,12 @@ nfc_normalize (string_t s) return r; } +Utf8String +Utf8String::nfc_normalize () const +{ + return Utf8String (Rust::nfc_normalize (chars)); +} + bool is_alphabetic (uint32_t codepoint) { @@ -309,9 +317,10 @@ is_numeric (uint32_t codepoint) namespace selftest { void -assert_normalize (std::vector<uint32_t> origin, std::vector<uint32_t> expected) +assert_normalize (const std::vector<Rust::Codepoint> origin, + const std::vector<Rust::Codepoint> expected) { - std::vector<uint32_t> actual = Rust::nfc_normalize (origin); + std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin); ASSERT_EQ (actual.size (), expected.size ()); for (unsigned int i = 0; i < actual.size (); i++) diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h index 6800558..becf6fb 100644 --- a/gcc/rust/util/rust-unicode.h +++ b/gcc/rust/util/rust-unicode.h @@ -21,28 +21,43 @@ #include "optional.h" #include "rust-system.h" -#include "rust-lex.h" +#include "rust-input-source.h" namespace Rust { class Utf8String { private: - tl::optional<std::vector<Codepoint>> chars; + std::vector<Codepoint> chars; public: - Utf8String (const std::string &maybe_utf8) + static 
tl::optional<Utf8String> + make_utf8_string (const std::string &maybe_utf8) { - Lexer::BufferInputSource input_source = {maybe_utf8, 0}; - chars = input_source.get_chars (); + BufferInputSource input_source = {maybe_utf8, 0}; + tl::optional<std::vector<Codepoint>> chars_opt = input_source.get_chars (); + if (chars_opt.has_value ()) + return {Utf8String (chars_opt.value ())}; + else + return tl::nullopt; } - // Returns UTF codepoints when string is valid as UTF-8, returns nullopt - // otherwise. - tl::optional<std::vector<Codepoint>> get_chars () const { return chars; } -}; + Utf8String (const std::vector<Codepoint> codepoints) : chars ({codepoints}) {} + + std::string as_string () const + { + std::stringstream ss; + for (Codepoint c : chars) + ss << c.as_string (); -// TODO: add function nfc_normalize + return ss.str (); + }; + + // Returns characters + std::vector<Codepoint> get_chars () const { return chars; } + + Utf8String nfc_normalize () const; +}; bool is_alphabetic (uint32_t codepoint); diff --git a/gcc/testsuite/rust/compile/unicode_norm1.rs b/gcc/testsuite/rust/compile/unicode_norm1.rs new file mode 100644 index 0000000..d496054 --- /dev/null +++ b/gcc/testsuite/rust/compile/unicode_norm1.rs @@ -0,0 +1,6 @@ +fn main() { + // U+304C + let が = (); + // U+304B + U+3099 + let _ = が; +} |