diff options
author | Raiki Tamura <tamaron1203@gmail.com> | 2023-07-30 19:54:36 +0900 |
---|---|---|
committer | P-E-P <32375388+P-E-P@users.noreply.github.com> | 2023-08-09 09:23:03 +0000 |
commit | a4b7e7375facb701585b2989ef3490528ec5bc9f (patch) | |
tree | 188247f431c075ef10ec6e67277375849c3039a8 /gcc/rust/util/rust-unicode.h | |
parent | 74b83511e4ea68cc491d1f174a763bd87b449796 (diff) | |
download | gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.zip gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.tar.gz gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.tar.bz2 |
gccrs: Normalize all identifier tokens
gcc/rust/ChangeLog:
* lex/rust-lex.cc (assert_source_content): Fix namespace specifier.
(test_buffer_input_source): Likewise.
(test_file_input_source): Likewise.
* lex/rust-lex.h: Move InputSource ...
* lex/rust-input-source.h: ... to here. (New file)
* lex/rust-token.cc (nfc_normalize_token_string): New function.
* lex/rust-token.h (nfc_normalize_token_string): New function.
* rust-lang.cc (run_rust_tests): Modify order of selftests.
* rust-session-manager.cc (validate_crate_name): Modify interface of Utf8String.
* util/rust-unicode.cc (lookup_cc): Modify codepoint_t typedef.
(lookup_recomp): Likewise.
(recursive_decomp_cano): Likewise.
(decomp_cano): Likewise.
(sort_cano): Likewise.
(compose_hangul): Likewise.
(assert_normalize): Likewise.
(Utf8String::nfc_normalize): New function.
* util/rust-unicode.h: Modify interface of Utf8String.
gcc/testsuite/ChangeLog:
* rust/compile/unicode_norm1.rs: New test.
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diffstat (limited to 'gcc/rust/util/rust-unicode.h')
-rw-r--r-- | gcc/rust/util/rust-unicode.h | 35 |
1 files changed, 25 insertions, 10 deletions
```diff
diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h
index 6800558..becf6fb 100644
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@@ -21,28 +21,43 @@
 
 #include "optional.h"
 #include "rust-system.h"
-#include "rust-lex.h"
+#include "rust-input-source.h"
 
 namespace Rust {
 
 class Utf8String
 {
 private:
-  tl::optional<std::vector<Codepoint>> chars;
+  std::vector<Codepoint> chars;
 
 public:
-  Utf8String (const std::string &maybe_utf8)
+  static tl::optional<Utf8String>
+  make_utf8_string (const std::string &maybe_utf8)
   {
-    Lexer::BufferInputSource input_source = {maybe_utf8, 0};
-    chars = input_source.get_chars ();
+    BufferInputSource input_source = {maybe_utf8, 0};
+    tl::optional<std::vector<Codepoint>> chars_opt = input_source.get_chars ();
+    if (chars_opt.has_value ())
+      return {Utf8String (chars_opt.value ())};
+    else
+      return tl::nullopt;
   }
 
-  // Returns UTF codepoints when string is valid as UTF-8, returns nullopt
-  // otherwise.
-  tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
-};
+  Utf8String (const std::vector<Codepoint> codepoints) : chars ({codepoints}) {}
+
+  std::string as_string () const
+  {
+    std::stringstream ss;
+    for (Codepoint c : chars)
+      ss << c.as_string ();
 
-// TODO: add function nfc_normalize
+    return ss.str ();
+  };
+
+  // Returns characters
+  std::vector<Codepoint> get_chars () const { return chars; }
+
+  Utf8String nfc_normalize () const;
+};
 
 bool
 is_alphabetic (uint32_t codepoint);
```