gccrs: Normalize all identifier tokens

gcc/rust/ChangeLog:

	* lex/rust-lex.cc (assert_source_content): Fix namespace specifier.
	(test_buffer_input_source): Likewise.
	(test_file_input_source): Likewise.
	* lex/rust-lex.h: Move InputSource ...
	* lex/rust-input-source.h: ... to here. (New file)
	* lex/rust-token.cc (nfc_normalize_token_string): New function.
	* lex/rust-token.h (nfc_normalize_token_string): New function.
	* rust-lang.cc (run_rust_tests): Modify order of selftests.
	* rust-session-manager.cc (validate_crate_name): Modify interface of Utf8String.
	* util/rust-unicode.cc (lookup_cc): Modify codepoint_t typedef.
	(lookup_recomp): Likewise.
	(recursive_decomp_cano): Likewise.
	(decomp_cano): Likewise.
	(sort_cano): Likewise.
	(compose_hangul): Likewise.
	(assert_normalize): Likewise.
	(Utf8String::nfc_normalize): New function.
	* util/rust-unicode.h: Modify interface of Utf8String.

gcc/testsuite/ChangeLog:

	* rust/compile/unicode_norm1.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
author: Raiki Tamura <tamaron1203@gmail.com> 2023-07-30 19:54:36 +0900
committer: P-E-P <32375388+P-E-P@users.noreply.github.com> 2023-08-09 09:23:03 +0000
commit: a4b7e7375facb701585b2989ef3490528ec5bc9f (patch)
tree: 188247f431c075ef10ec6e67277375849c3039a8 /gcc/rust/util/rust-unicode.h
parent: 74b83511e4ea68cc491d1f174a763bd87b449796 (diff)
download: gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.zip
gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.tar.gz
gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.tar.bz2
1 file changed, 25 insertions, 10 deletions
diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h
index 6800558..becf6fb 100644
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@@ -21,28 +21,43 @@
 
 #include "optional.h"
 #include "rust-system.h"
-#include "rust-lex.h"
+#include "rust-input-source.h"
 
 namespace Rust {
 
 class Utf8String
 {
 private:
-  tl::optional<std::vector<Codepoint>> chars;
+  std::vector<Codepoint> chars;
 
 public:
-  Utf8String (const std::string &maybe_utf8)
+  static tl::optional<Utf8String>
+  make_utf8_string (const std::string &maybe_utf8)
   {
-    Lexer::BufferInputSource input_source = {maybe_utf8, 0};
-    chars = input_source.get_chars ();
+    BufferInputSource input_source = {maybe_utf8, 0};
+    tl::optional<std::vector<Codepoint>> chars_opt = input_source.get_chars ();
+    if (chars_opt.has_value ())
+      return {Utf8String (chars_opt.value ())};
+    else
+      return tl::nullopt;
   }
 
-  // Returns UTF codepoints when string is valid as UTF-8, returns nullopt
-  // otherwise.
-  tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
-};
+  Utf8String (const std::vector<Codepoint> codepoints) : chars ({codepoints}) {}
+
+  std::string as_string () const
+  {
+    std::stringstream ss;
+    for (Codepoint c : chars)
+      ss << c.as_string ();
 
-// TODO: add function nfc_normalize
+    return ss.str ();
+  };
+
+  // Returns characters
+  std::vector<Codepoint> get_chars () const { return chars; }
+
+  Utf8String nfc_normalize () const;
+};
 
 bool
 is_alphabetic (uint32_t codepoint);
author	Raiki Tamura <tamaron1203@gmail.com>	2023-07-30 19:54:36 +0900
committer	P-E-P <32375388+P-E-P@users.noreply.github.com>	2023-08-09 09:23:03 +0000
commit	a4b7e7375facb701585b2989ef3490528ec5bc9f (patch)
tree	188247f431c075ef10ec6e67277375849c3039a8 /gcc/rust/util/rust-unicode.h
parent	74b83511e4ea68cc491d1f174a763bd87b449796 (diff)
download	gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.zip gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.tar.gz gcc-a4b7e7375facb701585b2989ef3490528ec5bc9f.tar.bz2