diff options
author | Raiki Tamura <tamaron1203@gmail.com> | 2023-07-14 14:45:34 +0900 |
---|---|---|
committer | Philip Herron <philip.herron@embecosm.com> | 2023-07-30 17:20:51 +0000 |
commit | a3fc40c6749aa0635c03f84b9682fa5e86be7b3d (patch) | |
tree | 8b07682de0b7ba8782f532248046e7ae709f7354 /gcc/rust | |
parent | 4553f58e19f30db5bbfb26889d4cbf5d02bcbc9f (diff) | |
download | gcc-a3fc40c6749aa0635c03f84b9682fa5e86be7b3d.zip gcc-a3fc40c6749aa0635c03f84b9682fa5e86be7b3d.tar.gz gcc-a3fc40c6749aa0635c03f84b9682fa5e86be7b3d.tar.bz2 |
gccrs: Add Unicode check for crate_name attributes
gcc/rust/ChangeLog:
* lex/rust-codepoint.h: Add comment.
* lex/rust-lex.h: New method to get decoded characters.
* rust-session-manager.cc (validate_crate_name): Modify Unicode check.
(rust_crate_name_validation_test): Add test cases.
* util/rust-unicode.h (RUST_UNICODE_H): New class Utf8String.
(class Utf8String): New class.
* util/rust-unicode.cc (binary_search_sorted_array): Add comment.
(recursive_decomp_cano): Add comment.
(recomp): Remove dead code.
(dump_string): Removed.
gcc/testsuite/ChangeLog:
* rust/compile/bad-crate-name.rs: Moved to...
* rust/compile/bad-crate-name1.rs: ...here.
* rust/compile/bad-crate-name2.rs: New test.
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diffstat (limited to 'gcc/rust')
-rw-r--r-- | gcc/rust/lex/rust-codepoint.h | 2 | ||||
-rw-r--r-- | gcc/rust/lex/rust-lex.h | 8 | ||||
-rw-r--r-- | gcc/rust/rust-session-manager.cc | 34 | ||||
-rw-r--r-- | gcc/rust/util/rust-unicode.cc | 23 | ||||
-rw-r--r-- | gcc/rust/util/rust-unicode.h | 19 |
5 files changed, 57 insertions, 29 deletions
diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h index 7e25106..eaed664 100644 --- a/gcc/rust/lex/rust-codepoint.h +++ b/gcc/rust/lex/rust-codepoint.h @@ -22,6 +22,8 @@ #include "rust-system.h" namespace Rust { + +// FIXME: move this to rust-unicode.h? struct Codepoint { uint32_t value; diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h index bb34291..273b8c7 100644 --- a/gcc/rust/lex/rust-lex.h +++ b/gcc/rust/lex/rust-lex.h @@ -334,6 +334,14 @@ public: return c; } } + + tl::optional<std::vector<Codepoint>> get_chars () + { + if (is_valid ()) + return {chars}; + else + return tl::nullopt; + } }; class FileInputSource : public InputSource diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc index 98c7e66..0d7d98f 100644 --- a/gcc/rust/rust-session-manager.cc +++ b/gcc/rust/rust-session-manager.cc @@ -42,6 +42,7 @@ #include "rust-early-name-resolver.h" #include "rust-cfg-strip.h" #include "rust-expand-visitor.h" +#include "rust-unicode.h" #include "diagnostic.h" #include "input.h" @@ -107,30 +108,39 @@ infer_crate_name (const std::string &filename) return crate; } -/* Validate the crate name using the ASCII rules - TODO: Support Unicode version of the rules */ +/* Validate the crate name using the ASCII rules */ static bool validate_crate_name (const std::string &crate_name, Error &error) { - if (crate_name.empty ()) + Utf8String utf8_name = {crate_name}; + tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars (); + + if (!uchars_opt.has_value ()) + { + error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string"); + return false; + } + + std::vector<Codepoint> uchars = uchars_opt.value (); + if (uchars.empty ()) { error = Error (UNDEF_LOCATION, "crate name cannot be empty"); return false; } - if (crate_name.length () > kMaxNameLength) + if (uchars.size () > kMaxNameLength) { error = Error (UNDEF_LOCATION, "crate name cannot exceed %lu characters", (unsigned long) 
kMaxNameLength); return false; } - for (auto &c : crate_name) + for (Codepoint &c : uchars) { - if (!(ISALNUM (c) || c == '_')) + if (!(is_alphabetic (c.value) || is_numeric (c.value) || c.value == '_')) { error = Error (UNDEF_LOCATION, - "invalid character %<%c%> in crate name: %<%s%>", c, - crate_name.c_str ()); + "invalid character %<%s%> in crate name: %<%s%>", + c.as_string ().c_str (), crate_name.c_str ()); return false; } } @@ -1273,13 +1283,17 @@ rust_crate_name_validation_test (void) ASSERT_TRUE (Rust::validate_crate_name ("example", error)); ASSERT_TRUE (Rust::validate_crate_name ("abcdefg_1234", error)); ASSERT_TRUE (Rust::validate_crate_name ("1", error)); - // FIXME: The next test does not pass as of current implementation - // ASSERT_TRUE (Rust::CompileOptions::validate_crate_name ("惊吓")); + ASSERT_TRUE (Rust::validate_crate_name ("クレート", error)); + ASSERT_TRUE (Rust::validate_crate_name ("Sōkrátēs", error)); + ASSERT_TRUE (Rust::validate_crate_name ("惊吓", error)); + // NOTE: - is not allowed in the crate name ... 
ASSERT_FALSE (Rust::validate_crate_name ("abcdefg-1234", error)); ASSERT_FALSE (Rust::validate_crate_name ("a+b", error)); ASSERT_FALSE (Rust::validate_crate_name ("/a+b/", error)); + ASSERT_FALSE (Rust::validate_crate_name ("😸++", error)); + ASSERT_FALSE (Rust::validate_crate_name ("∀", error)); /* Tests for crate name inference */ ASSERT_EQ (Rust::infer_crate_name ("c.rs"), "c"); diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc index 381b8aa..94364e0 100644 --- a/gcc/rust/util/rust-unicode.cc +++ b/gcc/rust/util/rust-unicode.cc @@ -12,6 +12,7 @@ typedef std::vector<codepoint_t> string_t; template <std::size_t SIZE> int64_t binary_search_ranges ( + // FIXME: use binary search function from <algorithm> const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges, uint32_t target_cp) { @@ -49,6 +50,7 @@ int64_t binary_search_sorted_array (const std::array<std::uint32_t, SIZE> &array, uint32_t target) { + // FIXME: use binary search function from <algorithm> if (SIZE == 0) return -1; @@ -104,9 +106,7 @@ recursive_decomp_cano (codepoint_t c, string_t &buf) { string_t decomped = it->second; for (codepoint_t cp : decomped) - { - recursive_decomp_cano (cp, buf); - } + recursive_decomp_cano (cp, buf); } else buf.push_back (c); @@ -152,8 +152,7 @@ recomp (string_t s) if (s.size () > 0) { int last_class = -1; - // int starter_pos = 0; // Assume the first character is Starter. Correct? - // int target_pos = 1; + // Assume the first character is Starter. 
codepoint_t starter_ch = s[0]; for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++) { @@ -189,20 +188,6 @@ recomp (string_t s) return buf; } -// TODO: remove -/* -void -dump_string (std::vector<uint32_t> s) -{ - std::cout << "dump="; - for (auto c : s) - { - std::cout << std::hex << c << ", "; - } - std::cout << std::endl; -} -*/ - string_t nfc_normalize (string_t s) { diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h index 8c0bd06..6800558 100644 --- a/gcc/rust/util/rust-unicode.h +++ b/gcc/rust/util/rust-unicode.h @@ -19,10 +19,29 @@ #ifndef RUST_UNICODE_H #define RUST_UNICODE_H +#include "optional.h" #include "rust-system.h" +#include "rust-lex.h" namespace Rust { +class Utf8String +{ +private: + tl::optional<std::vector<Codepoint>> chars; + +public: + Utf8String (const std::string &maybe_utf8) + { + Lexer::BufferInputSource input_source = {maybe_utf8, 0}; + chars = input_source.get_chars (); + } + + // Returns UTF codepoints when string is valid as UTF-8, returns nullopt + // otherwise. + tl::optional<std::vector<Codepoint>> get_chars () const { return chars; } +}; + // TODO: add function nfc_normalize bool |