aboutsummaryrefslogtreecommitdiff
path: root/gcc/rust/util/rust-unicode.cc
diff options
context:
space:
mode:
authorRaiki Tamura <tamaron1203@gmail.com>2023-12-02 21:01:59 +0900
committerArthur Cohen <arthur.cohen@embecosm.com>2024-02-07 12:40:21 +0100
commit5eb4e5093aaf6964370b7daa231b926ca592eeb0 (patch)
tree85c7ea8c6d8fcfdaab10b2e3f5c4768c71b79ad3 /gcc/rust/util/rust-unicode.cc
parentf89186f962421f6d972035fc4b4c20490e7b1c5b (diff)
downloadgcc-5eb4e5093aaf6964370b7daa231b926ca592eeb0.zip
gcc-5eb4e5093aaf6964370b7daa231b926ca592eeb0.tar.gz
gcc-5eb4e5093aaf6964370b7daa231b926ca592eeb0.tar.bz2
gccrs: Implement quick-check for Unicode
gcc/rust/ChangeLog: * rust-lang.cc (run_rust_tests): Add test. * rust-system.h: Add <algorithm>. * util/make-rust-unicode.py: Output NFC_Quick_Check table. * util/rust-codepoint.h (struct Codepoint): Add is_supplementary method. * util/rust-unicode-data.h: Generated. * util/rust-unicode.cc (binary_search_sorted_array): Removed. (lookup_cc): Remove namespace. (is_alphabetic): Use std::binary_search (nfc_quick_check): New function. (nfc_normalize): Use nfc_quick_check. (is_nfc_qc_maybe): New function. (is_nfc_qc_no): New function. (rust_nfc_qc_test): New test. * util/rust-unicode.h (is_nfc_qc_no): New function. (is_nfc_qc_maybe): New function. (enum class): New enum class. (nfc_quick_check): New function. (rust_nfc_qc_test): New test. Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diffstat (limited to 'gcc/rust/util/rust-unicode.cc')
-rw-r--r--gcc/rust/util/rust-unicode.cc143
1 files changed, 73 insertions, 70 deletions
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
index 999ecb0..6bd2db5 100644
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -39,75 +39,27 @@ const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
const uint32_t N_COUNT = V_COUNT * T_COUNT;
const uint32_t S_COUNT = L_COUNT * N_COUNT;
+// Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
template <std::size_t SIZE>
-int64_t
+bool
binary_search_ranges (
- // FIXME: use binray search function from <algorithm>
const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
uint32_t target_cp)
{
- if (SIZE == 0)
- return -1;
-
- uint32_t low, high, mid;
- uint32_t start, end;
- low = 0;
- high = SIZE - 1;
- mid = (low + high) / 2;
- while (high - low > 1)
- {
- start = ranges[mid].first;
- end = ranges[mid].second;
- if (start <= target_cp && target_cp < end)
- {
- return mid;
- }
- else if (end <= target_cp)
- low = mid + 1;
- else
- high = mid - 1;
- mid = (low + high) / 2;
- }
-
- if (ranges[mid].first <= target_cp && target_cp < ranges[mid].second)
- return mid;
- else
- return -1;
-}
-
-template <std::size_t SIZE>
-int64_t
-binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
- uint32_t target)
-{
- // FIXME: use binray search function from <algorithm>
- if (SIZE == 0)
- return -1;
-
- uint32_t low, high, mid;
- low = 0;
- high = SIZE;
- mid = (low + high) / 2;
- while (high - low > 1)
- {
- if (array[mid] <= target)
- low = mid;
- else
- high = mid;
- mid = (low + high) / 2;
- }
-
- if (array[mid] == target)
- return mid;
+ auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
+ [] (const std::pair<uint32_t, uint32_t> &a,
+ uint32_t b) { return a.second <= b; });
+ if (it == ranges.end ())
+ return false;
else
- return -1;
+ return it->first <= target_cp && target_cp < it->second;
}
int
lookup_cc (codepoint_t c)
{
- auto it = Rust::CCC_TABLE.find (c.value);
- if (it != Rust::CCC_TABLE.end ())
+ auto it = CCC_TABLE.find (c.value);
+ if (it != CCC_TABLE.end ())
return it->second;
else
// Starter. Returns zero.
@@ -289,10 +241,44 @@ recomp (string_t s)
return buf;
}
+// see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
+QuickCheckResult
+nfc_quick_check (const string_t &s)
+{
+ int last_canonical_class = 0;
+ QuickCheckResult res = QuickCheckResult::YES;
+
+ for (unsigned long i = 0; i < s.size (); i++)
+ {
+ codepoint_t c = s[i];
+
+ if (c.is_supplementary_character ())
+ i++;
+
+ int canonical_class = lookup_cc (c);
+ if (last_canonical_class > canonical_class && canonical_class != 0)
+ return QuickCheckResult::NO;
+
+ if (is_nfc_qc_no (c.value))
+ return QuickCheckResult::NO;
+
+ if (is_nfc_qc_maybe (c.value))
+ res = QuickCheckResult::MAYBE;
+
+ last_canonical_class = canonical_class;
+ }
+ return res;
+}
+
string_t
-nfc_normalize (string_t s)
+nfc_normalize (const string_t &s)
{
- // TODO: Quick Check
+ if (nfc_quick_check (s) == QuickCheckResult::YES)
+ return s;
+
+ // TODO: optimize normalization.
+ // i.e. only normalize a limited area around MAYBE character, instead of
+ // performing complete normlization of the entire string
// decompose
string_t d = decomp_cano (s);
@@ -312,21 +298,26 @@ Utf8String::nfc_normalize () const
bool
is_alphabetic (uint32_t codepoint)
{
- int64_t res = binary_search_ranges (ALPHABETIC_RANGES, codepoint);
- if (res < 0)
- return false;
- else
- return true;
+ return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
}
bool
is_numeric (uint32_t codepoint)
{
- int64_t res = binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint);
- if (res < 0)
- return false;
- else
- return true;
+ return std::binary_search (NUMERIC_CODEPOINTS.begin (),
+ NUMERIC_CODEPOINTS.end (), codepoint);
+}
+
+bool
+is_nfc_qc_maybe (uint32_t codepoint)
+{
+ return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
+}
+
+bool
+is_nfc_qc_no (uint32_t codepoint)
+{
+ return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
}
bool
@@ -345,6 +336,18 @@ is_ascii_only (const std::string &str)
namespace selftest {
void
+rust_nfc_qc_test ()
+{
+ ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
+ Rust::QuickCheckResult::YES);
+ ASSERT_EQ (Rust::nfc_quick_check (
+ {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
+ Rust::QuickCheckResult::MAYBE);
+ ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
+ Rust::QuickCheckResult::NO);
+}
+
+void
assert_normalize (const std::vector<Rust::Codepoint> origin,
const std::vector<Rust::Codepoint> expected)
{