// Copyright (C) 2020-2025 Free Software Foundation, Inc. // This file is part of GCC. // GCC is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free // Software Foundation; either version 3, or (at your option) any later // version. // GCC is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // You should have received a copy of the GNU General Public License // along with GCC; see the file COPYING3. If not see // . #include "rust-input-source.h" #include "rust-system.h" #include "optional.h" #include "selftest.h" #include "rust-lex.h" #include "rust-unicode.h" #include "rust-unicode-data.h" namespace Rust { typedef Codepoint codepoint_t; typedef std::vector string_t; // These constants are used to compose and decompose of Hangul syllables. // See `Sample Code for Hangul Algorithms` in 3.1.2 // unicode.org/versions/Unicode15.0.0/ch03.pdf const uint32_t S_BASE = 0xAC00; const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7; const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28; const uint32_t N_COUNT = V_COUNT * T_COUNT; const uint32_t S_COUNT = L_COUNT * N_COUNT; // Check if the codepoint is in any of the ranges (half-open intervals [a,b)). template bool binary_search_ranges ( const std::array, SIZE> &ranges, uint32_t target_cp) { auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp, [] (const std::pair &a, uint32_t b) { return a.second <= b; }); if (it == ranges.end ()) return false; else return it->first <= target_cp && target_cp < it->second; } int lookup_cc (codepoint_t c) { auto it = CCC_TABLE.find (c.value); if (it != CCC_TABLE.end ()) return it->second; else // Starter. Returns zero. return 0; } tl::optional lookup_recomp (codepoint_t starter, codepoint_t c) { auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value}); if (it != Rust::RECOMPOSITION_MAP.end ()) return {it->second}; it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0}); if (it != Rust::RECOMPOSITION_MAP.end ()) return {it->second}; return tl::nullopt; } void recursive_decomp_cano (codepoint_t c, string_t &buf) { auto it = Rust::DECOMPOSITION_MAP.find (c.value); if (it != Rust::DECOMPOSITION_MAP.end ()) { std::vector decomped = it->second; for (uint32_t cp : decomped) recursive_decomp_cano (cp, buf); } else buf.push_back (c); } string_t decomp_cano (string_t s) { string_t buf; for (codepoint_t c : s) { int64_t s_index = c.value - S_BASE; if (0 <= s_index && s_index < S_COUNT) { // decompose Hangul argorithmically uint32_t l = L_BASE + s_index / N_COUNT; uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT; uint32_t t = T_BASE + s_index % T_COUNT; buf.push_back (l); buf.push_back (v); if (t != T_BASE) buf.push_back (t); continue; } // Current character is not hangul recursive_decomp_cano (c, buf); } return buf; } void sort_cano (string_t &s) { int cc_here, cc_prev; if (s.size () == 1) return; for (unsigned int i = 1; i < s.size (); i++) { cc_here = lookup_cc (s[i]); cc_prev = lookup_cc (s[i - 1]); if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here) { // swap codepoint_t tmp = s[i]; s[i] = s[i - 1]; s[i - 1] = tmp; if (i > 1) i -= 2; } } } string_t compose_hangul (string_t s) { string_t buf; if (s.size () < 2) return s; codepoint_t last = s[0]; buf.push_back (last); for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++) { codepoint_t ch = s[src_pos]; // L V => LV int64_t l_index = last.value - L_BASE; if (0 <= l_index && l_index < L_COUNT) { int64_t v_index = ch.value - V_BASE; if (0 <= v_index && v_index < V_COUNT) { last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT; // pop L buf.pop_back (); buf.push_back (last); continue; } } // LV T => LVT int64_t s_index = last.value - S_BASE; if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0) { int64_t t_index = ch.value - T_BASE; if (0 < t_index && t_index < T_COUNT) { last.value += t_index; // pop LV buf.pop_back (); buf.push_back (last); continue; } } last = ch; buf.push_back (last); } return buf; } string_t recomp (string_t s) { // compose hangul first s = compose_hangul (s); string_t buf; if (s.size () < 2) return s; int last_class = -1; // int starter_pos = 0; // Assume the first character is Starter. Correct? // int target_pos = 1; codepoint_t starter_ch = s[0]; for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++) { // get current character codepoint_t ch = s[src_pos]; int ch_class = lookup_cc (ch); tl::optional composite = lookup_recomp (starter_ch, ch); if (composite.has_value () && last_class < ch_class) { // ch can be composed buf.push_back (composite.value ()); starter_ch = composite.value (); } else if (ch_class == 0) { // ch is Starter and cannot be composed. if (src_pos == 1) // FIXME: buggy? buf.push_back (starter_ch); starter_ch = ch; last_class = -1; buf.push_back (ch); } else { if (src_pos == 1) // FIXME: buggy? buf.push_back (starter_ch); // ch is not Starter. last_class = ch_class; buf.push_back (ch); } } return buf; } // see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms QuickCheckResult nfc_quick_check (const string_t &s) { int last_canonical_class = 0; QuickCheckResult res = QuickCheckResult::YES; for (unsigned long i = 0; i < s.size (); i++) { codepoint_t c = s[i]; if (c.is_supplementary_character ()) i++; int canonical_class = lookup_cc (c); if (last_canonical_class > canonical_class && canonical_class != 0) return QuickCheckResult::NO; if (is_nfc_qc_no (c.value)) return QuickCheckResult::NO; if (is_nfc_qc_maybe (c.value)) res = QuickCheckResult::MAYBE; last_canonical_class = canonical_class; } return res; } string_t nfc_normalize (const string_t &s) { if (nfc_quick_check (s) == QuickCheckResult::YES) return s; // TODO: optimize normalization. // i.e. only normalize a limited area around MAYBE character, instead of // performing complete normlization of the entire string // decompose string_t d = decomp_cano (s); sort_cano (d); // recompose string_t r = recomp (d); return r; } Utf8String Utf8String::nfc_normalize () const { return Utf8String (Rust::nfc_normalize (chars)); } bool is_alphabetic (uint32_t codepoint) { return binary_search_ranges (ALPHABETIC_RANGES, codepoint); } bool is_numeric (uint32_t codepoint) { return std::binary_search (NUMERIC_CODEPOINTS.begin (), NUMERIC_CODEPOINTS.end (), codepoint); } bool is_nfc_qc_maybe (uint32_t codepoint) { return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint); } bool is_nfc_qc_no (uint32_t codepoint) { return binary_search_ranges (NFC_QC_NO_RANGES, codepoint); } bool is_ascii_only (const std::string &str) { for (char c : str) if (static_cast (c) > MAX_ASCII_CODEPOINT) return false; return true; } } // namespace Rust #if CHECKING_P namespace selftest { void rust_nfc_qc_test () { ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}), Rust::QuickCheckResult::YES); ASSERT_EQ (Rust::nfc_quick_check ( {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}), Rust::QuickCheckResult::MAYBE); ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}), Rust::QuickCheckResult::NO); } void assert_normalize (const std::vector origin, const std::vector expected) { std::vector actual = Rust::nfc_normalize (origin); ASSERT_EQ (actual.size (), expected.size ()); for (unsigned int i = 0; i < actual.size (); i++) { ASSERT_EQ (actual[i], expected[i]); } } void rust_utf8_normalize_test () { // ASCII assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'}); // ASCII assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'}); // testcases retrieved from Part0 of // https://unicode.org/Public/UNIDATA/NormalizationTest.txt assert_normalize ({0x1e0a}, {0x1e0a}); assert_normalize ({0x1e0c}, {0x1e0c}); assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307}); assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307}); assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307}); // testcases for Hangul from Part0 assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01}); assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8}); // testcases for Hangul from Part1 assert_normalize ({0x3131}, {0x3131}); assert_normalize ({0x3132}, {0x3132}); // testcases for Hangul from Part3 assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161}); assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae}); // TODO: add more testcases in // https://unicode.org/Public/UNIDATA/NormalizationTest.txt } void rust_utf8_property_test () { ASSERT_TRUE (Rust::is_alphabetic ('A')); ASSERT_TRUE (Rust::is_alphabetic ('B')); ASSERT_TRUE (Rust::is_alphabetic ('x')); ASSERT_TRUE (Rust::is_alphabetic ('z')); ASSERT_TRUE (Rust::is_alphabetic (0x00b5)); // µ ASSERT_TRUE (Rust::is_alphabetic (0x3093)); // ん ASSERT_TRUE (Rust::is_alphabetic (0xa8f2)); // ꣲ ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃 ASSERT_FALSE (Rust::is_alphabetic ('\v')); ASSERT_FALSE (Rust::is_alphabetic ('-')); ASSERT_FALSE (Rust::is_alphabetic ('_')); ASSERT_FALSE (Rust::is_alphabetic ('+')); ASSERT_FALSE (Rust::is_alphabetic ('0')); ASSERT_FALSE (Rust::is_alphabetic ('1')); ASSERT_FALSE (Rust::is_alphabetic ('2')); ASSERT_FALSE (Rust::is_alphabetic ('9')); ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌ ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁ // `Nd`s ASSERT_TRUE (Rust::is_numeric ('0')); ASSERT_TRUE (Rust::is_numeric ('1')); ASSERT_TRUE (Rust::is_numeric ('7')); ASSERT_TRUE (Rust::is_numeric ('9')); ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂ ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७ // `Nl`s ASSERT_TRUE (Rust::is_numeric (0x16e6)); // ᛮ ASSERT_TRUE (Rust::is_numeric (0xa6e6)); // ꛦ ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀 ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺 // `No`s ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ² ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱ ASSERT_FALSE (Rust::is_numeric ('\n')); ASSERT_FALSE (Rust::is_numeric ('-')); ASSERT_FALSE (Rust::is_numeric ('_')); ASSERT_FALSE (Rust::is_numeric ('(')); ASSERT_FALSE (Rust::is_numeric ('z')); ASSERT_FALSE (Rust::is_numeric (';')); ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰ } } // namespace selftest #endif // CHECKING_P