// Copyright (C) 2020-2025 Free Software Foundation, Inc.
// This file is part of GCC.
// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// .
#include "rust-input-source.h"
#include "rust-system.h"
#include "optional.h"
#include "selftest.h"
#include "rust-lex.h"
#include "rust-unicode.h"
#include "rust-unicode-data.h"
namespace Rust {
typedef Codepoint codepoint_t;
typedef std::vector string_t;
// These constants are used to compose and decompose of Hangul syllables.
// See `Sample Code for Hangul Algorithms` in 3.1.2
// unicode.org/versions/Unicode15.0.0/ch03.pdf
const uint32_t S_BASE = 0xAC00;
const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
const uint32_t N_COUNT = V_COUNT * T_COUNT;
const uint32_t S_COUNT = L_COUNT * N_COUNT;
// Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
template
bool
binary_search_ranges (
const std::array, SIZE> &ranges,
uint32_t target_cp)
{
auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
[] (const std::pair &a,
uint32_t b) { return a.second <= b; });
if (it == ranges.end ())
return false;
else
return it->first <= target_cp && target_cp < it->second;
}
int
lookup_cc (codepoint_t c)
{
auto it = CCC_TABLE.find (c.value);
if (it != CCC_TABLE.end ())
return it->second;
else
// Starter. Returns zero.
return 0;
}
tl::optional
lookup_recomp (codepoint_t starter, codepoint_t c)
{
auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
if (it != Rust::RECOMPOSITION_MAP.end ())
return {it->second};
it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
if (it != Rust::RECOMPOSITION_MAP.end ())
return {it->second};
return tl::nullopt;
}
void
recursive_decomp_cano (codepoint_t c, string_t &buf)
{
auto it = Rust::DECOMPOSITION_MAP.find (c.value);
if (it != Rust::DECOMPOSITION_MAP.end ())
{
std::vector decomped = it->second;
for (uint32_t cp : decomped)
recursive_decomp_cano (cp, buf);
}
else
buf.push_back (c);
}
string_t
decomp_cano (string_t s)
{
string_t buf;
for (codepoint_t c : s)
{
int64_t s_index = c.value - S_BASE;
if (0 <= s_index && s_index < S_COUNT)
{
// decompose Hangul argorithmically
uint32_t l = L_BASE + s_index / N_COUNT;
uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
uint32_t t = T_BASE + s_index % T_COUNT;
buf.push_back (l);
buf.push_back (v);
if (t != T_BASE)
buf.push_back (t);
continue;
}
// Current character is not hangul
recursive_decomp_cano (c, buf);
}
return buf;
}
void
sort_cano (string_t &s)
{
int cc_here, cc_prev;
if (s.size () == 1)
return;
for (unsigned int i = 1; i < s.size (); i++)
{
cc_here = lookup_cc (s[i]);
cc_prev = lookup_cc (s[i - 1]);
if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
{
// swap
codepoint_t tmp = s[i];
s[i] = s[i - 1];
s[i - 1] = tmp;
if (i > 1)
i -= 2;
}
}
}
string_t
compose_hangul (string_t s)
{
string_t buf;
if (s.size () < 2)
return s;
codepoint_t last = s[0];
buf.push_back (last);
for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
{
codepoint_t ch = s[src_pos];
// L V => LV
int64_t l_index = last.value - L_BASE;
if (0 <= l_index && l_index < L_COUNT)
{
int64_t v_index = ch.value - V_BASE;
if (0 <= v_index && v_index < V_COUNT)
{
last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
// pop L
buf.pop_back ();
buf.push_back (last);
continue;
}
}
// LV T => LVT
int64_t s_index = last.value - S_BASE;
if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
{
int64_t t_index = ch.value - T_BASE;
if (0 < t_index && t_index < T_COUNT)
{
last.value += t_index;
// pop LV
buf.pop_back ();
buf.push_back (last);
continue;
}
}
last = ch;
buf.push_back (last);
}
return buf;
}
string_t
recomp (string_t s)
{
// compose hangul first
s = compose_hangul (s);
string_t buf;
if (s.size () < 2)
return s;
int last_class = -1;
// int starter_pos = 0; // Assume the first character is Starter. Correct?
// int target_pos = 1;
codepoint_t starter_ch = s[0];
for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
{
// get current character
codepoint_t ch = s[src_pos];
int ch_class = lookup_cc (ch);
tl::optional composite = lookup_recomp (starter_ch, ch);
if (composite.has_value () && last_class < ch_class)
{
// ch can be composed
buf.push_back (composite.value ());
starter_ch = composite.value ();
}
else if (ch_class == 0)
{
// ch is Starter and cannot be composed.
if (src_pos == 1)
// FIXME: buggy?
buf.push_back (starter_ch);
starter_ch = ch;
last_class = -1;
buf.push_back (ch);
}
else
{
if (src_pos == 1)
// FIXME: buggy?
buf.push_back (starter_ch);
// ch is not Starter.
last_class = ch_class;
buf.push_back (ch);
}
}
return buf;
}
// see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
QuickCheckResult
nfc_quick_check (const string_t &s)
{
int last_canonical_class = 0;
QuickCheckResult res = QuickCheckResult::YES;
for (unsigned long i = 0; i < s.size (); i++)
{
codepoint_t c = s[i];
if (c.is_supplementary_character ())
i++;
int canonical_class = lookup_cc (c);
if (last_canonical_class > canonical_class && canonical_class != 0)
return QuickCheckResult::NO;
if (is_nfc_qc_no (c.value))
return QuickCheckResult::NO;
if (is_nfc_qc_maybe (c.value))
res = QuickCheckResult::MAYBE;
last_canonical_class = canonical_class;
}
return res;
}
string_t
nfc_normalize (const string_t &s)
{
if (nfc_quick_check (s) == QuickCheckResult::YES)
return s;
// TODO: optimize normalization.
// i.e. only normalize a limited area around MAYBE character, instead of
// performing complete normlization of the entire string
// decompose
string_t d = decomp_cano (s);
sort_cano (d);
// recompose
string_t r = recomp (d);
return r;
}
Utf8String
Utf8String::nfc_normalize () const
{
return Utf8String (Rust::nfc_normalize (chars));
}
bool
is_alphabetic (uint32_t codepoint)
{
return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
}
bool
is_numeric (uint32_t codepoint)
{
return std::binary_search (NUMERIC_CODEPOINTS.begin (),
NUMERIC_CODEPOINTS.end (), codepoint);
}
bool
is_nfc_qc_maybe (uint32_t codepoint)
{
return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
}
bool
is_nfc_qc_no (uint32_t codepoint)
{
return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
}
bool
is_ascii_only (const std::string &str)
{
for (char c : str)
if (static_cast (c) > MAX_ASCII_CODEPOINT)
return false;
return true;
}
} // namespace Rust
#if CHECKING_P
namespace selftest {
void
rust_nfc_qc_test ()
{
ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
Rust::QuickCheckResult::YES);
ASSERT_EQ (Rust::nfc_quick_check (
{0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
Rust::QuickCheckResult::MAYBE);
ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
Rust::QuickCheckResult::NO);
}
void
assert_normalize (const std::vector origin,
const std::vector expected)
{
std::vector actual = Rust::nfc_normalize (origin);
ASSERT_EQ (actual.size (), expected.size ());
for (unsigned int i = 0; i < actual.size (); i++)
{
ASSERT_EQ (actual[i], expected[i]);
}
}
void
rust_utf8_normalize_test ()
{
// ASCII
assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
// ASCII
assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});
// testcases retrieved from Part0 of
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
assert_normalize ({0x1e0a}, {0x1e0a});
assert_normalize ({0x1e0c}, {0x1e0c});
assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
// testcases for Hangul from Part0
assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
// testcases for Hangul from Part1
assert_normalize ({0x3131}, {0x3131});
assert_normalize ({0x3132}, {0x3132});
// testcases for Hangul from Part3
assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
// TODO: add more testcases in
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
}
void
rust_utf8_property_test ()
{
ASSERT_TRUE (Rust::is_alphabetic ('A'));
ASSERT_TRUE (Rust::is_alphabetic ('B'));
ASSERT_TRUE (Rust::is_alphabetic ('x'));
ASSERT_TRUE (Rust::is_alphabetic ('z'));
ASSERT_TRUE (Rust::is_alphabetic (0x00b5)); // µ
ASSERT_TRUE (Rust::is_alphabetic (0x3093)); // ん
ASSERT_TRUE (Rust::is_alphabetic (0xa8f2)); // ꣲ
ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃
ASSERT_FALSE (Rust::is_alphabetic ('\v'));
ASSERT_FALSE (Rust::is_alphabetic ('-'));
ASSERT_FALSE (Rust::is_alphabetic ('_'));
ASSERT_FALSE (Rust::is_alphabetic ('+'));
ASSERT_FALSE (Rust::is_alphabetic ('0'));
ASSERT_FALSE (Rust::is_alphabetic ('1'));
ASSERT_FALSE (Rust::is_alphabetic ('2'));
ASSERT_FALSE (Rust::is_alphabetic ('9'));
ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁
// `Nd`s
ASSERT_TRUE (Rust::is_numeric ('0'));
ASSERT_TRUE (Rust::is_numeric ('1'));
ASSERT_TRUE (Rust::is_numeric ('7'));
ASSERT_TRUE (Rust::is_numeric ('9'));
ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
// `Nl`s
ASSERT_TRUE (Rust::is_numeric (0x16e6)); // ᛮ
ASSERT_TRUE (Rust::is_numeric (0xa6e6)); // ꛦ
ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
// `No`s
ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱
ASSERT_FALSE (Rust::is_numeric ('\n'));
ASSERT_FALSE (Rust::is_numeric ('-'));
ASSERT_FALSE (Rust::is_numeric ('_'));
ASSERT_FALSE (Rust::is_numeric ('('));
ASSERT_FALSE (Rust::is_numeric ('z'));
ASSERT_FALSE (Rust::is_numeric (';'));
ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
}
} // namespace selftest
#endif // CHECKING_P