aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/rust/rust-lang.cc1
-rw-r--r--gcc/rust/rust-system.h1
-rw-r--r--gcc/rust/util/make-rust-unicode.py39
-rw-r--r--gcc/rust/util/rust-codepoint.h1
-rw-r--r--gcc/rust/util/rust-unicode-data.h158
-rw-r--r--gcc/rust/util/rust-unicode.cc143
-rw-r--r--gcc/rust/util/rust-unicode.h19
7 files changed, 280 insertions, 82 deletions
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc
index 8b76ba2..4c2ef10 100644
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -438,6 +438,7 @@ run_rust_tests ()
{
// Call tests for the rust frontend here
rust_input_source_test ();
+ rust_nfc_qc_test ();
rust_utf8_normalize_test ();
rust_punycode_encode_test ();
rust_cfg_parser_test ();
diff --git a/gcc/rust/rust-system.h b/gcc/rust/rust-system.h
index 88d6c10..d8a4218 100644
--- a/gcc/rust/rust-system.h
+++ b/gcc/rust/rust-system.h
@@ -44,6 +44,7 @@
#include <utility>
#include <fstream>
#include <array>
+#include <algorithm>
// Rust frontend requires C++11 minimum, so will have unordered_map and set
#include <unordered_map>
diff --git a/gcc/rust/util/make-rust-unicode.py b/gcc/rust/util/make-rust-unicode.py
index 5303440..f6f04eb 100644
--- a/gcc/rust/util/make-rust-unicode.py
+++ b/gcc/rust/util/make-rust-unicode.py
@@ -250,6 +250,30 @@ def write_numeric() -> None:
print("}};")
+def write_nfc_qc():
+ print(
+ "const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_NO_RANGES = {{{{".format(
+ len(nfc_qc_no_ranges)
+ )
+ )
+ print(" // clang-format off")
+ for r in nfc_qc_no_ranges:
+ print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
+ print(" // clang-format on")
+ print("}};")
+
+ print(
+ "const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_MAYBE_RANGES = {{{{".format(
+ len(nfc_qc_maybe_ranges)
+ )
+ )
+ print(" // clang-format off")
+ for r in nfc_qc_maybe_ranges:
+ print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
+ print(" // clang-format on")
+ print("}};")
+
+
def main() -> None:
if len(sys.argv) != 4:
print("too few arguments", file=sys.stderr)
@@ -265,13 +289,12 @@ def main() -> None:
print(COPYRIGHT)
print()
- print('#include "rust-system.h"')
- print()
- print("namespace Rust {")
- print()
+ print('#include "rust-system.h"\n')
+ print("namespace Rust {\n")
print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
- print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
- print()
+ print(
+ "const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints))
+ )
write_decomposition()
print()
@@ -283,8 +306,8 @@ def main() -> None:
print()
write_numeric()
print()
-
- # TODO: write NFC_QC table
+ write_nfc_qc()
+ print()
print("} // namespace Rust")
diff --git a/gcc/rust/util/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h
index a75e99e..2c5fece 100644
--- a/gcc/rust/util/rust-codepoint.h
+++ b/gcc/rust/util/rust-codepoint.h
@@ -40,6 +40,7 @@ struct Codepoint
static Codepoint eof () { return Codepoint (UINT32_MAX); }
bool is_eof () const { return value == UINT32_MAX; }
bool is_ascii () const { return value <= MAX_ASCII_CODEPOINT; }
+ bool is_supplementary_character () const { return value > 0xFFFF; }
// Returns a C++ string containing string value of codepoint.
std::string as_string ();
diff --git a/gcc/rust/util/rust-unicode-data.h b/gcc/rust/util/rust-unicode-data.h
index c42460f..e07752a 100644
--- a/gcc/rust/util/rust-unicode-data.h
+++ b/gcc/rust/util/rust-unicode-data.h
@@ -20,7 +20,7 @@
namespace Rust {
-const uint32_t NUM_ALPHABETIC_RANGES = 1117;
+const uint32_t NUM_ALPHABETIC_RANGES = 1141;
const uint32_t NUM_NUMERIC_CODEPOINTS = 1831;
const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {
@@ -4167,6 +4167,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x0bd7, 0x0bd8},
{0x0c00, 0x0c01},
{0x0c01, 0x0c04},
+ {0x0c04, 0x0c05},
{0x0c05, 0x0c0d},
{0x0c0e, 0x0c11},
{0x0c12, 0x0c29},
@@ -4202,6 +4203,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x0ce0, 0x0ce2},
{0x0ce2, 0x0ce4},
{0x0cf1, 0x0cf3},
+ {0x0cf3, 0x0cf4},
{0x0d00, 0x0d02},
{0x0d02, 0x0d04},
{0x0d04, 0x0d0d},
@@ -4257,7 +4259,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x0f49, 0x0f6d},
{0x0f71, 0x0f7f},
{0x0f7f, 0x0f80},
- {0x0f80, 0x0f82},
+ {0x0f80, 0x0f84},
{0x0f88, 0x0f8d},
{0x0f8d, 0x0f98},
{0x0f99, 0x0fbd},
@@ -4758,6 +4760,7 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x11071, 0x11073},
{0x11073, 0x11075},
{0x11075, 0x11076},
+ {0x11080, 0x11082},
{0x11082, 0x11083},
{0x11083, 0x110b0},
{0x110b0, 0x110b3},
@@ -4794,6 +4797,8 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x11234, 0x11235},
{0x11237, 0x11238},
{0x1123e, 0x1123f},
+ {0x1123f, 0x11241},
+ {0x11241, 0x11242},
{0x11280, 0x11287},
{0x11288, 0x11289},
{0x1128a, 0x1128e},
@@ -4948,12 +4953,22 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x11ee0, 0x11ef3},
{0x11ef3, 0x11ef5},
{0x11ef5, 0x11ef7},
+ {0x11f00, 0x11f02},
+ {0x11f02, 0x11f03},
+ {0x11f03, 0x11f04},
+ {0x11f04, 0x11f11},
+ {0x11f12, 0x11f34},
+ {0x11f34, 0x11f36},
+ {0x11f36, 0x11f3b},
+ {0x11f3e, 0x11f40},
+ {0x11f40, 0x11f41},
{0x11fb0, 0x11fb1},
{0x12000, 0x1239a},
{0x12400, 0x1246f},
{0x12480, 0x12544},
{0x12f90, 0x12ff1},
- {0x13000, 0x1342f},
+ {0x13000, 0x13430},
+ {0x13441, 0x13447},
{0x14400, 0x14647},
{0x16800, 0x16a39},
{0x16a40, 0x16a5f},
@@ -4980,7 +4995,9 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x1aff5, 0x1affc},
{0x1affd, 0x1afff},
{0x1b000, 0x1b123},
+ {0x1b132, 0x1b133},
{0x1b150, 0x1b153},
+ {0x1b155, 0x1b156},
{0x1b164, 0x1b168},
{0x1b170, 0x1b2fc},
{0x1bc00, 0x1bc6b},
@@ -5021,16 +5038,21 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x1df00, 0x1df0a},
{0x1df0a, 0x1df0b},
{0x1df0b, 0x1df1f},
+ {0x1df25, 0x1df2b},
{0x1e000, 0x1e007},
{0x1e008, 0x1e019},
{0x1e01b, 0x1e022},
{0x1e023, 0x1e025},
{0x1e026, 0x1e02b},
+ {0x1e030, 0x1e06e},
+ {0x1e08f, 0x1e090},
{0x1e100, 0x1e12d},
{0x1e137, 0x1e13e},
{0x1e14e, 0x1e14f},
{0x1e290, 0x1e2ae},
{0x1e2c0, 0x1e2ec},
+ {0x1e4d0, 0x1e4eb},
+ {0x1e4eb, 0x1e4ec},
{0x1e7e0, 0x1e7e7},
{0x1e7e8, 0x1e7ec},
{0x1e7ed, 0x1e7ef},
@@ -5076,12 +5098,14 @@ const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
{0x1f150, 0x1f16a},
{0x1f170, 0x1f18a},
{0x20000, 0x2a6e0},
- {0x2a700, 0x2b739},
+ {0x2a700, 0x2b73a},
{0x2b740, 0x2b81e},
{0x2b820, 0x2cea2},
{0x2ceb0, 0x2ebe1},
+ {0x2ebf0, 0x2ee5e},
{0x2f800, 0x2fa1e},
{0x30000, 0x3134b},
+ {0x31350, 0x323b0},
// clang-format on
}};
@@ -5205,4 +5229,130 @@ const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{
// clang-format on
}};
+const std::array<std::pair<uint32_t, uint32_t>, 74> NFC_QC_NO_RANGES = {{
+ // clang-format off
+ {0x0340, 0x0342},
+ {0x0343, 0x0345},
+ {0x0374, 0x0375},
+ {0x037e, 0x037f},
+ {0x0387, 0x0388},
+ {0x0958, 0x0960},
+ {0x09dc, 0x09de},
+ {0x09df, 0x09e0},
+ {0x0a33, 0x0a34},
+ {0x0a36, 0x0a37},
+ {0x0a59, 0x0a5c},
+ {0x0a5e, 0x0a5f},
+ {0x0b5c, 0x0b5e},
+ {0x0f43, 0x0f44},
+ {0x0f4d, 0x0f4e},
+ {0x0f52, 0x0f53},
+ {0x0f57, 0x0f58},
+ {0x0f5c, 0x0f5d},
+ {0x0f69, 0x0f6a},
+ {0x0f73, 0x0f74},
+ {0x0f75, 0x0f77},
+ {0x0f78, 0x0f79},
+ {0x0f81, 0x0f82},
+ {0x0f93, 0x0f94},
+ {0x0f9d, 0x0f9e},
+ {0x0fa2, 0x0fa3},
+ {0x0fa7, 0x0fa8},
+ {0x0fac, 0x0fad},
+ {0x0fb9, 0x0fba},
+ {0x1f71, 0x1f72},
+ {0x1f73, 0x1f74},
+ {0x1f75, 0x1f76},
+ {0x1f77, 0x1f78},
+ {0x1f79, 0x1f7a},
+ {0x1f7b, 0x1f7c},
+ {0x1f7d, 0x1f7e},
+ {0x1fbb, 0x1fbc},
+ {0x1fbe, 0x1fbf},
+ {0x1fc9, 0x1fca},
+ {0x1fcb, 0x1fcc},
+ {0x1fd3, 0x1fd4},
+ {0x1fdb, 0x1fdc},
+ {0x1fe3, 0x1fe4},
+ {0x1feb, 0x1fec},
+ {0x1fee, 0x1ff0},
+ {0x1ff9, 0x1ffa},
+ {0x1ffb, 0x1ffc},
+ {0x1ffd, 0x1ffe},
+ {0x2000, 0x2002},
+ {0x2126, 0x2127},
+ {0x212a, 0x212c},
+ {0x2329, 0x232a},
+ {0x232a, 0x232b},
+ {0x2adc, 0x2add},
+ {0xf900, 0xfa0e},
+ {0xfa10, 0xfa11},
+ {0xfa12, 0xfa13},
+ {0xfa15, 0xfa1f},
+ {0xfa20, 0xfa21},
+ {0xfa22, 0xfa23},
+ {0xfa25, 0xfa27},
+ {0xfa2a, 0xfa6e},
+ {0xfa70, 0xfada},
+ {0xfb1d, 0xfb1e},
+ {0xfb1f, 0xfb20},
+ {0xfb2a, 0xfb37},
+ {0xfb38, 0xfb3d},
+ {0xfb3e, 0xfb3f},
+ {0xfb40, 0xfb42},
+ {0xfb43, 0xfb45},
+ {0xfb46, 0xfb4f},
+ {0x1d15e, 0x1d165},
+ {0x1d1bb, 0x1d1c1},
+ {0x2f800, 0x2fa1e},
+ // clang-format on
+}};
+const std::array<std::pair<uint32_t, uint32_t>, 43> NFC_QC_MAYBE_RANGES = {{
+ // clang-format off
+ {0x0300, 0x0305},
+ {0x0306, 0x030d},
+ {0x030f, 0x0310},
+ {0x0311, 0x0312},
+ {0x0313, 0x0315},
+ {0x031b, 0x031c},
+ {0x0323, 0x0329},
+ {0x032d, 0x032f},
+ {0x0330, 0x0332},
+ {0x0338, 0x0339},
+ {0x0342, 0x0343},
+ {0x0345, 0x0346},
+ {0x0653, 0x0656},
+ {0x093c, 0x093d},
+ {0x09be, 0x09bf},
+ {0x09d7, 0x09d8},
+ {0x0b3e, 0x0b3f},
+ {0x0b56, 0x0b57},
+ {0x0b57, 0x0b58},
+ {0x0bbe, 0x0bbf},
+ {0x0bd7, 0x0bd8},
+ {0x0c56, 0x0c57},
+ {0x0cc2, 0x0cc3},
+ {0x0cd5, 0x0cd7},
+ {0x0d3e, 0x0d3f},
+ {0x0d57, 0x0d58},
+ {0x0dca, 0x0dcb},
+ {0x0dcf, 0x0dd0},
+ {0x0ddf, 0x0de0},
+ {0x102e, 0x102f},
+ {0x1161, 0x1176},
+ {0x11a8, 0x11c3},
+ {0x1b35, 0x1b36},
+ {0x3099, 0x309b},
+ {0x110ba, 0x110bb},
+ {0x11127, 0x11128},
+ {0x1133e, 0x1133f},
+ {0x11357, 0x11358},
+ {0x114b0, 0x114b1},
+ {0x114ba, 0x114bb},
+ {0x114bd, 0x114be},
+ {0x115af, 0x115b0},
+ {0x11930, 0x11931},
+ // clang-format on
+}};
+
} // namespace Rust
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
index 999ecb0..6bd2db5 100644
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -39,75 +39,27 @@ const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
const uint32_t N_COUNT = V_COUNT * T_COUNT;
const uint32_t S_COUNT = L_COUNT * N_COUNT;
+// Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
template <std::size_t SIZE>
-int64_t
+bool
binary_search_ranges (
- // FIXME: use binray search function from <algorithm>
const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
uint32_t target_cp)
{
- if (SIZE == 0)
- return -1;
-
- uint32_t low, high, mid;
- uint32_t start, end;
- low = 0;
- high = SIZE - 1;
- mid = (low + high) / 2;
- while (high - low > 1)
- {
- start = ranges[mid].first;
- end = ranges[mid].second;
- if (start <= target_cp && target_cp < end)
- {
- return mid;
- }
- else if (end <= target_cp)
- low = mid + 1;
- else
- high = mid - 1;
- mid = (low + high) / 2;
- }
-
- if (ranges[mid].first <= target_cp && target_cp < ranges[mid].second)
- return mid;
- else
- return -1;
-}
-
-template <std::size_t SIZE>
-int64_t
-binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
- uint32_t target)
-{
- // FIXME: use binray search function from <algorithm>
- if (SIZE == 0)
- return -1;
-
- uint32_t low, high, mid;
- low = 0;
- high = SIZE;
- mid = (low + high) / 2;
- while (high - low > 1)
- {
- if (array[mid] <= target)
- low = mid;
- else
- high = mid;
- mid = (low + high) / 2;
- }
-
- if (array[mid] == target)
- return mid;
+ auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
+ [] (const std::pair<uint32_t, uint32_t> &a,
+ uint32_t b) { return a.second <= b; });
+ if (it == ranges.end ())
+ return false;
else
- return -1;
+ return it->first <= target_cp && target_cp < it->second;
}
int
lookup_cc (codepoint_t c)
{
- auto it = Rust::CCC_TABLE.find (c.value);
- if (it != Rust::CCC_TABLE.end ())
+ auto it = CCC_TABLE.find (c.value);
+ if (it != CCC_TABLE.end ())
return it->second;
else
// Starter. Returns zero.
@@ -289,10 +241,44 @@ recomp (string_t s)
return buf;
}
+// see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
+QuickCheckResult
+nfc_quick_check (const string_t &s)
+{
+ int last_canonical_class = 0;
+ QuickCheckResult res = QuickCheckResult::YES;
+
+ for (unsigned long i = 0; i < s.size (); i++)
+ {
+ codepoint_t c = s[i];
+
+ if (c.is_supplementary_character ())
+ i++;
+
+ int canonical_class = lookup_cc (c);
+ if (last_canonical_class > canonical_class && canonical_class != 0)
+ return QuickCheckResult::NO;
+
+ if (is_nfc_qc_no (c.value))
+ return QuickCheckResult::NO;
+
+ if (is_nfc_qc_maybe (c.value))
+ res = QuickCheckResult::MAYBE;
+
+ last_canonical_class = canonical_class;
+ }
+ return res;
+}
+
string_t
-nfc_normalize (string_t s)
+nfc_normalize (const string_t &s)
{
- // TODO: Quick Check
+ if (nfc_quick_check (s) == QuickCheckResult::YES)
+ return s;
+
+ // TODO: optimize normalization.
+ // i.e. only normalize a limited area around MAYBE character, instead of
+ // performing complete normlization of the entire string
// decompose
string_t d = decomp_cano (s);
@@ -312,21 +298,26 @@ Utf8String::nfc_normalize () const
bool
is_alphabetic (uint32_t codepoint)
{
- int64_t res = binary_search_ranges (ALPHABETIC_RANGES, codepoint);
- if (res < 0)
- return false;
- else
- return true;
+ return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
}
bool
is_numeric (uint32_t codepoint)
{
- int64_t res = binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint);
- if (res < 0)
- return false;
- else
- return true;
+ return std::binary_search (NUMERIC_CODEPOINTS.begin (),
+ NUMERIC_CODEPOINTS.end (), codepoint);
+}
+
+bool
+is_nfc_qc_maybe (uint32_t codepoint)
+{
+ return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
+}
+
+bool
+is_nfc_qc_no (uint32_t codepoint)
+{
+ return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
}
bool
@@ -345,6 +336,18 @@ is_ascii_only (const std::string &str)
namespace selftest {
void
+rust_nfc_qc_test ()
+{
+ ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
+ Rust::QuickCheckResult::YES);
+ ASSERT_EQ (Rust::nfc_quick_check (
+ {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
+ Rust::QuickCheckResult::MAYBE);
+ ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
+ Rust::QuickCheckResult::NO);
+}
+
+void
assert_normalize (const std::vector<Rust::Codepoint> origin,
const std::vector<Rust::Codepoint> expected)
{
diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h
index 2538436..aa7bd8a 100644
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@@ -68,6 +68,22 @@ is_ascii_only (const std::string &str);
bool
is_numeric (uint32_t codepoint);
+bool
+is_nfc_qc_no (uint32_t codepoint);
+
+bool
+is_nfc_qc_maybe (uint32_t codepoint);
+
+enum class QuickCheckResult
+{
+ YES,
+ NO,
+ MAYBE
+};
+
+QuickCheckResult
+nfc_quick_check (const std::vector<Codepoint> &s);
+
} // namespace Rust
#if CHECKING_P
@@ -75,6 +91,9 @@ is_numeric (uint32_t codepoint);
namespace selftest {
void
+rust_nfc_qc_test ();
+
+void
rust_utf8_normalize_test ();
void