aboutsummaryrefslogtreecommitdiff
path: root/gcc/rust/util/make-rust-unicode.py
diff options
context:
space:
mode:
authorRaiki Tamura <tamaron1203@gmail.com>2023-07-14 14:45:34 +0900
committerPhilip Herron <philip.herron@embecosm.com>2023-07-29 16:05:39 +0000
commit7ce263e17a59c44d057bfb1ed6a8ab1c4d837f28 (patch)
tree6d43a655d88f1698b64b19317e223a3c35001391 /gcc/rust/util/make-rust-unicode.py
parent42bd81f8df1a8bf55e6c718f24994566eeaf1b5f (diff)
downloadgcc-7ce263e17a59c44d057bfb1ed6a8ab1c4d837f28.zip
gcc-7ce263e17a59c44d057bfb1ed6a8ab1c4d837f28.tar.gz
gcc-7ce263e17a59c44d057bfb1ed6a8ab1c4d837f28.tar.bz2
Add function for Unicode NFC normalization
gcc/rust/ChangeLog: * Make-lang.in: Add rust-unicode.o * rust-lang.cc (run_rust_tests): Add test. * rust-system.h: Include <array> * util/make-rust-unicode.py: Generator of rust-unicode-data.h. * util/rust-unicode-data.h: Auto-generated file. * util/rust-unicode.cc: New file. * util/rust-unicode.h: New file. Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diffstat (limited to 'gcc/rust/util/make-rust-unicode.py')
-rw-r--r--gcc/rust/util/make-rust-unicode.py289
1 files changed, 289 insertions, 0 deletions
diff --git a/gcc/rust/util/make-rust-unicode.py b/gcc/rust/util/make-rust-unicode.py
new file mode 100644
index 0000000..eaf2fc8
--- /dev/null
+++ b/gcc/rust/util/make-rust-unicode.py
@@ -0,0 +1,289 @@
+# Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+# This file is part of GCC.
+
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>.
+
+# Run this program as
+# python ./make-rust-unicode.py UnicodeData.txt \
+# DerivedNormalizationProps.txt DerivedCoreProperties.txt \
+# > rust-unicode-data.h
+
+import sys
+
# License banner printed at the top of the generated C++ header.
# One list entry per output line; an empty entry is a blank line.  The
# banner deliberately has no trailing newline (print() supplies it).
COPYRIGHT = "\n".join(
    [
        "// Copyright (C) 2020-2023 Free Software Foundation, Inc.",
        "",
        "// This file is part of GCC.",
        "",
        "// GCC is free software; you can redistribute it and/or modify it under",
        "// the terms of the GNU General Public License as published by the Free",
        "// Software Foundation; either version 3, or (at your option) any later",
        "// version.",
        "",
        "// GCC is distributed in the hope that it will be useful, but WITHOUT ANY",
        "// WARRANTY; without even the implied warranty of MERCHANTABILITY or",
        "// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License",
        "// for more details.",
        "",
        "// You should have received a copy of the GNU General Public License",
        "// along with GCC; see the file COPYING3. If not see",
        "// <http://www.gnu.org/licenses/>.",
    ]
)

# Module-level tables populated by the read_* functions and consumed by the
# write_* functions.
#
# A range is stored as a two-element list `[m, n]` and denotes the half-open
# interval [m, n), i.e. `n` itself is NOT part of the range.

# Decomposition_Mapping: codepoint -> list of one or two codepoints
# (canonical mappings only; compatibility mappings are skipped).
decomposition_map = {}
# Canonical_Combining_Class: codepoint -> class, stored only when non-zero.
ccc_table = {}
# Ranges of codepoints with the Full_Composition_Exclusion property.
composition_exclusion_ranges = []
# Ranges of codepoints with the Alphabetic property.
alphabetic_ranges = []
# Ranges of codepoints with NFC_QC=No.
nfc_qc_no_ranges = []
# Ranges of codepoints with NFC_QC=Maybe.
nfc_qc_maybe_ranges = []
# Individual codepoints whose general category is Nd, Nl or No.
numeric_codepoints = []
+
+
def binary_search_ranges(ranges, target):
    """Return the index of the range in `ranges` that contains `target`.

    `ranges` is a sequence of `[start, end]` pairs, each denoting the
    half-open interval [start, end).  The sequence is expected to be sorted
    in ascending order, as required by any binary search.

    Returns -1 when no range contains `target` (including when `ranges` is
    empty).
    """
    lo = 0
    hi = len(ranges)  # exclusive upper bound of the window still to search
    while lo < hi:
        mid = (lo + hi - 1) // 2
        first, past_last = ranges[mid]
        if target < first:
            # Target lies before this range: keep the left part only.
            hi = mid
        elif target > past_last - 1:
            # Target lies at or after the exclusive end: keep the right part.
            lo = mid + 1
        else:
            return mid
    return -1
+
+
# Utility function to parse '<codepoint>..<codepoint>' or '<codepoint>'
def parse_codepoint_range(range_str):
    """Convert a hexadecimal codepoint or codepoint range to `[start, end]`.

    `range_str` is either a single hexadecimal codepoint (`"0041"`) or an
    inclusive range written with two dots (`"0041..005A"`).  No whitespace
    stripping is done here; callers pass an already-trimmed field.

    Returns a two-element list `[start, end]` denoting the half-open
    interval [start, end):
      m     => [m, m+1)
      m..n  => [m, n+1)

    Raises ValueError when the string has more than one `..` separator or
    when a bound is not valid hexadecimal.
    """
    bounds = range_str.split("..")
    # An explicit check rather than `assert`, so that malformed input is
    # still rejected when Python runs with -O.
    if len(bounds) not in (1, 2):
        raise ValueError("Invalid format: {!r}".format(range_str))
    start_cp = int(bounds[0], 16)
    # A single codepoint is the range whose last member is itself.
    last_cp = start_cp if len(bounds) == 1 else int(bounds[1], 16)
    # Convert the inclusive last codepoint to an exclusive end.
    return [start_cp, last_cp + 1]
+
+
def read_unicode_data_txt(filepath):
    """Load general category, CCC and canonical decompositions from a file.

    `filepath` is the path of a UnicodeData.txt-style file: one record per
    line, fields separated by `;`.  Lines that do not have exactly 15
    fields are ignored.

    For every accepted record this updates the module-level tables:
      * numeric_codepoints -- the codepoint is appended when its general
        category (field 2) is Nd, Nl or No;
      * ccc_table          -- codepoint -> Canonical_Combining_Class
        (field 3), recorded only when non-zero;
      * decomposition_map  -- codepoint -> list of codepoints from field 5,
        recorded only for canonical (untagged) non-empty mappings.

    Raises ValueError when a canonical decomposition has more than two
    codepoints or when a numeric field cannot be parsed.
    """

    def process_line(line):
        fields = line.split(";")
        if len(fields) != 15:
            return
        cp = int(fields[0], 16)

        # General category
        if fields[2] in ("Nd", "Nl", "No"):
            numeric_codepoints.append(cp)

        # Canonical_Combining_Class; zero is the default and is not stored.
        ccc = int(fields[3], 10)
        if ccc != 0:
            ccc_table[cp] = ccc

        # Decomposition mapping.  A leading "<tag>" marks a compatibility
        # decomposition, which is not required for **NFC** normalization.
        if fields[5].startswith("<"):
            return
        decomp_cps = [int(s, 16) for s in fields[5].split(" ") if s != ""]
        # Explicit check rather than `assert` so it survives `python -O`.
        if len(decomp_cps) > 2:
            raise ValueError(
                "Decomposition_Mapping must not contain more than 2 characters."
            )
        if decomp_cps:
            decomposition_map[cp] = decomp_cps

    # Read the file named by the argument; the previous code opened
    # sys.argv[1] here and silently ignored `filepath`.
    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())
+
+
def read_derived_norm_props_txt(filepath):
    """Load composition-exclusion and NFC_QC ranges from `filepath`.

    Each line is `<codepoint or range> ; <property> [; <value>]`, with an
    optional `#` comment.  Lines with fewer than two fields after comment
    removal are skipped.

    Updates the module-level lists:
      * composition_exclusion_ranges -- property Full_Composition_Exclusion;
      * nfc_qc_no_ranges             -- property NFC_QC with value N;
      * nfc_qc_maybe_ranges          -- property NFC_QC with value M.
    Lines for any other property are parsed but otherwise ignored.

    Raises RuntimeError when an NFC_QC line has a value other than N or M.
    """

    def process_line(line):
        # Everything from the first '#' on is a comment.
        fields = line.split("#")[0].split(";")
        if len(fields) < 2:
            return
        cp_range = parse_codepoint_range(fields[0].strip())
        prop = fields[1].strip()

        if prop == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
            return
        if prop != "NFC_QC":
            return

        assert len(fields) >= 3, "Too few rows for NFC_QC"
        destination = {
            "N": nfc_qc_no_ranges,
            "M": nfc_qc_maybe_ranges,
        }.get(fields[2].strip())
        if destination is None:
            raise RuntimeError("Value of NFC_QC must be N or M")
        destination.append(cp_range)

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
+
+
def read_derived_core_props_txt(filepath):
    """Load the ranges of codepoints with the Alphabetic property.

    Each line of `filepath` is `<codepoint or range> ; <property>`, with an
    optional `#` comment.  Only lines whose property is exactly
    `Alphabetic` are used; their ranges are appended to the module-level
    `alphabetic_ranges` list in file order.
    """

    def process_line(line):
        # Everything from the first '#' on is a comment.
        fields = line.split("#")[0].split(";")
        # Need at least a codepoint field and a property field.
        if len(fields) < 2:
            return
        if fields[1].strip() == "Alphabetic":
            alphabetic_ranges.append(parse_codepoint_range(fields[0].strip()))

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())
+
+
def write_decomposition():
    """Print the C++ definition of DECOMPOSITION_MAP to stdout.

    Emits one `{codepoint, {decomposed codepoints...}},` entry per key of
    the module-level `decomposition_map`, in ascending codepoint order.
    Every number is written as zero-padded lower-case hexadecimal.
    """
    print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
    print(" // clang-format off")
    for cp in sorted(decomposition_map):
        # Each element keeps its trailing ", ", including the last one.
        elements = "".join(f"{part:#06x}, " for part in decomposition_map[cp])
        print(f" {{{cp:#06x}, {{{elements}}}}},")
    print(" // clang-format on")
    print("};")
+
+
def write_recomposition():
    """Print the C++ definition of RECOMPOSITION_MAP to stdout.

    For every entry of the module-level `decomposition_map` whose key is
    not inside any range of `composition_exclusion_ranges`, emits
    `{{first, second}, composed},`.  A one-codepoint decomposition is
    written with 0 as its second member.  Entries follow the iteration
    order of `decomposition_map`.
    """
    print(
        "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
    )
    print(" // clang-format off")
    for composed, parts in decomposition_map.items():
        excluded = binary_search_ranges(composition_exclusion_ranges, composed) != -1
        if excluded:
            continue
        first = parts[0]
        # Pad single-codepoint decompositions with 0.
        second = 0 if len(parts) == 1 else parts[1]
        print(f" {{{{{first:#06x}, {second:#06x}}}, {composed:#06x}}},")
    print(" // clang-format on")
    print("}};")
+
def write_ccc():
    """Print the C++ definition of CCC_TABLE to stdout.

    Emits one `{codepoint, class},` entry per item of the module-level
    `ccc_table`, in the table's iteration order.  The codepoint is written
    in hexadecimal and the combining class in decimal.
    """
    print("const std::map<uint32_t, int32_t> CCC_TABLE = {")
    print(" // clang-format off")
    for cp, combining_class in ccc_table.items():
        print(f" {{{cp:#06x}, {combining_class}}},")
    print(" // clang-format on")
    print("};")
+
+
def write_alphabetic():
    """Print the C++ definition of ALPHABETIC_RANGES to stdout.

    Emits one `{start, end},` pair per element of the module-level
    `alphabetic_ranges`, in list order, both bounds in hexadecimal.  The
    bounds are written exactly as stored (half-open: `end` is exclusive).
    """
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
    )
    print(" // clang-format off")
    for bounds in alphabetic_ranges:
        lower = bounds[0]
        upper = bounds[1]
        print(f" {{{lower:#06x}, {upper:#06x}}},")
    print(" // clang-format on")
    print("}};")
+
+
def write_numeric():
    """Print the C++ definition of NUMERIC_CODEPOINTS to stdout.

    The module-level `numeric_codepoints` list is written in list order,
    16 hexadecimal codepoints per output row; the last row may be shorter.
    When the list is empty no rows are written (the previous
    implementation raised NameError in that case because it inspected the
    loop index after a loop that never ran).
    """
    per_row = 16
    print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
    print(" // clang-format off")
    for offset in range(0, len(numeric_codepoints), per_row):
        row = numeric_codepoints[offset : offset + per_row]
        # Each codepoint keeps its trailing ", ", including the last one.
        print(" " + "".join("{:#06x}, ".format(cp) for cp in row))
    print(" // clang-format on")
    print("}};")
+
+
def main():
    """Read the three Unicode data files and print the C++ header to stdout.

    Expects exactly three command-line arguments, in this order:
    UnicodeData.txt, DerivedNormalizationProps.txt and
    DerivedCoreProperties.txt.  With any other argument count a usage
    message is written to stderr and the process exits with status -1.
    """
    if len(sys.argv) != 4:
        # The count can be wrong in either direction, so report the
        # expected usage rather than claiming "too few".
        print(
            "usage: make-rust-unicode.py UnicodeData.txt "
            "DerivedNormalizationProps.txt DerivedCoreProperties.txt",
            file=sys.stderr,
        )
        # sys.exit is always available; the bare `exit` name is only
        # injected by the `site` module.
        sys.exit(-1)
    unicode_txt_path, norm_props_txt_path, core_props_txt_path = sys.argv[1:4]

    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)

    print(COPYRIGHT)
    print()

    print('#include "rust-system.h"')
    print()
    print("namespace Rust {")
    print()
    # Array sizes must be declared before the std::array definitions below.
    print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
    print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
    print()

    # Each table is followed by one blank line.
    for write_table in (
        write_decomposition,
        write_recomposition,
        write_ccc,
        write_alphabetic,
        write_numeric,
    ):
        write_table()
        print()

    # TODO: write NFC_QC table (nfc_qc_no_ranges and nfc_qc_maybe_ranges are
    # parsed but not emitted yet).  The composition exclusion ranges are not
    # emitted either; they are only used to filter RECOMPOSITION_MAP.

    print("} // namespace Rust")
+
+if __name__ == "__main__":
+ main()