diff options
author | Raiki Tamura <tamaron1203@gmail.com> | 2023-07-14 14:45:34 +0900 |
---|---|---|
committer | Philip Herron <philip.herron@embecosm.com> | 2023-07-29 16:05:39 +0000 |
commit | 7ce263e17a59c44d057bfb1ed6a8ab1c4d837f28 (patch) | |
tree | 6d43a655d88f1698b64b19317e223a3c35001391 /gcc/rust/util/make-rust-unicode.py | |
parent | 42bd81f8df1a8bf55e6c718f24994566eeaf1b5f (diff) | |
download | gcc-7ce263e17a59c44d057bfb1ed6a8ab1c4d837f28.zip gcc-7ce263e17a59c44d057bfb1ed6a8ab1c4d837f28.tar.gz gcc-7ce263e17a59c44d057bfb1ed6a8ab1c4d837f28.tar.bz2 |
Add function for Unicode NFC normalization
gcc/rust/ChangeLog:
* Make-lang.in: Add rust-unicode.o
* rust-lang.cc (run_rust_tests): Add test.
* rust-system.h: Include <array>
* util/make-rust-unicode.py: Generator of rust-unicode-data.h.
* util/rust-unicode-data.h: Auto-generated file.
* util/rust-unicode.cc: New file.
* util/rust-unicode.h: New file.
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diffstat (limited to 'gcc/rust/util/make-rust-unicode.py')
-rw-r--r-- | gcc/rust/util/make-rust-unicode.py | 289 |
1 files changed, 289 insertions, 0 deletions
# Copyright (C) 2020-2023 Free Software Foundation, Inc.

# This file is part of GCC.

# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.

# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.

# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.

# Run this program as
# python ./make-rust-unicode.py UnicodeData.txt \
# DerivedNormalizationProps.txt DerivedCoreProperties.txt \
# > rust-unicode-data.h

import sys

# NOTE(review): whitespace inside the string literals of this file (the
# COPYRIGHT text and the indentation emitted by the write_* functions) is
# reproduced exactly as it appeared in the reviewed text, which may have had
# runs of spaces collapsed. Confirm against the checked-in
# rust-unicode-data.h before regenerating it.
COPYRIGHT = (
    "// Copyright (C) 2020-2023 Free Software Foundation, Inc.\n"
    "\n"
    "// This file is part of GCC.\n"
    "\n"
    "// GCC is free software; you can redistribute it and/or modify it under\n"
    "// the terms of the GNU General Public License as published by the Free\n"
    "// Software Foundation; either version 3, or (at your option) any later\n"
    "// version.\n"
    "\n"
    "// GCC is distributed in the hope that it will be useful, but WITHOUT ANY\n"
    "// WARRANTY; without even the implied warranty of MERCHANTABILITY or\n"
    "// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
    "// for more details.\n"
    "\n"
    "// You should have received a copy of the GNU General Public License\n"
    "// along with GCC; see the file COPYING3. If not see\n"
    "// <http://www.gnu.org/licenses/>."
)

# Decomposition_Mapping table: codepoint -> list of 1 or 2 codepoints
decomposition_map = {}
# Canonical_Combining_Class table: codepoint -> non-zero CCC value
ccc_table = {}
# Ranges of codepoints with the Full_Composition_Exclusion property
composition_exclusion_ranges = []
# Ranges of codepoints with the Alphabetic property
alphabetic_ranges = []
# Ranges of codepoints with NFC_QC=No
nfc_qc_no_ranges = []
# Ranges of codepoints with NFC_QC=Maybe
nfc_qc_maybe_ranges = []
# Codepoints whose General_Category is Nd, Nl or No
numeric_codepoints = []

# Note that an element of range `[m, n]` (a list in python) represents [m, n)


def binary_search_ranges(ranges, target: int) -> int:
    """Return the index of the range in `ranges` that contains `target`.

    `ranges` is a list of half-open `[start, end)` pairs. The search is a
    plain bisection, so the pairs must be sorted in ascending order and must
    not overlap. Returns -1 when no range contains `target` (including when
    `ranges` is empty).
    """
    low = 0
    high = len(ranges) - 1
    while low <= high:
        mid = (low + high) // 2
        start, end = ranges[mid]
        if start <= target < end:
            return mid  # target found. returns index.
        if target < start:
            high = mid - 1
        else:
            low = mid + 1
    # target not found.
    return -1


def parse_codepoint_range(range_str: str):
    """Parse '<codepoint>..<codepoint>' or '<codepoint>' (hexadecimal).

    Returns a two-element list `[start, end]` describing the half-open
    interval [start, end). The inclusive upper bound used by the Unicode
    data files is converted to an exclusive one here.
    """
    codepoint_range = range_str.split("..")
    assert len(codepoint_range) == 1 or len(codepoint_range) == 2, "Invalid format"
    start_cp = int(codepoint_range[0], 16)
    if len(codepoint_range) == 1:
        # m => [m, m+1)
        end_cp = start_cp + 1
    else:
        # m..n => [m, n+1)
        end_cp = int(codepoint_range[1], 16) + 1
    return [start_cp, end_cp]


def read_unicode_data_txt(filepath: str) -> None:
    """Read UnicodeData.txt at `filepath` and fill the module-level tables.

    Populates `numeric_codepoints` (general categories Nd, Nl, No),
    `ccc_table` (non-zero Canonical_Combining_Class values) and
    `decomposition_map` (canonical decomposition mappings only).
    Lines that do not consist of exactly 15 `;`-separated fields are skipped.
    """

    def process_line(line):
        rows = line.split(";")
        if len(rows) != 15:
            return
        # Parse codepoint
        cp = int(rows[0], 16)
        # Parse general category
        category = rows[2]
        if category in ("Nd", "Nl", "No"):
            numeric_codepoints.append(cp)

        # Parse CCC
        ccc = int(rows[3], 10)
        if ccc != 0:
            ccc_table[cp] = ccc
        # Parse decomposition mapping
        # Ignore compatibility decomposition mapping (tagged with "<...>")
        # because it is not required for **NFC** normalization.
        if not rows[5].startswith("<"):
            decomp_cps = [int(s, 16) for s in rows[5].split(" ") if s != ""]
            assert (
                len(decomp_cps) <= 2
            ), "Decomposition_Mapping must not contain more than 2 characters."
            if len(decomp_cps) > 0:
                decomposition_map[cp] = decomp_cps

    # Open the path that was passed in rather than re-reading sys.argv, so
    # that the function honours its parameter.
    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())


def read_derived_norm_props_txt(filepath: str) -> None:
    """Read DerivedNormalizationProps.txt at `filepath`.

    Appends to `composition_exclusion_ranges` (Full_Composition_Exclusion),
    `nfc_qc_no_ranges` (NFC_QC; N) and `nfc_qc_maybe_ranges` (NFC_QC; M).
    Other properties are ignored.

    Raises:
        RuntimeError: if an NFC_QC entry has a value other than N or M.
    """

    def process_line(line):
        # Ignore comments
        line = line.split("#")[0]
        rows = line.split(";")
        # Too few rows. Skipped.
        if len(rows) < 2:
            return
        rows[0] = rows[0].strip()
        rows[1] = rows[1].strip()
        cp_range = parse_codepoint_range(rows[0])
        if rows[1] == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
        elif rows[1] == "NFC_QC":
            assert len(rows) >= 3, "Too few rows for NFC_QC"
            rows[2] = rows[2].strip()
            if rows[2] == "N":
                nfc_qc_no_ranges.append(cp_range)
            elif rows[2] == "M":
                nfc_qc_maybe_ranges.append(cp_range)
            else:
                raise RuntimeError("Value of NFC_QC must be N or M")

    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())


def read_derived_core_props_txt(filepath: str) -> None:
    """Read DerivedCoreProperties.txt at `filepath`.

    Appends the range of every `Alphabetic` entry to `alphabetic_ranges`;
    all other properties are ignored.
    """

    def process_line(line):
        # Ignore comments
        line = line.split("#")[0]
        rows = line.split(";")
        # Too few rows. Skipped.
        if len(rows) < 2:
            return
        rows[0] = rows[0].strip()
        rows[1] = rows[1].strip()
        if rows[1] != "Alphabetic":
            return
        cp_range = parse_codepoint_range(rows[0])
        alphabetic_ranges.append(cp_range)

    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())


def write_decomposition() -> None:
    """Print DECOMPOSITION_MAP (codepoint -> decomposition) to stdout."""
    print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
    print(" // clang-format off")
    for cp in sorted(decomposition_map):
        print(" {{{:#06x}, ".format(cp), end="")
        print("{", end="")
        for decomp_cp in decomposition_map[cp]:
            print("{:#06x}, ".format(decomp_cp), end="")
        print("}},")
    print(" // clang-format on")
    print("};")


def write_recomposition() -> None:
    """Print RECOMPOSITION_MAP ((first, second) -> composed) to stdout.

    Codepoints covered by `composition_exclusion_ranges` are left out.
    A one-codepoint decomposition is emitted with 0 as its second element.
    """
    print(
        "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
    )
    print(" // clang-format off")
    for cp in decomposition_map:
        if binary_search_ranges(composition_exclusion_ranges, cp) != -1:
            continue
        decomp = decomposition_map[cp]
        d1 = decomp[0]
        d2 = decomp[1] if len(decomp) > 1 else 0
        print(" {{{{{:#06x}, {:#06x}}}, {:#06x}}},".format(d1, d2, cp))
    print(" // clang-format on")
    print("}};")


def write_ccc() -> None:
    """Print CCC_TABLE (codepoint -> Canonical_Combining_Class) to stdout."""
    print("const std::map<uint32_t, int32_t> CCC_TABLE = {")
    print(" // clang-format off")
    for cp in ccc_table:
        print(" {{{:#06x}, {}}},".format(cp, ccc_table[cp]))
    print(" // clang-format on")
    print("};")


def write_alphabetic() -> None:
    """Print ALPHABETIC_RANGES (half-open [start, end) pairs) to stdout."""
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
    )
    print(" // clang-format off")
    for r in alphabetic_ranges:
        print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
    print(" // clang-format on")
    print("}};")


def write_numeric() -> None:
    """Print NUMERIC_CODEPOINTS to stdout, 16 codepoints per line."""
    print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
    print(" // clang-format off")
    for i, cp in enumerate(numeric_codepoints):
        if i % 16 == 0:
            print(" ", end="")
        print("{:#06x}, ".format(cp), end="")
        if i % 16 == 15:
            print()
    # Terminate a partially filled last row. Deriving this from the length
    # (instead of the loop variable) also works when the list is empty.
    if len(numeric_codepoints) % 16 != 0:
        print()
    print(" // clang-format on")
    print("}};")


def main() -> None:
    """Read the three Unicode data files named on the command line and
    print the generated C++ header to stdout.

    Exits with a non-zero status when the number of arguments is not 3.
    """
    if len(sys.argv) != 4:
        problem = "too few arguments" if len(sys.argv) < 4 else "too many arguments"
        print(problem, file=sys.stderr)
        print(
            "usage: make-rust-unicode.py UnicodeData.txt "
            "DerivedNormalizationProps.txt DerivedCoreProperties.txt",
            file=sys.stderr,
        )
        # sys.exit is always available; the bare `exit` helper is installed
        # by the `site` module and is meant for interactive use.
        sys.exit(-1)
    unicode_txt_path = sys.argv[1]
    norm_props_txt_path = sys.argv[2]
    core_props_txt_path = sys.argv[3]

    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)

    print(COPYRIGHT)
    print()

    print('#include "rust-system.h"')
    print()
    print("namespace Rust {")
    print()
    print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
    print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
    print()

    write_decomposition()
    print()
    write_recomposition()
    print()
    # TODO: write_composition_exclusion() is not implemented; the exclusion
    # ranges are currently only used to filter RECOMPOSITION_MAP.
    write_ccc()
    print()
    write_alphabetic()
    print()
    write_numeric()
    print()

    # TODO: write NFC_QC table

    print("} // namespace Rust")


if __name__ == "__main__":
    main()