# Copyright (C) 2020-2025 Free Software Foundation, Inc.

# This file is part of GCC.

# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.

# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.

# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.

# Run this program as
#     python ./make-rust-unicode.py UnicodeData.txt \
#         DerivedNormalizationProps.txt DerivedCoreProperties.txt \
#         > rust-unicode-data.h

"""Generate the C++ Unicode tables header from Unicode data files.

Reads UnicodeData.txt, DerivedNormalizationProps.txt and
DerivedCoreProperties.txt (paths given on the command line) and prints a
C++ header with the derived lookup tables to standard output.
"""

import sys
from typing import Tuple

Codepoint = int
Range = Tuple[Codepoint, Codepoint]

COPYRIGHT = (
    "// Copyright (C) 2020-2025 Free Software Foundation, Inc.\n"
    "\n"
    "// This file is part of GCC.\n"
    "\n"
    "// GCC is free software; you can redistribute it and/or modify it under\n"
    "// the terms of the GNU General Public License as published by the Free\n"
    "// Software Foundation; either version 3, or (at your option) any later\n"
    "// version.\n"
    "\n"
    "// GCC is distributed in the hope that it will be useful, but WITHOUT ANY\n"
    "// WARRANTY; without even the implied warranty of MERCHANTABILITY or\n"
    "// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
    "// for more details.\n"
    "\n"
    "// You should have received a copy of the GNU General Public License\n"
    "// along with GCC; see the file COPYING3. If not see\n"
    "// <http://www.gnu.org/licenses/>."
)

# Decomposition_Mapping table
decomposition_map: dict[Codepoint, list[Codepoint]] = {}
# Canonical_Combining_Class table
ccc_table: dict[Codepoint, int] = {}
# Ranges of codepoints with the Full_Composition_Exclusion property
composition_exclusion_ranges: list[Range] = []
# Ranges of codepoints with the Alphabetic property
alphabetic_ranges: list[Range] = []
# Ranges of codepoints with NFC_QC=No
nfc_qc_no_ranges: list[Range] = []
# Ranges of codepoints with NFC_QC=Maybe
nfc_qc_maybe_ranges: list[Range] = []
# Codepoints whose general category is Nd, Nl or No
numeric_codepoints: list[Codepoint] = []


# Note that an element of range `(m, n)` represents the half-open
# interval [m, n)
def binary_search_ranges(ranges: list[Range], target: Codepoint) -> int:
    """Return the index of the range containing `target`, or -1.

    `ranges` holds half-open `(start, end)` pairs.  As with any binary
    search, the result is only meaningful when `ranges` is sorted by
    start and the ranges do not overlap.
    """
    low: int = 0
    high: int = len(ranges) - 1
    while low <= high:
        mid = (low + high) // 2
        start, end = ranges[mid]
        if target < start:
            high = mid - 1
        elif target >= end:
            low = mid + 1
        else:
            # start <= target < end: target found.
            return mid
    # target not found.
    return -1


# Utility function to parse 'XXXX..YYYY' or 'XXXX' (hexadecimal codepoints)
def parse_codepoint_range(range_str: str) -> Range:
    """Parse a codepoint or an inclusive codepoint range into `[m, n)`.

    Raises ValueError when `range_str` has more than one `..` separator
    or when a bound is not a hexadecimal number.
    """
    bounds: list[str] = range_str.split("..")
    if len(bounds) not in (1, 2):
        raise ValueError("Invalid format")
    # m    => [m, m+1)
    # m..n => [m, n+1)
    start_cp = int(bounds[0], 16)
    last_cp = int(bounds[-1], 16)
    return start_cp, last_cp + 1


def read_unicode_data_txt(filepath: str) -> None:
    """Fill the numeric, CCC and decomposition tables from `filepath`.

    Lines that do not have exactly 15 semicolon-separated fields are
    skipped.  Raises ValueError when a canonical decomposition has more
    than two codepoints.
    """

    def process_line(line: str) -> None:
        rows = line.split(";")
        if len(rows) != 15:
            return
        # Parse codepoint
        cp = int(rows[0], 16)
        # Parse general category
        if rows[2] in ("Nd", "Nl", "No"):
            numeric_codepoints.append(cp)

        # Parse CCC; zero is the default and is not stored.
        ccc = int(rows[3], 10)
        if ccc != 0:
            ccc_table[cp] = ccc

        # Parse decomposition mapping
        # Ignore compatibility decomposition mapping (it starts with a
        # "<tag>") because it is not required for **NFC** normalization.
        if rows[5].startswith("<"):
            return
        decomp_cps = [int(s, 16) for s in rows[5].split(" ") if s != ""]
        if len(decomp_cps) > 2:
            raise ValueError(
                "Decomposition_Mapping must not contain more than 2 characters."
            )
        if decomp_cps:
            decomposition_map[cp] = decomp_cps

    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())


def read_derived_norm_props_txt(filepath: str) -> None:
    """Collect Full_Composition_Exclusion and NFC_QC ranges from `filepath`.

    Raises ValueError when an NFC_QC line has no value field and
    RuntimeError when that value is neither `N` nor `M`.
    """

    def process_line(line: str) -> None:
        # Ignore comments
        line = line.split("#")[0]
        rows = line.split(";")
        # Too few rows. Skipped.
        if len(rows) < 2:
            return
        rows[0] = rows[0].strip()
        rows[1] = rows[1].strip()
        cp_range = parse_codepoint_range(rows[0])
        if rows[1] == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
        elif rows[1] == "NFC_QC":
            if len(rows) < 3:
                raise ValueError("Too few rows for NFC_QC")
            rows[2] = rows[2].strip()
            if rows[2] == "N":
                nfc_qc_no_ranges.append(cp_range)
            elif rows[2] == "M":
                nfc_qc_maybe_ranges.append(cp_range)
            else:
                raise RuntimeError("Value of NFC_QC must be N or M")

    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())


def read_derived_core_props_txt(filepath: str) -> None:
    """Collect the ranges with the Alphabetic property from `filepath`."""

    def process_line(line: str) -> None:
        # Ignore comments
        line = line.split("#")[0]
        rows = line.split(";")
        # Too few rows. Skipped.
        if len(rows) < 2:
            return
        rows[0] = rows[0].strip()
        rows[1] = rows[1].strip()
        if rows[1] != "Alphabetic":
            return
        cp_range: Range = parse_codepoint_range(rows[0])
        alphabetic_ranges.append(cp_range)

    with open(filepath, "r", encoding="UTF-8") as file:
        for line in file:
            process_line(line.rstrip())


def write_decomposition() -> None:
    """Print DECOMPOSITION_MAP, one entry per codepoint in ascending order."""
    print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
    print(" // clang-format off")
    for cp in sorted(decomposition_map):
        members = "".join("{:#06x}, ".format(d) for d in decomposition_map[cp])
        # Emits e.g. ` {0x00c0, {0x0041, 0x0300, }},`
        print(" {" + "{:#06x}".format(cp) + ", {" + members + "}},")
    print(" // clang-format on")
    print("};")


def write_recomposition() -> None:
    """Print RECOMPOSITION_MAP, the inverse of the decomposition table.

    Codepoints found in `composition_exclusion_ranges` are left out.  A
    one-codepoint decomposition is keyed with 0 as its second element.
    """
    print(
        "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
    )
    print(" // clang-format off")
    for cp in sorted(decomposition_map):
        if binary_search_ranges(composition_exclusion_ranges, cp) != -1:
            continue
        decomp = decomposition_map[cp]
        d1: Codepoint = decomp[0]
        d2: Codepoint = decomp[1] if len(decomp) > 1 else 0
        print(" {{{{{:#06x}, {:#06x}}}, {:#06x}}},".format(d1, d2, cp))
    print(" // clang-format on")
    print("}};")


def write_ccc() -> None:
    """Print CCC_TABLE, one entry per codepoint in ascending order."""
    print("const std::map<uint32_t, int32_t> CCC_TABLE = {")
    print(" // clang-format off")
    for cp in sorted(ccc_table):
        print(" {{{:#06x}, {}}},".format(cp, ccc_table[cp]))
    print(" // clang-format on")
    print("};")


def write_alphabetic() -> None:
    """Print ALPHABETIC_RANGES in the order the ranges were read."""
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
    )
    print(" // clang-format off")
    for r in alphabetic_ranges:
        print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
    print(" // clang-format on")
    print("}};")


def write_numeric() -> None:
    """Print NUMERIC_CODEPOINTS, sixteen codepoints per output line."""
    print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
    print(" // clang-format off")
    # Slicing in rows of 16 emits nothing for an empty table; reading the
    # loop index after the loop would fail in that case.
    for row_start in range(0, len(numeric_codepoints), 16):
        row = numeric_codepoints[row_start : row_start + 16]
        print(" " + "".join("{:#06x}, ".format(cp) for cp in row))
    print(" // clang-format on")
    print("}};")


def write_nfc_qc() -> None:
    """Print NFC_QC_NO_RANGES followed by NFC_QC_MAYBE_RANGES."""
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_NO_RANGES = {{{{".format(
            len(nfc_qc_no_ranges)
        )
    )
    print(" // clang-format off")
    for r in nfc_qc_no_ranges:
        print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
    print(" // clang-format on")
    print("}};")

    print(
        "const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_MAYBE_RANGES = {{{{".format(
            len(nfc_qc_maybe_ranges)
        )
    )
    print(" // clang-format off")
    for r in nfc_qc_maybe_ranges:
        print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
    print(" // clang-format on")
    print("}};")


def main() -> None:
    """Read the three input files named in `sys.argv` and print the header.

    Exits with status -1 after printing a usage line to standard error
    when the argument count is not exactly three.
    """
    if len(sys.argv) != 4:
        print(
            "usage: {} UnicodeData.txt DerivedNormalizationProps.txt "
            "DerivedCoreProperties.txt".format(sys.argv[0]),
            file=sys.stderr,
        )
        sys.exit(-1)
    unicode_txt_path: str = sys.argv[1]
    norm_props_txt_path: str = sys.argv[2]
    core_props_txt_path: str = sys.argv[3]

    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)

    print(COPYRIGHT)
    print()

    print('#include "rust-system.h"\n')
    print("namespace Rust {\n")
    print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
    print(
        "const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints))
    )

    write_decomposition()
    print()
    write_recomposition()
    print()
    write_ccc()
    print()
    write_alphabetic()
    print()
    write_numeric()
    print()
    write_nfc_qc()
    print()

    print("} // namespace Rust")


if __name__ == "__main__":
    main()