diff options
Diffstat (limited to 'contrib/unicode/gen_wcwidth.py')
-rwxr-xr-x | contrib/unicode/gen_wcwidth.py | 106 |
1 files changed, 106 insertions, 0 deletions
#!/usr/bin/env python3
#
# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.

import os
import subprocess
import sys

# Highest valid Unicode code point.
MAX_CODEPOINT = 0x10FFFF

# Display width of every code point, indexed by code point.  Everything
# starts at width 1; process_width() overrides individual entries.
widths = [1] * (1 + MAX_CODEPOINT)


def parse_ucn(s):
    """Parse a codepoint in the format output by glibc tools.

    `s` must look like "<UXXXX>" with hexadecimal digits between "<U" and
    ">".  Returns the code point as an int; raises ValueError otherwise.
    """
    if not (s.startswith("<U") and s.endswith(">")):
        raise ValueError("not a <U...> code point: %r" % s)
    return int(s[2:-1], base=16)


def process_width(line):
    """Process a line of width output from utf8_gen.py and update `widths`.

    Example lines:
        <UA8FF> 0
        <UA926>...<UA92D> 0

    The first field is a single code point or an inclusive "first...last"
    range; the second field is the width to record for it.

    Raises IndexError if the line has fewer than two fields, and ValueError
    if a field is malformed or the range is reversed or lies outside
    0..MAX_CODEPOINT.
    """
    fields = line.split()
    width = int(fields[1])
    bounds = fields[0].split("...")
    if len(bounds) == 1:
        first = last = parse_ucn(bounds[0])
    elif len(bounds) == 2:
        first = parse_ucn(bounds[0])
        last = parse_ucn(bounds[1])
    else:
        raise ValueError("malformed code point range: %r" % fields[0])
    # Slice assignment past the end of a list silently extends it, so an
    # unchecked range would corrupt the table instead of failing.
    if not 0 <= first <= last <= MAX_CODEPOINT:
        raise ValueError("code point range out of bounds: %r" % fields[0])
    widths[first:last + 1] = [width] * (last - first + 1)


def load_widths(path="UTF-8"):
    """Feed every line between "WIDTH" and "END WIDTH" in `path` to
    process_width().

    Lines that process_width() rejects are reported on stderr and skipped.
    """
    in_width_section = False
    with open(path, "r") as charmap:
        for line in charmap:
            if not in_width_section:
                in_width_section = line == "WIDTH\n"
                continue
            if line == "END WIDTH\n":
                in_width_section = False
                continue
            try:
                process_width(line)
            except (ValueError, IndexError) as e:
                print(e, "warning: ignored unexpected line: %s" % line,
                      file=sys.stderr, end="")


def condense_ranges(table):
    """Condense a per-code-point width list to contiguous ranges.

    Returns a list of [last_index, width] pairs, one per run of equal
    widths, in increasing index order.

    NOTE(review): a run is only emitted when the *next* run starts, so the
    final run (the one reaching the end of `table`) is not in the result.
    Presumably the consumer of the generated table falls back to a default
    width past the last range end -- confirm before changing this, since
    emitting it would alter the generated header.
    """
    cur_range = [-1, 1]
    all_ranges = []
    for i, width in enumerate(table):
        if width == cur_range[1]:
            cur_range[0] = i
        else:
            all_ranges.append(cur_range)
            cur_range = [i, width]
    return all_ranges


def format_table(declaration, cells, per_line):
    """Return a C array initializer as a string.

    `declaration` is the opening text up to and including "{"; `cells` are
    the already formatted elements (each with its trailing comma);
    `per_line` is how many elements go on each output line.
    """
    parts = [declaration]
    for i, cell in enumerate(cells):
        parts.append(" " if i % per_line else "\n ")
        parts.append(cell)
    parts.append("\n};")
    return "".join(parts)


def print_tables(all_ranges, unicode_version):
    """Output the arrays for generated_cpp_wcwidth.h on stdout."""
    # NOTE(review): whitespace inside these literals is reproduced exactly
    # as it appeared in the reviewed text; confirm it matches the layout
    # wanted in the generated header.
    print("/* Generated by contrib/unicode/gen_wcwidth.py,",
          "with the help of glibc's")
    print(" utf8_gen.py, using version %s" % unicode_version,
          "of the Unicode standard. */")
    print("\n" + format_table(
        "static const cppchar_t wcwidth_range_ends[] = {",
        ["0x%x," % r[0] for r in all_ranges], 8) + "\n")
    print(format_table(
        "static const unsigned char wcwidth_widths[] = {",
        ["%d," % r[1] for r in all_ranges], 24))


def main(argv):
    """Generate the tables for the Unicode version named in argv[1].

    Returns the process exit status: 1 on a usage error, 0 on success.
    """
    if len(argv) != 2:
        prog = argv[0] if argv else "gen_wcwidth.py"
        print("usage: %s <unicode version>" % prog, file=sys.stderr)
        return 1
    unicode_version = argv[1]

    # To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs
    # to a file named UTF-8, which is not configurable.  Then we parse this
    # into the form we want it.  Passing an argument list (no shell) keeps the
    # version string from being interpreted by a shell, and check=True stops
    # us from parsing a stale UTF-8 file when the generator fails.
    subprocess.run(
        ["from_glibc/utf8_gen.py", "--unicode_version", unicode_version],
        check=True)
    load_widths("UTF-8")

    # All bytes < 256 we treat as width 1.
    widths[0:256] = [1] * 256

    print_tables(condense_ranges(widths), unicode_version)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))