aboutsummaryrefslogtreecommitdiff
path: root/contrib/unicode/gen_wcwidth.py
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/unicode/gen_wcwidth.py')
-rwxr-xr-xcontrib/unicode/gen_wcwidth.py106
1 files changed, 106 insertions, 0 deletions
diff --git a/contrib/unicode/gen_wcwidth.py b/contrib/unicode/gen_wcwidth.py
new file mode 100755
index 0000000..02b28bc
--- /dev/null
+++ b/contrib/unicode/gen_wcwidth.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+#
+# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+#
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>.
+
+import sys
+import os
+
# Require exactly one command-line argument: the Unicode version string.
if len(sys.argv) != 2:
    # Interpolate the program name into the usage text; without the
    # "% sys.argv[0]" the placeholder is printed literally as "%s".
    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
unicode_version = sys.argv[1]
+
# Parse a codepoint in the format output by glibc tools.
def parse_ucn(s):
    """Return the integer codepoint encoded by a "<Uxxxx>" token.

    s must start with "<U" and end with ">"; the text in between is read
    as a hexadecimal number.

    Raises ValueError if the delimiters are missing or the enclosed text
    is not a valid hexadecimal number.
    """
    if not (s.startswith("<U") and s.endswith(">")):
        # Include the offending token so that anything printing the
        # exception produces a useful diagnostic instead of an empty string.
        raise ValueError("malformed codepoint %r" % s)
    return int(s[2:-1], base=16)
+
# Process a line of width output from utf8_gen.py and update global array.
# widths is indexed by codepoint (0 through 0x10FFFF inclusive); every entry
# starts out as 1 and is overwritten by process_width.
widths = [1] * (1 + 0x10FFFF)
def process_width(line):
    """Apply one width entry to the global widths table.

    line holds a codepoint or an inclusive codepoint range, followed by
    whitespace and an integer width.

    Raises IndexError if the line has fewer than two fields, and ValueError
    if the width or a codepoint cannot be parsed, the range has more than
    two endpoints, or the range is reversed or outside the table.
    """
    # Example lines:
    # <UA8FF> 0
    # <UA926>...<UA92D> 0

    fields = line.split()
    width = int(fields[1])
    bounds = fields[0].split("...")
    if len(bounds) == 1:
        first = last = parse_ucn(bounds[0])
    elif len(bounds) == 2:
        first = parse_ucn(bounds[0])
        last = parse_ucn(bounds[1])
    else:
        raise ValueError("malformed codepoint range %r" % fields[0])
    # Slice assignment beyond the end of a list silently grows it, and a
    # reversed range would silently do nothing, so reject both explicitly
    # rather than corrupting or skipping part of the table.
    if not 0 <= first <= last < len(widths):
        raise ValueError("codepoint range out of bounds: %r" % fields[0])
    # The range in the input is inclusive, hence the + 1.
    widths[first:last + 1] = [width] * (last - first + 1)
+
# To keep things simple, we use glibc utf8_gen.py as-is. It only outputs to a
# file named UTF-8, which is not configurable. Then we parse this into the form
# we want it.
status = os.system(
    "from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
if status != 0:
    # Stop here rather than parsing a missing or stale UTF-8 file left over
    # from an earlier run.
    print("error: utf8_gen.py failed with status %d" % status,
          file=sys.stderr)
    sys.exit(1)

# Only the lines between the "WIDTH" and "END WIDTH" marker lines are width
# entries; everything else in the file is skipped.
processing = False
with open("UTF-8", "r") as utf8_file:
    for line in utf8_file:
        if processing:
            if line == "END WIDTH\n":
                processing = False
            else:
                try:
                    process_width(line)
                # Bind the exception: the warning below prints it, and
                # without "as e" the handler itself raised NameError.
                except (ValueError, IndexError) as e:
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")
        elif line == "WIDTH\n":
            processing = True
+
# All bytes < 256 we treat as width 1.
# The slice end is exclusive, so it must be 256 for 0xFF to be included.
widths[0:256] = [1] * 256
+
# Collapse the per-codepoint table into runs of equal width. Each entry of
# all_ranges is a two-element list: [last codepoint of the run, run width].
# The initial [-1, 1] is an empty width-1 run that the first codepoints
# extend if they also have width 1.
# NOTE(review): the run still held in cur_range when the loop ends is never
# appended, so the final run is absent from all_ranges -- presumably the
# consumer applies a default beyond the last entry; confirm.
all_ranges = []
cur_range = [-1, 1]
for codepoint, cp_width in enumerate(widths):
    if cp_width != cur_range[1]:
        # Width changed: close the current run and open a new one here.
        all_ranges.append(cur_range)
        cur_range = [codepoint, cp_width]
        continue
    # Same width as the current run: extend it to this codepoint.
    cur_range[0] = codepoint
+
# Output the arrays for generated_cpp_wcwidth.h
def _emit_array(declaration, cells, per_row):
    """Print a C array: declaration, then cells wrapped per_row to a line."""
    print(declaration, end="")
    for position, cell in enumerate(cells):
        # The first cell of each row starts a new line; the others are
        # separated from their predecessor by a single space.
        separator = " " if position % per_row else "\n "
        print(separator, end="")
        print(cell, end="")
    print("\n};")


print("/* Generated by contrib/unicode/gen_wcwidth.py,",
      "with the help of glibc's")
print(" utf8_gen.py, using version %s" % unicode_version,
      "of the Unicode standard. */")

# Last codepoint of each run, eight entries per line.
_emit_array("\nstatic const cppchar_t wcwidth_range_ends[] = {",
            ("0x%x," % (r[0]) for r in all_ranges), 8)
# Blank line separating the two arrays.
print()
# Width of each run, in the same order, twenty-four entries per line.
_emit_array("static const unsigned char wcwidth_widths[] = {",
            ("%d," % r[1] for r in all_ranges), 24)