#!/usr/bin/env python3
#
# Script to generate tables for libstdc++ std::format width estimation.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.

# To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
# ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
# ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
# ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
# ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
# Then run this script and save the output to
# ../../libstdc++-v3/include/bits/unicode-data.h

import sys
import re
import math
import os

# The generated header records which script produced it.
self = os.path.basename(__file__)
print("// Generated by contrib/unicode/{}, do not edit.".format(self))
print("""
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/** @file bits/unicode-data.h
 * This is an internal header file, included by other library headers.
 * Do not attempt to use it directly. @headername{format}
 */
""")

# Guard emitted into the header: it may only be included by a library header
# that defines _GLIBCXX_GET_UNICODE_DATA to the matching version number.
print("#ifndef _GLIBCXX_GET_UNICODE_DATA")
print('# error "This is not a public header, do not include it directly"')
print("#elif _GLIBCXX_GET_UNICODE_DATA != 160000")
print('# error "Version mismatch for Unicode static data"')
print("#endif\n")


# Process a list and return a list of tuples (index, val) which are the elements
# in the list that have a different val from the previous element.
# e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
# and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
def find_edges(vals, init = None):
    """Return the (index, value) pairs at which `vals` changes value.

    `init` is the value assumed to precede the first element, so a leading
    run equal to `init` produces no edge.
    """
    edges = []
    prev_val = init
    for i, v in enumerate(vals):
        if v != prev_val:
            edges.append((i, v))
            prev_val = v
    return edges


# Table indexed by code point, holding the property currently being processed.
# It is re-created with the right size and default value before each data file
# is read.
all_code_points = []


# Process a code point value or range of code point values with given property.
def process_code_points(code_points, val):
    """Record `val` in all_code_points for one code point or an inclusive range.

    `code_points` is hexadecimal text as found in the Unicode data files,
    either a single value or "first..last". Example arguments:
        1100..115F, x
        232A, y

    Raises ValueError if the text is malformed, or if the range is reversed
    or does not lie entirely inside all_code_points.
    """
    bounds = code_points.split("..")
    if len(bounds) == 1:
        begin = int(bounds[0], base=16)
        end = begin + 1
    elif len(bounds) == 2:
        begin = int(bounds[0], base=16)
        end = int(bounds[1], base=16) + 1
    else:
        raise ValueError(
            "malformed code point range: {!r}".format(code_points))
    # Without this check a range running past the end of the table would make
    # the slice assignment below silently grow the table, a reversed range
    # would be silently ignored, and a negative value would index from the end.
    if not 0 <= begin < end <= len(all_code_points):
        raise ValueError(
            "code point range out of bounds: {!r}".format(code_points))
    all_code_points[begin:end] = [val] * (end - begin)


# By default every code point has width 1. This is what the C++ standard says,
# even though the Unicode standard says some code points have width 0.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F
# The Unicode data files are UTF-8, so read them as such instead of relying
# on the locale's default encoding, and close the file when done.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 3000 ; F
        # 3001..3003 ; W
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)

# Create a list that only contains the code points that have a different width
# to the previous code point.
edges = find_edges(all_code_points, 1)

# Table for std::__unicode::__format_width(char32_t)
# NOTE(review): the script name in the emitted comment is hard-coded, whereas
# the "Generated by" banner uses the actual file name; confirm they agree.
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from EastAsianWidth.txt from the Unicode standard.")
print(" inline constexpr char32_t __width_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Start a new output row every 8 entries.
    if i % 8:
        print(" ", end="")
    else:
        print("\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")

# By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# The Unicode data files are UTF-8, so read them as such instead of relying
# on the locale's default encoding, and close the file when done.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # "0600..0605", "Prepend"
        # "00AD", "Control"
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

edges = find_edges(all_code_points)

# Number the properties in order of first appearance, with Other fixed at 0.
gcb_props = {"Other":0}
for c, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)
# Number of low bits needed to store any property value next to a code point.
shift_bits = int(math.ceil(math.log2(len(gcb_props))))

# Enum definition for std::__unicode::_Gcb_property
print(" enum class _Gcb_property {")
for name, value in gcb_props.items():
    print(" _Gcb_{} = {},".format(name, value))
print(" };\n")

# Tables for std::__unicode::_Grapheme_cluster_state
print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from GraphemeBreakProperty.txt from the Unicode standard.")
print(" // Entries are (code_point << shift_bits) + property.")
print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print(" inline constexpr uint32_t __gcb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Start a new output row every 6 entries.
    if i % 6:
        print(" ", end="")
    else:
        print("\n ", end="")
    x = (c << shift_bits) + gcb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")

# By default every code point has Indic_Conjunct_Break=None.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 094D ; InCB; Linker
        # 0B71 ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())

# Table for std::__unicode::__is_incb_linker
# This table is tiny, so just contains the list of code points.
print(" inline constexpr char32_t __incb_linkers[] = {\n ", end="")
# The list of linkers is built in full before the loop runs, so clearing
# entries inside the loop does not disturb the iteration.
for i in [i for i, p in enumerate(all_code_points) if p == "Linker"]:
    print(" 0x{:04x},".format(i), end="")
    # Linkers are listed in their own table, so drop them from the edge table.
    all_code_points[i] = None
print("\n };\n")

edges = find_edges(all_code_points)

incb_props = {None:0, "Consonant":1, "Extend":2}
print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
# Table for std::__unicode::__incb_property
print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from DerivedCoreProperties.txt from the Unicode standard.")
print(" // Entries are (code_point << 2) + property.")
print(" inline constexpr uint32_t __incb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Start a new output row every 6 entries.
    if i % 6:
        print(" ", end="")
    else:
        print("\n ", end="")
    x = (c << 2) + incb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")

# By default every code point has Emoji=No.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# The Unicode data files are UTF-8, so read them as such instead of relying
# on the locale's default encoding, and close the file when done.
with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A ; Extended_Pictographic
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)

edges = find_edges(all_code_points, False)

# Table for std::__unicode::__is_extended_pictographic
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from emoji-data.txt from the Unicode standard.")
print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Start a new output row every 8 entries.
    if i % 8:
        print(" ", end="")
    else:
        print("\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")

# <bits/unicode.h> gives an error if this macro is left defined.
# Do this last, so that the generated output is not usable unless we reach here.
print("#undef _GLIBCXX_GET_UNICODE_DATA")