#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2024 Free Software Foundation, Inc.
# Copyright The GNU Toolchain Authors.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file from
Unicode data.

Usage: python3 utf8_gen.py -u UnicodeData.txt
                           -d DerivedCoreProperties.txt
                           -e EastAsianWidth.txt
                           -k HangulSyllableType.txt
                           --unicode_version VERSION

All input file options default to the file names shown above; only
--unicode_version is required.

It will output a file named “UTF-8” in the current directory.
'''

import argparse
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

# Lines of the Unicode data files which mention reserved (i.e. not yet
# assigned) code points contain a “<reserved-XXXX>” marker.
RESERVED_REGEXP = re.compile(r'.*<reserved-.+>')


def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    start and end are the first and the last code point of the range,
    given as hexadecimal strings (first field of a UnicodeData.txt
    line).  outfile is the open output file and name is the character
    name written for the range.
    '''
    first = int(start, 16)
    last = int(end, 16)
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #  ranges, so they become printable and carry a width. Comment
        #  out surrogate ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(first, last + 1):
            # Decompose the syllable index into the indexes of its
            # initial, medial and final jamo (19 x 21 x 28 syllables).
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + JAMO_INITIAL_SHORT_NAME[index1] \
                + JAMO_MEDIAL_SHORT_NAME[index2] \
                + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
    #
    # The range end is inclusive (last + 1), so the final code point is
    # written even when the range length is 1 or 1 modulo 64; the last
    # chunk is clamped to the end of the range.
    for i in range(first, last + 1, 64):
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(min(i + 63, last)),
            convert_to_hex(i),
            name))


def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10         DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (field 10, counting from 0, in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        is_surrogate = 'Surrogate,' in fields[1]
        if fields[1].endswith(', First>') and not is_surrogate:
            # Remember the start of the range until its “Last” line.
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not is_surrogate:
            # Strip the trailing “, Last>” (7 characters) from the name
            # and close the angle bracket again.
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if is_surrogate:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        code_point = int(fields[0], 16)
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(code_point),
            convert_to_hex(code_point),
            fields[1]))


def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point is an integer.  Surrogate code points (U+D800..U+DFFF)
    are encoded as well, using the generalized three byte form.
    '''
    # In Python3, a plain .encode('UTF-8') does not work for
    # surrogates; the “surrogatepass” error handler encodes every
    # surrogate code point to its three byte sequence instead of
    # raising UnicodeEncodeError.
    return ''.join([
        '/x{:02x}'.format(c)
        for c in chr(code_point).encode('UTF-8', 'surrogatepass')
    ])


def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    # Charmap declarations: code set name, comment character, escape
    # character and the minimum/maximum number of bytes per character.
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")


def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file

    unicode_version is the Unicode version string mentioned in the
    first comment line of the header.
    '''
    outfile.write('% Character width according to Unicode {:s}.\n'.format(unicode_version))
    outfile.write('% Width is determined by the following rules, in order of decreasing precedence:\n')
    outfile.write('% - U+00AD SOFT HYPHEN has width 1, as a special case for compatibility (https://archive.is/b5Ck).\n')
    outfile.write('% - U+115F HANGUL CHOSEONG FILLER has width 2.\n')
    outfile.write('% This character stands in for an intentionally omitted leading consonant\n')
    outfile.write('% in a Hangul syllable block; as such it must be assigned width 2 despite its lack\n')
    outfile.write('% of visible display to ensure that the complete block has the correct width.\n')
    outfile.write('% (See below for more information on Hangul syllables.)\n')
    outfile.write('% - Combining jungseong and jongseong Hangul jamo have width 0; generated from\n')
    outfile.write('% "grep \'^[^;]*;[VT]\' HangulSyllableType.txt".\n')
    outfile.write('% One composed Hangul "syllable block" like 퓛 is made up of\n')
    outfile.write('% two to three individual component characters called "jamo".\n')
    outfile.write('% The complete block must have total width 2;\n')
    outfile.write('% to achieve this, we assign a width of 2 to leading "choseong" jamo,\n')
    outfile.write('% and of 0 to medial vowel "jungseong" and trailing "jongseong" jamo.\n')
    outfile.write('% - Non-spacing and enclosing marks have width 0; generated from\n')
    outfile.write('% "grep -E \'^[^;]*;[^;]*;(Mn|Me);\' UnicodeData.txt".\n')
    outfile.write('% - "Default_Ignorable_Code_Point"s have width 0; generated from\n')
    outfile.write('% "grep \'^[^;]*;\\s*Default_Ignorable_Code_Point\' DerivedCoreProperties.txt".\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt".\n')
    outfile.write('% - Default width for all other characters is 1.\n')
    outfile.write("WIDTH\n")


def _code_point_range(field):
    '''Returns the range of code points described by field

    field is the first field of a line of a Unicode data file: either
    a single hexadecimal code point like “00AD” or an inclusive range
    like “1160..11FF”.  Surrounding whitespace is accepted by int().
    '''
    first, separator, last = field.partition('..')
    if not separator:
        last = first
    return range(int(first, 16), int(last, 16) + 1)


def process_width(outfile, ulines, dlines, elines, klines):
    '''Writes the body of the WIDTH section to outfile

    ulines are lines from UnicodeData.txt.
    elines are lines from EastAsianWidth.txt containing characters with
    width “W” or “F”.
    dlines are lines from DerivedCoreProperties.txt which contain
    characters with the property “Default_Ignorable_Code_Point”.
    klines are lines from HangulSyllableType.txt which contain characters
    with syllable type “V” or “T”.

    Later assignments override earlier ones, so the order of the steps
    below implements the precedence of the rules.
    '''
    # Wide and fullwidth characters have width 2
    width_dict = {}
    for line in elines:
        for key in _code_point_range(line.split(";")[0]):
            width_dict[key] = 2

    # Nonspacing and enclosing marks have width 0
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    # Conjoining vowel and trailing jamo have width 0
    for line in klines:
        for key in _code_point_range(line.split(";")[0]):
            width_dict[key] = 0

    # “Default_Ignorable_Code_Point”s have width 0
    for line in dlines:
        for key in _code_point_range(line.split(";")[0]):
            width_dict[key] = 0

    # Special case: U+00AD SOFT HYPHEN gets the default width 1, i.e.
    # it must not be listed.  Use pop() so that input data which does
    # not list U+00AD at all does not raise KeyError.
    width_dict.pop(0x00AD, None)

    # Special case: U+115F HANGUL CHOSEONG FILLER
    width_dict[0x115F] = 2

    for key in range(0x3248, 0x3250):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in range(0x4DC0, 0x4E00):
        width_dict[key] = 2

    # Group consecutive code points which have the same width.
    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)

    # Single code points are written as “<Uxxxx>\twidth”, runs as
    # “<Uxxxx>...<Uyyyy>\twidth”.
    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))


def _read_matching_lines(filename, pattern):
    '''Returns the stripped lines of the file filename which match the
    regular expression pattern (anchored at the start of the line)

    If characters which are from reserved ranges (i.e. not yet assigned
    code points) are added to the WIDTH section of the UTF-8 file, then
    “make check” produces “Unknown Character” errors for these code
    points because such unassigned code points are not in the CHARMAP
    section of the UTF-8 file.

    Therefore, all lines mentioning reserved code points are skipped.
    '''
    wanted = re.compile(pattern)
    lines = []
    with open(filename, mode='r', encoding='utf-8') as infile:
        for line in infile:
            if RESERVED_REGEXP.match(line):
                continue
            if wanted.match(line):
                lines.append(line.strip())
    return lines


if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, DerivedCoreProperties.txt, EastAsianWidth.txt, and HangulSyllableType.txt
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    # “--east_asian_with_file” is the historical (misspelled) option
    # name and is kept for compatibility; the correctly spelled name is
    # accepted as an alias and stores into the same attribute.
    PARSER.add_argument(
        '-e', '--east_asian_with_file', '--east_asian_width_file',
        dest='east_asian_with_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-k', '--hangul_syllable_type_file',
        nargs='?',
        type=str,
        default='HangulSyllableType.txt',
        help=('The HangulSyllableType.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    with open(ARGS.unicode_data_file, mode='r',
              encoding='utf-8') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    DERIVED_CORE_PROPERTIES_LINES = _read_matching_lines(
        ARGS.derived_core_properties_file,
        r'^[^;]*;\s*Default_Ignorable_Code_Point')
    EAST_ASIAN_WIDTH_LINES = _read_matching_lines(
        ARGS.east_asian_with_file,
        r'^[^;]*;\s*[WF]')
    HANGUL_SYLLABLE_TYPE_LINES = _read_matching_lines(
        ARGS.hangul_syllable_type_file,
        r'^[^;]*;\s*[VT]')
    # The WIDTH header contains a non-ASCII character, so the output
    # encoding is set explicitly instead of depending on the locale.
    with open('UTF-8', mode='w', encoding='utf-8') as OUTFILE:
        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      DERIVED_CORE_PROPERTIES_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      HANGUL_SYLLABLE_TYPE_LINES)
        OUTFILE.write("END WIDTH\n")