aboutsummaryrefslogtreecommitdiff
path: root/localedata
diff options
context:
space:
mode:
authorMike FABIAN <mfabian@redhat.com>2017-08-18 10:12:29 +0200
committerMike FABIAN <mfabian@redhat.com>2017-09-06 12:37:49 +0200
commitaf83ed5c4647bda196fc1a7efebbe8019aa83f4a (patch)
tree222b4c599fc16758fc87b85220acd38e2ba5d56c /localedata
parent4f3647e46e3f645c6516faa299efc6e89d520d7b (diff)
downloadglibc-af83ed5c4647bda196fc1a7efebbe8019aa83f4a.zip
glibc-af83ed5c4647bda196fc1a7efebbe8019aa83f4a.tar.gz
glibc-af83ed5c4647bda196fc1a7efebbe8019aa83f4a.tar.bz2
Write all ranges of neighbouring characters with the same width using the range notation in charmaps/UTF-8
Writing ranges of neighbouring characters with the same width like this <U000E0100>...<U000E01EF> 0 in charmaps/UTF-8 is more efficient than writing many single character lines like: <U000E0100> 0 <U000E0101> 0 ... [BZ #21750] * unicode-gen/utf8_gen.py: Write all ranges of neighbouring characters with the same width using the range notation in charmaps/UTF-8.
Diffstat (limited to 'localedata')
-rwxr-xr-xlocaledata/unicode-gen/utf8_gen.py51
1 files changed, 38 insertions, 13 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index 1563aa1..52c79e8 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -199,7 +199,7 @@ def write_header_charmap(outfile):
def write_header_width(outfile):
'''Writes the header on top of the WIDTH section to the output file'''
- outfile.write('% Character width according to Unicode 7.0.0.\n')
+ outfile.write('% Character width according to Unicode 10.0.0.\n')
outfile.write('% - Default width is 1.\n')
outfile.write('% - Double-width characters have width 2; generated from\n')
outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
@@ -229,27 +229,52 @@ def process_width(outfile, ulines, elines):
code_points = fields[0].split("..")
for key in range(int(code_points[0], 16),
int(code_points[1], 16)+1):
- width_dict[key] = unicode_utils.ucs_symbol(key) + '\t2'
+ width_dict[key] = 2
for line in ulines:
fields = line.split(";")
if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
- width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
- int(fields[0], 16)) + '\t0'
+ width_dict[int(fields[0], 16)] = 0
# handle special cases for compatibility
- for key in list(range(0x1160, 0x1200)) + list(range(0x3248, 0x3250)) + \
- list(range(0x4DC0, 0x4E00)) + list((0x00AD,)):
+ for key in list((0x00AD,)):
+ # https://www.cs.tut.fi/~jkorpela/shy.html
if key in width_dict:
del width_dict[key]
- width_dict[0x1160] = '{:s}...{:s}\t0'.format(
- unicode_utils.ucs_symbol(0x1160), unicode_utils.ucs_symbol(0x11FF))
- width_dict[0x3248] = '{:s}...{:s}\t2'.format(
- unicode_utils.ucs_symbol(0x3248), unicode_utils.ucs_symbol(0x324F))
- width_dict[0x4DC0] = '{:s}...{:s}\t2'.format(
- unicode_utils.ucs_symbol(0x4DC0), unicode_utils.ucs_symbol(0x4DFF))
+ for key in list(range(0x1160, 0x1200)):
+ width_dict[key] = 0
+ for key in list(range(0x3248, 0x3250)):
+ # These are “A” which means we can decide whether to treat them
+ # as “W” or “N” based on context:
+ # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
+ # For us, “W” seems better.
+ width_dict[key] = 2
+ for key in list(range(0x4DC0, 0x4E00)):
+ width_dict[key] = 2
+ same_width_lists = []
+ current_width_list = []
for key in sorted(width_dict):
- outfile.write(width_dict[key]+'\n')
+ if not current_width_list:
+ current_width_list = [key]
+ elif (key == current_width_list[-1] + 1
+ and width_dict[key] == width_dict[current_width_list[0]]):
+ current_width_list.append(key)
+ else:
+ same_width_lists.append(current_width_list)
+ current_width_list = [key]
+ if current_width_list:
+ same_width_lists.append(current_width_list)
+
+ for same_width_list in same_width_lists:
+ if len(same_width_list) == 1:
+ outfile.write('{:s}\t{:d}\n'.format(
+ unicode_utils.ucs_symbol(same_width_list[0]),
+ width_dict[same_width_list[0]]))
+ else:
+ outfile.write('{:s}...{:s}\t{:d}\n'.format(
+ unicode_utils.ucs_symbol(same_width_list[0]),
+ unicode_utils.ucs_symbol(same_width_list[-1]),
+ width_dict[same_width_list[0]]))
if __name__ == "__main__":
if len(sys.argv) < 3: