diff options
Diffstat (limited to 'contrib/unicode/gen_libstdcxx_unicode_data.py')
-rwxr-xr-x | contrib/unicode/gen_libstdcxx_unicode_data.py | 47 |
1 files changed, 43 insertions, 4 deletions
diff --git a/contrib/unicode/gen_libstdcxx_unicode_data.py b/contrib/unicode/gen_libstdcxx_unicode_data.py index ff4bee4..c50884d 100755 --- a/contrib/unicode/gen_libstdcxx_unicode_data.py +++ b/contrib/unicode/gen_libstdcxx_unicode_data.py @@ -126,7 +126,7 @@ edges = find_edges(all_code_points, 1) # Table for std::__unicode::__format_width(char32_t) -print(" // Table generated by contrib/unicode/gen_std_format_width.py,") +print(" // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,") print(" // from EastAsianWidth.txt from the Unicode standard."); print(" inline constexpr char32_t __width_edges[] = {", end="") for i, e in enumerate(edges): @@ -138,6 +138,45 @@ for i, e in enumerate(edges): print("{:#x},".format(c), end="") print("\n };\n") +# By default escape each code point +all_code_points = [True] * (1 + 0x10FFFF) + +escaped_general_categories = { + # Separator (Z) + "Zs", "Zl", "Zp", + # Other (C) + "Cc", "Cf", "Cs", "Co", "Cn", +} + +# Extract General_Category and determine if it should be escaped +# for all code points. 
+for line in open("DerivedGeneralCategory.txt", "r"): + # Example lines: + # 0530 ; Cn # <reserved-0530> + # 0557..0558 ; Cn # [2] <reserved-0557>..<reserved-0558> + line = line.split("#")[0] + if re.match(r'^[\dA-Fa-f][^;]+;', line): + code_points, general_category = line.split(";") + gc_escaped = general_category.strip() in escaped_general_categories + process_code_points(code_points, gc_escaped) + +edges = find_edges(all_code_points) + +shift_bits = 1 +print(" // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,") +print(" // from DerivedGeneralCategory.txt from the Unicode standard."); +print(" // Entries are (code_point << 1) + escape.") +print(" inline constexpr uint32_t __escape_edges[] = {", end="") +for i, e in enumerate(edges): + if i % 6: + print(" ", end="") + else: + print("\n ", end="") + c, p = e + x = (c << shift_bits) + (1 if p else 0) + print("{0:#x},".format(x), end="") +print("\n };\n") + # By default every code point has Grapheme_Cluster_Break=Other. 
all_code_points = ["Other"] * (1 + 0x10FFFF) @@ -167,7 +206,7 @@ print(" };\n") # Tables for std::__unicode::_Grapheme_cluster_state -print(" // Values generated by contrib/unicode/gen_std_format_width.py,") +print(" // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,") print(" // from GraphemeBreakProperty.txt from the Unicode standard."); print(" // Entries are (code_point << shift_bits) + property.") print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits)) @@ -209,7 +248,7 @@ edges = find_edges(all_code_points) incb_props = {None:0, "Consonant":1, "Extend":2} print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n") # Table for std::__unicode::__incb_property -print(" // Values generated by contrib/unicode/gen_std_format_width.py,") +print(" // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,") print(" // from DerivedCoreProperties.txt from the Unicode standard."); print(" // Entries are (code_point << 2) + property.") print(" inline constexpr uint32_t __incb_edges[] = {", end="") @@ -238,7 +277,7 @@ for line in open("emoji-data.txt", "r"): edges = find_edges(all_code_points, False) # Table for std::__unicode::__is_extended_pictographic -print(" // Table generated by contrib/unicode/gen_std_format_width.py,") +print(" // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,") print(" // from emoji-data.txt from the Unicode standard."); print(" inline constexpr char32_t __xpicto_edges[] = {", end="") for i, e in enumerate(edges): |