aboutsummaryrefslogtreecommitdiff
path: root/contrib/unicode/gen_libstdcxx_unicode_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/unicode/gen_libstdcxx_unicode_data.py')
-rwxr-xr-xcontrib/unicode/gen_libstdcxx_unicode_data.py47
1 files changed, 43 insertions, 4 deletions
diff --git a/contrib/unicode/gen_libstdcxx_unicode_data.py b/contrib/unicode/gen_libstdcxx_unicode_data.py
index ff4bee4..c50884d 100755
--- a/contrib/unicode/gen_libstdcxx_unicode_data.py
+++ b/contrib/unicode/gen_libstdcxx_unicode_data.py
@@ -126,7 +126,7 @@ edges = find_edges(all_code_points, 1)
# Table for std::__unicode::__format_width(char32_t)
-print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
+print(" // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print(" // from EastAsianWidth.txt from the Unicode standard.");
print(" inline constexpr char32_t __width_edges[] = {", end="")
for i, e in enumerate(edges):
@@ -138,6 +138,45 @@ for i, e in enumerate(edges):
print("{:#x},".format(c), end="")
print("\n };\n")
+# By default escape each code point
+all_code_points = [True] * (1 + 0x10FFFF)
+
+escaped_general_categories = {
+ # Separator (Z)
+ "Zs", "Zl", "Zp",
+ # Other (C)
+ "Cc", "Cf", "Cs", "Co", "Cn",
+}
+
+# Extract General_Category and determine if it should be escaped
+# for all code points.
+for line in open("DerivedGeneralCategory.txt", "r"):
+ # Example lines:
+ # 0530 ; Cn # <reserved-0530>
+ # 0557..0558 ; Cn # [2] <reserved-0557>..<reserved-0558>
+ line = line.split("#")[0]
+ if re.match(r'^[\dA-Fa-f][^;]+;', line):
+ code_points, general_category = line.split(";")
+ gc_escaped = general_category.strip() in escaped_general_categories
+ process_code_points(code_points, gc_escaped)
+
+edges = find_edges(all_code_points)
+
+shift_bits = 1
+print(" // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
+print(" // from DerivedGeneralCategory.txt from the Unicode standard.");
+print(" // Entries are (code_point << 1) + escape.")
+print(" inline constexpr uint32_t __escape_edges[] = {", end="")
+for i, e in enumerate(edges):
+ if i % 6:
+ print(" ", end="")
+ else:
+ print("\n ", end="")
+ c, p = e
+ x = (c << shift_bits) + (1 if p else 0)
+ print("{0:#x},".format(x), end="")
+print("\n };\n")
+
# By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)
@@ -167,7 +206,7 @@ print(" };\n")
# Tables for std::__unicode::_Grapheme_cluster_state
-print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
+print(" // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print(" // from GraphemeBreakProperty.txt from the Unicode standard.");
print(" // Entries are (code_point << shift_bits) + property.")
print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
@@ -209,7 +248,7 @@ edges = find_edges(all_code_points)
incb_props = {None:0, "Consonant":1, "Extend":2}
print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
# Table for std::__unicode::__incb_property
-print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
+print(" // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print(" // from DerivedCoreProperties.txt from the Unicode standard.");
print(" // Entries are (code_point << 2) + property.")
print(" inline constexpr uint32_t __incb_edges[] = {", end="")
@@ -238,7 +277,7 @@ for line in open("emoji-data.txt", "r"):
edges = find_edges(all_code_points, False)
# Table for std::__unicode::__is_extended_pictographic
-print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
+print(" // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print(" // from emoji-data.txt from the Unicode standard.");
print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, e in enumerate(edges):