1 files changed, 43 insertions, 4 deletions
diff --git a/contrib/unicode/gen_libstdcxx_unicode_data.py b/contrib/unicode/gen_libstdcxx_unicode_data.py
index ff4bee4..c50884d 100755
--- a/contrib/unicode/gen_libstdcxx_unicode_data.py
+++ b/contrib/unicode/gen_libstdcxx_unicode_data.py
@@ -126,7 +126,7 @@ edges = find_edges(all_code_points, 1)
 
 # Table for std::__unicode::__format_width(char32_t)
 
-print("  // Table generated by contrib/unicode/gen_std_format_width.py,")
+print("  // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
 print("  // from EastAsianWidth.txt from the Unicode standard.");
 print("  inline constexpr char32_t __width_edges[] = {", end="")
 for i, e in enumerate(edges):
@@ -138,6 +138,45 @@ for i, e in enumerate(edges):
     print("{:#x},".format(c), end="")
 print("\n  };\n")
 
+# By default escape each code point (one flag per U+0000..U+10FFFF).
+all_code_points = [True] * (1 + 0x10FFFF)
+
+escaped_general_categories = {  # General_Category values treated as escaped
+    # Separator (Z)
+    "Zs", "Zl", "Zp",
+    # Other (C)
+    "Cc", "Cf", "Cs", "Co", "Cn",
+}
+
+# Extract the General_Category of every code point and determine
+# whether that code point should be escaped.
+for line in open("DerivedGeneralCategory.txt", "r"):
+    # Example lines (code point or range ; General_Category # comment):
+    # 0530          ; Cn #       <reserved-0530>
+    # 0557..0558    ; Cn #   [2] <reserved-0557>..<reserved-0558>
+    line = line.split("#")[0]  # drop the trailing comment
+    if re.match(r'^[\dA-Fa-f][^;]+;', line):  # starts with a hex digit
+        code_points, general_category = line.split(";")
+        gc_escaped = general_category.strip() in escaped_general_categories
+        process_code_points(code_points, gc_escaped)  # helper defined elsewhere
+
+edges = find_edges(all_code_points)
+
+shift_bits = 1  # low bit of each entry holds the escape flag
+print("  // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
+print("  // from DerivedGeneralCategory.txt from the Unicode standard.");
+print("  // Entries are (code_point << 1) + escape.")
+print("  inline constexpr uint32_t __escape_edges[] = {", end="")
+for i, e in enumerate(edges):
+    if i % 6:  # six entries per output line
+        print(" ", end="")
+    else:
+        print("\n    ", end="")
+    c, p = e  # code point and its escape flag
+    x = (c << shift_bits) + (1 if p else 0)  # pack flag into the low bit
+    print("{0:#x},".format(x), end="")
+print("\n  };\n")
+
 # By default every code point has Grapheme_Cluster_Break=Other.
 all_code_points = ["Other"] * (1 + 0x10FFFF)
 
@@ -167,7 +206,7 @@ print("  };\n")
 
 # Tables for std::__unicode::_Grapheme_cluster_state
 
-print("  // Values generated by contrib/unicode/gen_std_format_width.py,")
+print("  // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
 print("  // from GraphemeBreakProperty.txt from the Unicode standard.");
 print("  // Entries are (code_point << shift_bits) + property.")
 print("  inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
@@ -209,7 +248,7 @@ edges = find_edges(all_code_points)
 incb_props = {None:0, "Consonant":1, "Extend":2}
 print("  enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
 # Table for std::__unicode::__incb_property
-print("  // Values generated by contrib/unicode/gen_std_format_width.py,")
+print("  // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
 print("  // from DerivedCoreProperties.txt from the Unicode standard.");
 print("  // Entries are (code_point << 2) + property.")
 print("  inline constexpr uint32_t __incb_edges[] = {", end="")
@@ -238,7 +277,7 @@ for line in open("emoji-data.txt", "r"):
 edges = find_edges(all_code_points, False)
 
 # Table for std::__unicode::__is_extended_pictographic
-print("  // Table generated by contrib/unicode/gen_std_format_width.py,")
+print("  // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
 print("  // from emoji-data.txt from the Unicode standard.");
 print("  inline constexpr char32_t __xpicto_edges[] = {", end="")
 for i, e in enumerate(edges):