From 1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4 Mon Sep 17 00:00:00 2001
From: Steve Bennett <steveb@workware.net.au>
Date: Wed, 27 Jul 2011 09:31:03 +1000
Subject: Extend UTF-8 support past the BMP

Codepoints up to U+1FFFFF are now supported, including as
literals via the new \u{NNNNNN} syntax (one to six hex digits).

Signed-off-by: Steve Bennett <steveb@workware.net.au>
---
 jim.c             | 36 ++++++++++++++++++++++++++++++------
 jimregexp.c       | 15 +++++++++++++--
 tests/utf8.test   | 16 ++++++++++++++++
 tests/utftcl.test |  5 ++++-
 utf8.c            | 18 ++++++++++++++++--
 utf8.h            | 19 +++++++++++--------
 6 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/jim.c b/jim.c
index 652c690..e064671 100644
--- a/jim.c
+++ b/jim.c
@@ -1724,30 +1724,54 @@ static int JimEscape(char *dest, const char *s, int slen)
                     case 'u':
                     case 'x':
                         /* A unicode or hex sequence.
-                         * \u Expect 1-4 hex chars and convert to utf-8.
                          * \x Expect 1-2 hex chars and convert to hex.
+                         * \u Expect 1-4 hex chars and convert to utf-8.
+                         * \u{NNN} Expect 1-6 hex chars inside braces and convert to utf-8.
                          * An invalid sequence means simply the escaped char.
                          */
                         {
-                            int val = 0;
+                            unsigned val = 0;
                             int k;
+                            int maxchars = 2;
 
                             i++;
 
-                            for (k = 0; k < (s[i] == 'u' ? 4 : 2); k++) {
+                            if (s[i] == 'u') {
+                                if (s[i + 1] == '{') {
+                                    maxchars = 6;
+                                    i++;
+                                }
+                                else {
+                                    maxchars = 4;
+                                }
+                            }
+
+                            for (k = 0; k < maxchars; k++) {
                                 int c = xdigitval(s[i + k + 1]);
                                 if (c == -1) {
                                     break;
                                 }
                                 val = (val << 4) | c;
                             }
+                            /* The \u{nnn} syntax supports up to 21 bit codepoints. */
+                            if (s[i] == '{') {
+                                if (k == 0 || val > 0x1fffff || s[i + k + 1] != '}') {
+                                    /* Back up */
+                                    i--;
+                                    k = 0;
+                                }
+                                else {
+                                    /* Skip the closing brace */
+                                    k++;
+                                }
+                            }
                             if (k) {
                                 /* Got a valid sequence, so convert */
-                                if (s[i] == 'u') {
-                                    p += utf8_fromunicode(p, val);
+                                if (s[i] == 'x') {
+                                    *p++ = val;
                                 }
                                 else {
-                                    *p++ = val;
+                                    p += utf8_fromunicode(p, val);
                                 }
                                 i += k;
                                 break;
diff --git a/jimregexp.c b/jimregexp.c
index 0c5a4dd..e899924 100644
--- a/jimregexp.c
+++ b/jimregexp.c
@@ -632,7 +632,18 @@ static int reg_decode_escape(const char *s, int *ch)
 		case 't': *ch = '\t'; break;
 		case 'v': *ch = '\v'; break;
 		case 'u':
-			if ((n = parse_hex(s, 4, ch)) > 0) {
+			if (*s == '{') {
+				/* Expect \u{NNNN} */
+				n = parse_hex(s + 1, 6, ch);
+				if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
+					s += n + 2;
+				}
+				else {
+					/* Invalid, so just treat as an escaped 'u' */
+					*ch = 'u';
+				}
+			}
+			else if ((n = parse_hex(s, 4, ch)) > 0) {
 				s += n;
 			}
 			break;
@@ -1609,7 +1620,7 @@ static void regdump(regex_t *preg)
 	int s;
 	int op = EXACTLY;	/* Arbitrary non-END op. */
 	int next;
-	char buf[4];
+	char buf[MAX_UTF8_LEN + 1];
 
 	int i;
 	for (i = 1; i < preg->p; i++) {
diff --git a/tests/utf8.test b/tests/utf8.test
index 715df13..287f6b1 100644
--- a/tests/utf8.test
+++ b/tests/utf8.test
@@ -129,4 +129,20 @@ test utf8-7.3 {Upper, lower for titlecase utf-8} {
 	list [string toupper \u01c5] [string tolower \u01c5]
 } "\u01c4 \u01c6"
 
+test utf8-8.1 {Chars outside the BMP} jim {
+	string length \u{12000}\u{13000}
+} 2
+
+test utf8-8.2 {Chars outside the BMP} jim {
+	string match "ab\[\u{12000}c\]d" ab\u{12000}d
+} 1
+
+test utf8-8.3 {Chars outside the BMP} jim {
+	string last d "ab\u{101fff}cd"
+} 4
+
+test utf8-8.4 {Longer sequences} {
+	string length \u12000
+} 2
+
 testreport
diff --git a/tests/utftcl.test b/tests/utftcl.test
index db058c2..a58fdda 100644
--- a/tests/utftcl.test
+++ b/tests/utftcl.test
@@ -53,9 +53,12 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} {
 test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} {
     string length [bytestring "\xE4\xb9\x8e"]
 } {1}
-test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} {
+test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} tcl {
     string length [bytestring "\xF4\xA2\xA2\xA2"]
 } {4}
+test utf-2.9 {Tcl_UtfToUniChar: 4-byte UTF sequence} jim {
+    string length [bytestring "\xF4\xA2\xA2\xA2"]
+} {1}
 
 test utf-3.1 {Tcl_UtfCharComplete} {
 } {}
diff --git a/utf8.c b/utf8.c
index 1368f00..562dca9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -14,7 +14,7 @@
 #include "utf8.h"
 
 /* This one is always implemented */
-int utf8_fromunicode(char *p, unsigned short uc)
+int utf8_fromunicode(char *p, unsigned uc)
 {
     if (uc <= 0x7f) {
         *p = uc;
@@ -25,12 +25,20 @@ int utf8_fromunicode(char *p, unsigned short uc)
         *p = 0x80 | (uc & 0x3f);
         return 2;
     }
-    else {
+    else if (uc <= 0xffff) {
         *p++ = 0xe0 | ((uc & 0xf000) >> 12);
         *p++ = 0x80 | ((uc & 0xfc0) >> 6);
         *p = 0x80 | (uc & 0x3f);
         return 3;
     }
+    /* Note: We silently truncate to 21 bits here: 0x1fffff */
+    else {
+        *p++ = 0xf0 | ((uc & 0x1c0000) >> 18);
+        *p++ = 0x80 | ((uc & 0x3f000) >> 12);
+        *p++ = 0x80 | ((uc & 0xfc0) >> 6);
+        *p = 0x80 | (uc & 0x3f);
+        return 4;
+    }
 }
 
 #if defined(JIM_UTF8) && !defined(JIM_BOOTSTRAP)
@@ -129,6 +137,12 @@ int utf8_tounicode(const char *str, int *uc)
             return 3;
         }
     }
+    else if (s[0] < 0xf8) {
+        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
+            *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
+            return 4;
+        }
+    }
 
     /* Invalid sequence, so just return the byte */
     *uc = *s;
diff --git a/utf8.h b/utf8.h
index 39da384..2a3ce01 100644
--- a/utf8.h
+++ b/utf8.h
@@ -9,13 +9,16 @@
  */
 #include <jim-config.h>
 
+/* Currently we support unicode code points up to 2^21-1 (0x1fffff) */
+#define MAX_UTF8_LEN 4
+
 /**
- * Converts the given unicode codepoint (0 - 0xffff) to utf-8
+ * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8
  * and stores the result at 'p'.
- *
- * Returns the number of utf-8 characters (1-3).
+ * 
+ * Returns the number of utf-8 characters (up to MAX_UTF8_LEN).
  */
-int utf8_fromunicode(char *p, unsigned short uc);
+int utf8_fromunicode(char *p, unsigned uc);
 
 #ifndef JIM_UTF8
 #include <ctype.h>
@@ -50,7 +53,7 @@ int utf8_charlen(int c);
  *
  * The string *must* be null terminated.
  *
- * Does not support unicode code points > \uffff
+ * Does not support unicode code points > \u1fffff
  */
 int utf8_strlen(const char *str, int bytelen);
 
@@ -76,7 +79,7 @@ int utf8_index(const char *str, int charindex);
  *
  * If it is not null terminated, the length *must* be checked first.
  *
- * Does not support unicode code points > \uffff
+ * Does not support unicode code points > \u1fffff
  */
 int utf8_tounicode(const char *str, int *uc);
 
@@ -92,7 +95,7 @@ int utf8_prev_len(const char *str, int len);
 /**
  * Returns the upper-case variant of the given unicode codepoint.
  *
- * Does not support unicode code points > \uffff
+ * Unicode code points > \uffff are returned unchanged.
  */
 int utf8_upper(int uc);
 
@@ -110,7 +113,7 @@ int utf8_title(int uc);
  *
  * NOTE: Use utf8_upper() in preference for case-insensitive matching.
  *
- * Does not support unicode code points > \uffff
+ * Unicode code points > \uffff are returned unchanged.
  */
 int utf8_lower(int uc);
 #endif /* JIM_BOOTSTRAP */
-- 
cgit v1.1