aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--jim.c36
-rw-r--r--jimregexp.c15
-rw-r--r--tests/utf8.test16
-rw-r--r--tests/utftcl.test5
-rw-r--r--utf8.c18
-rw-r--r--utf8.h19
6 files changed, 90 insertions, 19 deletions
diff --git a/jim.c b/jim.c
index 652c690..e064671 100644
--- a/jim.c
+++ b/jim.c
@@ -1724,30 +1724,54 @@ static int JimEscape(char *dest, const char *s, int slen)
case 'u':
case 'x':
/* A unicode or hex sequence.
- * \u Expect 1-4 hex chars and convert to utf-8.
* \x Expect 1-2 hex chars and convert to hex.
+ * \u Expect 1-4 hex chars and convert to utf-8.
+ * \u{NNN} Expect 1-6 hex chars in braces (max 0x1fffff) and convert to utf-8.
* An invalid sequence means simply the escaped char.
*/
{
- int val = 0;
+ unsigned val = 0;
int k;
+ int maxchars = 2;
i++;
- for (k = 0; k < (s[i] == 'u' ? 4 : 2); k++) {
+ if (s[i] == 'u') {
+ if (s[i + 1] == '{') {
+ maxchars = 6;
+ i++;
+ }
+ else {
+ maxchars = 4;
+ }
+ }
+
+ for (k = 0; k < maxchars; k++) {
int c = xdigitval(s[i + k + 1]);
if (c == -1) {
break;
}
val = (val << 4) | c;
}
+ /* The \u{nnn} syntax supports up to 21 bit codepoints. */
+ if (s[i] == '{') {
+ if (k == 0 || val > 0x1fffff || s[i + k + 1] != '}') {
+ /* Back up */
+ i--;
+ k = 0;
+ }
+ else {
+ /* Skip the closing brace */
+ k++;
+ }
+ }
if (k) {
/* Got a valid sequence, so convert */
- if (s[i] == 'u') {
- p += utf8_fromunicode(p, val);
+ if (s[i] == 'x') {
+ *p++ = val;
}
else {
- *p++ = val;
+ p += utf8_fromunicode(p, val);
}
i += k;
break;
diff --git a/jimregexp.c b/jimregexp.c
index 0c5a4dd..e899924 100644
--- a/jimregexp.c
+++ b/jimregexp.c
@@ -632,7 +632,18 @@ static int reg_decode_escape(const char *s, int *ch)
case 't': *ch = '\t'; break;
case 'v': *ch = '\v'; break;
case 'u':
- if ((n = parse_hex(s, 4, ch)) > 0) {
+ if (*s == '{') {
+ /* Expect \u{NNNN} */
+ n = parse_hex(s + 1, 6, ch);
+ if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
+ s += n + 2;
+ }
+ else {
+ /* Invalid, so just treat as an escaped 'u' */
+ *ch = 'u';
+ }
+ }
+ else if ((n = parse_hex(s, 4, ch)) > 0) {
s += n;
}
break;
@@ -1609,7 +1620,7 @@ static void regdump(regex_t *preg)
int s;
int op = EXACTLY; /* Arbitrary non-END op. */
int next;
- char buf[4];
+ char buf[MAX_UTF8_LEN + 1];
int i;
for (i = 1; i < preg->p; i++) {
diff --git a/tests/utf8.test b/tests/utf8.test
index 715df13..287f6b1 100644
--- a/tests/utf8.test
+++ b/tests/utf8.test
@@ -129,4 +129,20 @@ test utf8-7.3 {Upper, lower for titlecase utf-8} {
list [string toupper \u01c5] [string tolower \u01c5]
} "\u01c4 \u01c6"
+test utf8-8.1 {Chars outside the BMP} jim {
+ string length \u{12000}\u{13000}
+} 2
+
+test utf8-8.2 {Chars outside the BMP} jim {
+ string match "ab\[\u{12000}c\]d" ab\u{12000}d
+} 1
+
+test utf8-8.3 {Chars outside the BMP} jim {
+ string last d "ab\u{101fff}cd"
+} 4
+
+test utf8-8.4 {Longer sequences} {
+ string length \u12000
+} 2
+
testreport
diff --git a/tests/utftcl.test b/tests/utftcl.test
index db058c2..a58fdda 100644
--- a/tests/utftcl.test
+++ b/tests/utftcl.test
@@ -53,9 +53,12 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} {
test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} {
string length [bytestring "\xE4\xb9\x8e"]
} {1}
-test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} {
+test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} tcl {
string length [bytestring "\xF4\xA2\xA2\xA2"]
} {4}
+test utf-2.9 {Tcl_UtfToUniChar: 4-byte UTF sequence} jim {
+ string length [bytestring "\xF4\xA2\xA2\xA2"]
+} {1}
test utf-3.1 {Tcl_UtfCharComplete} {
} {}
diff --git a/utf8.c b/utf8.c
index 1368f00..562dca9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -14,7 +14,7 @@
#include "utf8.h"
/* This one is always implemented */
-int utf8_fromunicode(char *p, unsigned short uc)
+int utf8_fromunicode(char *p, unsigned uc)
{
if (uc <= 0x7f) {
*p = uc;
@@ -25,12 +25,20 @@ int utf8_fromunicode(char *p, unsigned short uc)
*p = 0x80 | (uc & 0x3f);
return 2;
}
- else {
+ else if (uc <= 0xffff) {
*p++ = 0xe0 | ((uc & 0xf000) >> 12);
*p++ = 0x80 | ((uc & 0xfc0) >> 6);
*p = 0x80 | (uc & 0x3f);
return 3;
}
+ /* Note: We silently truncate to 21 bits here: 0x1fffff */
+ else {
+ *p++ = 0xf0 | ((uc & 0x1c0000) >> 18);
+ *p++ = 0x80 | ((uc & 0x3f000) >> 12);
+ *p++ = 0x80 | ((uc & 0xfc0) >> 6);
+ *p = 0x80 | (uc & 0x3f);
+ return 4;
+ }
}
#if defined(JIM_UTF8) && !defined(JIM_BOOTSTRAP)
@@ -129,6 +137,12 @@ int utf8_tounicode(const char *str, int *uc)
return 3;
}
}
+ else if (s[0] < 0xf8) {
+ if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
+ *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
+ return 4;
+ }
+ }
/* Invalid sequence, so just return the byte */
*uc = *s;
diff --git a/utf8.h b/utf8.h
index 39da384..2a3ce01 100644
--- a/utf8.h
+++ b/utf8.h
@@ -9,13 +9,16 @@
*/
#include <jim-config.h>
+/* Currently we support unicode code points up to 2^21-1 (0x1fffff) */
+#define MAX_UTF8_LEN 4
+
/**
- * Converts the given unicode codepoint (0 - 0xffff) to utf-8
+ * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8
* and stores the result at 'p'.
- *
- * Returns the number of utf-8 characters (1-3).
+ *
+ * Returns the number of bytes written to 'p' (1 to MAX_UTF8_LEN).
*/
-int utf8_fromunicode(char *p, unsigned short uc);
+int utf8_fromunicode(char *p, unsigned uc);
#ifndef JIM_UTF8
#include <ctype.h>
@@ -50,7 +53,7 @@ int utf8_charlen(int c);
*
* The string *must* be null terminated.
*
- * Does not support unicode code points > \uffff
+ * Does not support unicode code points > \u{1fffff}
*/
int utf8_strlen(const char *str, int bytelen);
@@ -76,7 +79,7 @@ int utf8_index(const char *str, int charindex);
*
* If it is not null terminated, the length *must* be checked first.
*
- * Does not support unicode code points > \uffff
+ * Does not support unicode code points > \u{1fffff}
*/
int utf8_tounicode(const char *str, int *uc);
@@ -92,7 +95,7 @@ int utf8_prev_len(const char *str, int len);
/**
* Returns the upper-case variant of the given unicode codepoint.
*
- * Does not support unicode code points > \uffff
+ * Unicode code points > \uffff are returned unchanged.
*/
int utf8_upper(int uc);
@@ -110,7 +113,7 @@ int utf8_title(int uc);
*
* NOTE: Use utf8_upper() in preference for case-insensitive matching.
*
- * Does not support unicode code points > \uffff
+ * Unicode code points > \uffff are returned unchanged.
*/
int utf8_lower(int uc);
#endif /* JIM_BOOTSTRAP */