From 1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4 Mon Sep 17 00:00:00 2001 From: Steve Bennett Date: Wed, 27 Jul 2011 09:31:03 +1000 Subject: Extend UTF-8 support past the BMP Now codepoints up to U+1FFFFF are supported, including as literals with the new \u{NNNNNN} syntax (up to six hex digits) Signed-off-by: Steve Bennett --- jim.c | 36 ++++++++++++++++++++++++++++++------ jimregexp.c | 15 +++++++++++++-- tests/utf8.test | 16 ++++++++++++++++ tests/utftcl.test | 5 ++++- utf8.c | 18 ++++++++++++++++-- utf8.h | 19 +++++++++++-------- 6 files changed, 90 insertions(+), 19 deletions(-) diff --git a/jim.c b/jim.c index 652c690..e064671 100644 --- a/jim.c +++ b/jim.c @@ -1724,30 +1724,54 @@ static int JimEscape(char *dest, const char *s, int slen) case 'u': case 'x': /* A unicode or hex sequence. - * \u Expect 1-4 hex chars and convert to utf-8. * \x Expect 1-2 hex chars and convert to hex. + * \u Expect 1-4 hex chars and convert to utf-8. + * \u{NNN} supports 1-6 hex chars and convert to utf-8. * An invalid sequence means simply the escaped char. */ { - int val = 0; + unsigned val = 0; int k; + int maxchars = 2; i++; - for (k = 0; k < (s[i] == 'u' ? 4 : 2); k++) { + if (s[i] == 'u') { + if (s[i + 1] == '{') { + maxchars = 6; + i++; + } + else { + maxchars = 4; + } + } + + for (k = 0; k < maxchars; k++) { int c = xdigitval(s[i + k + 1]); if (c == -1) { break; } val = (val << 4) | c; } + /* The \u{nnn} syntax supports up to 21 bit codepoints. 
*/ + if (s[i] == '{') { + if (k == 0 || val > 0x1fffff || s[i + k + 1] != '}') { + /* Back up */ + i--; + k = 0; + } + else { + /* Skip the closing brace */ + k++; + } + } if (k) { /* Got a valid sequence, so convert */ - if (s[i] == 'u') { - p += utf8_fromunicode(p, val); + if (s[i] == 'x') { + *p++ = val; } else { - *p++ = val; + p += utf8_fromunicode(p, val); } i += k; break; diff --git a/jimregexp.c b/jimregexp.c index 0c5a4dd..e899924 100644 --- a/jimregexp.c +++ b/jimregexp.c @@ -632,7 +632,18 @@ static int reg_decode_escape(const char *s, int *ch) case 't': *ch = '\t'; break; case 'v': *ch = '\v'; break; case 'u': - if ((n = parse_hex(s, 4, ch)) > 0) { + if (*s == '{') { + /* Expect \u{NNNN} */ + n = parse_hex(s + 1, 6, ch); + if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) { + s += n + 2; + } + else { + /* Invalid, so just treat as an escaped 'u' */ + *ch = 'u'; + } + } + else if ((n = parse_hex(s, 4, ch)) > 0) { s += n; } break; @@ -1609,7 +1620,7 @@ static void regdump(regex_t *preg) int s; int op = EXACTLY; /* Arbitrary non-END op. 
*/ int next; - char buf[4]; + char buf[MAX_UTF8_LEN + 1]; int i; for (i = 1; i < preg->p; i++) { diff --git a/tests/utf8.test b/tests/utf8.test index 715df13..287f6b1 100644 --- a/tests/utf8.test +++ b/tests/utf8.test @@ -129,4 +129,20 @@ test utf8-7.3 {Upper, lower for titlecase utf-8} { list [string toupper \u01c5] [string tolower \u01c5] } "\u01c4 \u01c6" +test utf8-8.1 {Chars outside the BMP} jim { + string length \u{12000}\u{13000} +} 2 + +test utf8-8.2 {Chars outside the BMP} jim { + string match "ab\[\u{12000}c\]d" ab\u{12000}d +} 1 + +test utf8-8.3 {Chars outside the BMP} jim { + string last d "ab\u{101fff}cd" +} 4 + +test utf8-8.4 {Longer sequences} { + string length \u12000 +} 2 + testreport diff --git a/tests/utftcl.test b/tests/utftcl.test index db058c2..a58fdda 100644 --- a/tests/utftcl.test +++ b/tests/utftcl.test @@ -53,9 +53,12 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} { test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { string length [bytestring "\xE4\xb9\x8e"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} { +test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} tcl { string length [bytestring "\xF4\xA2\xA2\xA2"] } {4} +test utf-2.9 {Tcl_UtfToUniChar: 4-byte UTF sequence} jim { + string length [bytestring "\xF4\xA2\xA2\xA2"] +} {1} test utf-3.1 {Tcl_UtfCharComplete} { } {} diff --git a/utf8.c b/utf8.c index 1368f00..562dca9 100644 --- a/utf8.c +++ b/utf8.c @@ -14,7 +14,7 @@ #include "utf8.h" /* This one is always implemented */ -int utf8_fromunicode(char *p, unsigned short uc) +int utf8_fromunicode(char *p, unsigned uc) { if (uc <= 0x7f) { *p = uc; @@ -25,12 +25,20 @@ int utf8_fromunicode(char *p, unsigned short uc) *p = 0x80 | (uc & 0x3f); return 2; } - else { + else if (uc <= 0xffff) { *p++ = 0xe0 | ((uc & 0xf000) >> 12); *p++ = 0x80 | ((uc & 0xfc0) >> 6); *p = 0x80 | (uc & 0x3f); return 3; } + /* Note: We silently truncate to 21 bits here: 0x1fffff */ + else 
{ + *p++ = 0xf0 | ((uc & 0x1c0000) >> 18); + *p++ = 0x80 | ((uc & 0x3f000) >> 12); + *p++ = 0x80 | ((uc & 0xfc0) >> 6); + *p = 0x80 | (uc & 0x3f); + return 4; + } } #if defined(JIM_UTF8) && !defined(JIM_BOOTSTRAP) @@ -129,6 +137,12 @@ int utf8_tounicode(const char *str, int *uc) return 3; } } + else if (s[0] < 0xf8) { + if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) { + *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80); + return 4; + } + } /* Invalid sequence, so just return the byte */ *uc = *s; diff --git a/utf8.h b/utf8.h index 39da384..2a3ce01 100644 --- a/utf8.h +++ b/utf8.h @@ -9,13 +9,16 @@ */ #include +/* Currently we support Unicode code points up to 2^21-1 (0x1fffff) */ +#define MAX_UTF8_LEN 4 + /** - * Converts the given unicode codepoint (0 - 0xffff) to utf-8 + * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8 * and stores the result at 'p'. - * - * Returns the number of utf-8 characters (1-3). + * + * Returns the number of utf-8 characters (up to MAX_UTF8_LEN). */ -int utf8_fromunicode(char *p, unsigned short uc); +int utf8_fromunicode(char *p, unsigned uc); #ifndef JIM_UTF8 #include @@ -50,7 +53,7 @@ int utf8_charlen(int c); * * The string *must* be null terminated. * - * Does not support unicode code points > \uffff + * Does not support unicode code points > \u1fffff */ int utf8_strlen(const char *str, int bytelen); @@ -76,7 +79,7 @@ int utf8_index(const char *str, int charindex); * * If it is not null terminated, the length *must* be checked first. * - * Does not support unicode code points > \uffff + * Does not support unicode code points > \u1fffff */ int utf8_tounicode(const char *str, int *uc); @@ -92,7 +95,7 @@ int utf8_prev_len(const char *str, int len); /** * Returns the upper-case variant of the given unicode codepoint. * - * Does not support unicode code points > \uffff + * Unicode code points > \uffff are returned unchanged. 
*/ int utf8_upper(int uc); @@ -110,7 +113,7 @@ int utf8_title(int uc); * * NOTE: Use utf8_upper() in preference for case-insensitive matching. * - * Does not support unicode code points > \uffff + * Unicode code points > \uffff are returned unchanged. */ int utf8_lower(int uc); #endif /* JIM_BOOTSTRAP */ -- cgit v1.1