From 1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4 Mon Sep 17 00:00:00 2001 From: Steve Bennett <steveb@workware.net.au> Date: Wed, 27 Jul 2011 09:31:03 +1000 Subject: Extend UTF-8 support past the BMP Now codepoints up to U+1FFFFF are supported, including as literals with the new \u{NNNNNN} syntax (up to six hex digits) Signed-off-by: Steve Bennett <steveb@workware.net.au> --- jimregexp.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'jimregexp.c') diff --git a/jimregexp.c b/jimregexp.c index 0c5a4dd..e899924 100644 --- a/jimregexp.c +++ b/jimregexp.c @@ -632,7 +632,18 @@ static int reg_decode_escape(const char *s, int *ch) case 't': *ch = '\t'; break; case 'v': *ch = '\v'; break; case 'u': - if ((n = parse_hex(s, 4, ch)) > 0) { + if (*s == '{') { + /* Expect \u{NNNN} */ + n = parse_hex(s + 1, 6, ch); + if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) { + s += n + 2; + } + else { + /* Invalid, so just treat as an escaped 'u' */ + *ch = 'u'; + } + } + else if ((n = parse_hex(s, 4, ch)) > 0) { s += n; } break; @@ -1609,7 +1620,7 @@ static void regdump(regex_t *preg) int s; int op = EXACTLY; /* Arbitrary non-END op. */ int next; - char buf[4]; + char buf[MAX_UTF8_LEN + 1]; int i; for (i = 1; i < preg->p; i++) { -- cgit v1.1