From 1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4 Mon Sep 17 00:00:00 2001
From: Steve Bennett <steveb@workware.net.au>
Date: Wed, 27 Jul 2011 09:31:03 +1000
Subject: Extend UTF-8 support past the BMP

Codepoints up to U+1FFFFF are now supported, including as
literals via the new \u{NNNNNN} syntax (up to six hex digits).

Signed-off-by: Steve Bennett <steveb@workware.net.au>
---
 jimregexp.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'jimregexp.c')

diff --git a/jimregexp.c b/jimregexp.c
index 0c5a4dd..e899924 100644
--- a/jimregexp.c
+++ b/jimregexp.c
@@ -632,7 +632,18 @@ static int reg_decode_escape(const char *s, int *ch)
 		case 't': *ch = '\t'; break;
 		case 'v': *ch = '\v'; break;
 		case 'u':
-			if ((n = parse_hex(s, 4, ch)) > 0) {
+			if (*s == '{') {
+				/* Expect \u{NNNN} */
+				n = parse_hex(s + 1, 6, ch);
+				if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
+					s += n + 2;
+				}
+				else {
+					/* Invalid, so just treat as an escaped 'u' */
+					*ch = 'u';
+				}
+			}
+			else if ((n = parse_hex(s, 4, ch)) > 0) {
 				s += n;
 			}
 			break;
@@ -1609,7 +1620,7 @@ static void regdump(regex_t *preg)
 	int s;
 	int op = EXACTLY;	/* Arbitrary non-END op. */
 	int next;
-	char buf[4];
+	char buf[MAX_UTF8_LEN + 1];
 
 	int i;
 	for (i = 1; i < preg->p; i++) {
-- 
cgit v1.1