From 1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4 Mon Sep 17 00:00:00 2001
From: Steve Bennett <steveb@workware.net.au>
Date: Wed, 27 Jul 2011 09:31:03 +1000
Subject: Extend UTF-8 support past the BMP

Codepoints up to U+1FFFFF are now supported, and can be written
as literals using the new \u{NNNNNN} syntax (up to six hex digits).

Signed-off-by: Steve Bennett <steveb@workware.net.au>
---
 utf8.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'utf8.c')

diff --git a/utf8.c b/utf8.c
index 1368f00..562dca9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -14,7 +14,7 @@
 #include "utf8.h"
 
 /* This one is always implemented */
-int utf8_fromunicode(char *p, unsigned short uc)
+int utf8_fromunicode(char *p, unsigned uc)
 {
     if (uc <= 0x7f) {
         *p = uc;
@@ -25,12 +25,20 @@ int utf8_fromunicode(char *p, unsigned short uc)
         *p = 0x80 | (uc & 0x3f);
         return 2;
     }
-    else {
+    else if (uc <= 0xffff) {
         *p++ = 0xe0 | ((uc & 0xf000) >> 12);
         *p++ = 0x80 | ((uc & 0xfc0) >> 6);
         *p = 0x80 | (uc & 0x3f);
         return 3;
     }
+    /* Note: We silently truncate to 21 bits here: 0x1fffff */
+    else {
+        *p++ = 0xf0 | ((uc & 0x1c0000) >> 18);
+        *p++ = 0x80 | ((uc & 0x3f000) >> 12);
+        *p++ = 0x80 | ((uc & 0xfc0) >> 6);
+        *p = 0x80 | (uc & 0x3f);
+        return 4;
+    }
 }
 
 #if defined(JIM_UTF8) && !defined(JIM_BOOTSTRAP)
@@ -129,6 +137,12 @@ int utf8_tounicode(const char *str, int *uc)
             return 3;
         }
     }
+    else if (s[0] < 0xf8) {
+        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
+            *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
+            return 4;
+        }
+    }
 
     /* Invalid sequence, so just return the byte */
     *uc = *s;
-- 
cgit v1.1