From 1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4 Mon Sep 17 00:00:00 2001
From: Steve Bennett
Date: Wed, 27 Jul 2011 09:31:03 +1000
Subject: Extend UTF-8 support past the BMP

Now codepoints up to U+1FFFFF are supported, including as literals
with the new \u{NNNNNN} syntax (up to six hex digits)

Signed-off-by: Steve Bennett
---
 utf8.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'utf8.c')

diff --git a/utf8.c b/utf8.c
index 1368f00..562dca9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -14,7 +14,7 @@
 #include "utf8.h"
 
 /* This one is always implemented */
-int utf8_fromunicode(char *p, unsigned short uc)
+int utf8_fromunicode(char *p, unsigned uc)
 {
     if (uc <= 0x7f) {
         *p = uc;
@@ -25,12 +25,20 @@ int utf8_fromunicode(char *p, unsigned short uc)
         *p = 0x80 | (uc & 0x3f);
         return 2;
     }
-    else {
+    else if (uc <= 0xffff) {
         *p++ = 0xe0 | ((uc & 0xf000) >> 12);
         *p++ = 0x80 | ((uc & 0xfc0) >> 6);
         *p = 0x80 | (uc & 0x3f);
         return 3;
     }
+    /* Note: We silently truncate to 21 bits here: 0x1fffff */
+    else {
+        *p++ = 0xf0 | ((uc & 0x1c0000) >> 18);
+        *p++ = 0x80 | ((uc & 0x3f000) >> 12);
+        *p++ = 0x80 | ((uc & 0xfc0) >> 6);
+        *p = 0x80 | (uc & 0x3f);
+        return 4;
+    }
 }
 
 #if defined(JIM_UTF8) && !defined(JIM_BOOTSTRAP)
@@ -129,6 +137,12 @@ int utf8_tounicode(const char *str, int *uc)
             return 3;
         }
     }
+    else if (s[0] < 0xf8) {
+        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
+            *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
+            return 4;
+        }
+    }
 
     /* Invalid sequence, so just return the byte */
     *uc = *s;
-- 
cgit v1.1