aboutsummaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorSteve Bennett <steveb@workware.net.au>2011-07-27 09:31:03 +1000
committerSteve Bennett <steveb@workware.net.au>2011-12-02 20:56:50 +1000
commit1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4 (patch)
tree5764121cb8c33a22892da4e0eb89205a600750d3 /utf8.c
parentc2e5f7502026349106314843cad7f24020aad7fb (diff)
downloadjimtcl-1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4.zip
jimtcl-1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4.tar.gz
jimtcl-1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4.tar.bz2
Extend UTF-8 support past the BMP
Now codepoints up to U+1FFFFF are supported, including as literals with the new \u{NNNNNN} syntax (up to six hex digits). Signed-off-by: Steve Bennett <steveb@workware.net.au>
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c18
1 files changed, 16 insertions, 2 deletions
diff --git a/utf8.c b/utf8.c
index 1368f00..562dca9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -14,7 +14,7 @@
#include "utf8.h"
/* This one is always implemented */
-int utf8_fromunicode(char *p, unsigned short uc)
+int utf8_fromunicode(char *p, unsigned uc)
{
if (uc <= 0x7f) {
*p = uc;
@@ -25,12 +25,20 @@ int utf8_fromunicode(char *p, unsigned short uc)
*p = 0x80 | (uc & 0x3f);
return 2;
}
- else {
+ else if (uc <= 0xffff) {
*p++ = 0xe0 | ((uc & 0xf000) >> 12);
*p++ = 0x80 | ((uc & 0xfc0) >> 6);
*p = 0x80 | (uc & 0x3f);
return 3;
}
+ /* Note: We silently truncate to 21 bits here: 0x1fffff */
+ else {
+ *p++ = 0xf0 | ((uc & 0x1c0000) >> 18);
+ *p++ = 0x80 | ((uc & 0x3f000) >> 12);
+ *p++ = 0x80 | ((uc & 0xfc0) >> 6);
+ *p = 0x80 | (uc & 0x3f);
+ return 4;
+ }
}
#if defined(JIM_UTF8) && !defined(JIM_BOOTSTRAP)
@@ -129,6 +137,12 @@ int utf8_tounicode(const char *str, int *uc)
return 3;
}
}
+ else if (s[0] < 0xf8) {
+ if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
+ *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
+ return 4;
+ }
+ }
/* Invalid sequence, so just return the byte */
*uc = *s;