diff options
author | Steve Bennett <steveb@workware.net.au> | 2011-07-27 09:31:03 +1000 |
---|---|---|
committer | Steve Bennett <steveb@workware.net.au> | 2011-12-02 20:56:50 +1000 |
commit | 1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4 (patch) | |
tree | 5764121cb8c33a22892da4e0eb89205a600750d3 /utf8.c | |
parent | c2e5f7502026349106314843cad7f24020aad7fb (diff) | |
download | jimtcl-1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4.zip jimtcl-1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4.tar.gz jimtcl-1d72edfab5cff0e7bbc1e1e22ae7b1b6bdc756b4.tar.bz2 |
Extend UTF-8 support past the BMP
Now codepoints up to U+1FFFFF are supported, including
as literals with the new \u{NNNNNN} syntax (up to six hex digits).
Signed-off-by: Steve Bennett <steveb@workware.net.au>
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 18 |
1 file changed, 16 insertions, 2 deletions
@@ -14,7 +14,7 @@
 #include "utf8.h"
 
 /* This one is always implemented */
-int utf8_fromunicode(char *p, unsigned short uc)
+int utf8_fromunicode(char *p, unsigned uc)
 {
     if (uc <= 0x7f) {
         *p = uc;
@@ -25,12 +25,20 @@ int utf8_fromunicode(char *p, unsigned short uc)
         *p = 0x80 | (uc & 0x3f);
         return 2;
     }
-    else {
+    else if (uc <= 0xffff) {
         *p++ = 0xe0 | ((uc & 0xf000) >> 12);
         *p++ = 0x80 | ((uc & 0xfc0) >> 6);
         *p = 0x80 | (uc & 0x3f);
         return 3;
     }
+    /* Note: We silently truncate to 21 bits here: 0x1fffff */
+    else {
+        *p++ = 0xf0 | ((uc & 0x1c0000) >> 18);
+        *p++ = 0x80 | ((uc & 0x3f000) >> 12);
+        *p++ = 0x80 | ((uc & 0xfc0) >> 6);
+        *p = 0x80 | (uc & 0x3f);
+        return 4;
+    }
 }
 
 #if defined(JIM_UTF8) && !defined(JIM_BOOTSTRAP)
@@ -129,6 +137,12 @@ int utf8_tounicode(const char *str, int *uc)
             return 3;
         }
     }
+    else if (s[0] < 0xf8) {
+        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
+            *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
+            return 4;
+        }
+    }
 
     /* Invalid sequence, so just return the byte */
     *uc = *s;