aboutsummaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
author Steve Bennett <steveb@workware.net.au> 2017-12-14 20:27:53 +1000
committer Steve Bennett <steveb@workware.net.au> 2017-12-31 11:47:55 +1000
commit 6fd58cfc22b0968e71f67f378555aba74e392847 (patch)
tree 58c8a628858657dded758c818361af325a235c8c /utf8.c
parent 2d2f74ebfeeb056130a37fec19189766a85cec81 (diff)
download jimtcl-6fd58cfc22b0968e71f67f378555aba74e392847.zip
jimtcl-6fd58cfc22b0968e71f67f378555aba74e392847.tar.gz
jimtcl-6fd58cfc22b0968e71f67f378555aba74e392847.tar.bz2
utf8: Be more strict at rejecting invalid UTF-8 sequences.
RFC 3629 says: Implementations of the decoding algorithm above MUST protect against decoding invalid sequences.

Signed-off-by: Steve Bennett <steveb@workware.net.au>
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 15
1 files changed, 12 insertions, 3 deletions
diff --git a/utf8.c b/utf8.c
index 2698f66..56a036b 100644
--- a/utf8.c
+++ b/utf8.c
@@ -131,19 +131,28 @@ int utf8_tounicode(const char *str, int *uc)
if (s[0] < 0xe0) {
if ((s[1] & 0xc0) == 0x80) {
*uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80);
- return 2;
+ if (*uc >= 0x80) {
+ return 2;
+ }
+ /* Otherwise this is an invalid sequence */
}
}
else if (s[0] < 0xf0) {
if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
*uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
- return 3;
+ if (*uc >= 0x800) {
+ return 3;
+ }
+ /* Otherwise this is an invalid sequence */
}
}
else if (s[0] < 0xf8) {
if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
*uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
- return 4;
+ if (*uc >= 0x10000) {
+ return 4;
+ }
+ /* Otherwise this is an invalid sequence */
}
}