From e59f39d40397645477b959255aedfa17a7c9c779 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 23 Aug 2018 18:39:49 +0200 Subject: json: Reject invalid UTF-8 sequences We reject bytes that can't occur in valid UTF-8 (\xC0..\xC1, \xF5..\xFF in the lexer. That's insufficient; there's plenty of invalid UTF-8 not containing these bytes, as demonstrated by check-qjson: * Malformed sequences - Unexpected continuation bytes - Missing continuation bytes after start bytes other than \xC0..\xC1, \xF5..\xFD. * Overlong sequences with start bytes other than \xC0..\xC1, \xF5..\xFD. * Invalid code points Fixing this in the lexer would be bothersome. Fixing it in the parser is straightforward, so do that. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Message-Id: <20180823164025.12553-23-armbru@redhat.com> --- util/unicode.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 7 deletions(-) (limited to 'util/unicode.c') diff --git a/util/unicode.c b/util/unicode.c index a812a35..8580bc5 100644 --- a/util/unicode.c +++ b/util/unicode.c @@ -13,6 +13,21 @@ #include "qemu/osdep.h" #include "qemu/unicode.h" +static bool is_valid_codepoint(int codepoint) +{ + if (codepoint > 0x10FFFFu) { + return false; /* beyond Unicode range */ + } + if ((codepoint >= 0xFDD0 && codepoint <= 0xFDEF) + || (codepoint & 0xFFFE) == 0xFFFE) { + return false; /* noncharacter */ + } + if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { + return false; /* surrogate code point */ + } + return true; +} + /** * mod_utf8_codepoint: * @s: string encoded in modified UTF-8 @@ -83,13 +98,8 @@ int mod_utf8_codepoint(const char *s, size_t n, char **end) cp <<= 6; cp |= byte & 0x3F; } - if (cp > 0x10FFFF) { - cp = -1; /* beyond Unicode range */ - } else if ((cp >= 0xFDD0 && cp <= 0xFDEF) - || (cp & 0xFFFE) == 0xFFFE) { - cp = -1; /* noncharacter */ - } else if (cp >= 0xD800 && cp <= 0xDFFF) { - cp = -1; /* surrogate code point */ + if (!is_valid_codepoint(cp)) { + cp = -1; } else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) { cp = -1; /* overlong, not \xC0\x80 */ } @@ -99,3 +109,48 @@ out: *end = (char *)p; return cp; } + +/** + * mod_utf8_encode: + * @buf: Destination buffer + * @bufsz: size of @buf, at least 5. + * @codepoint: Unicode codepoint to encode + * + * Convert Unicode codepoint @codepoint to modified UTF-8. + * + * Returns: the length of the UTF-8 sequence on success, -1 when + * @codepoint is invalid. + */ +ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint) +{ + assert(bufsz >= 5); + + if (!is_valid_codepoint(codepoint)) { + return -1; + } + + if (codepoint > 0 && codepoint <= 0x7F) { + buf[0] = codepoint & 0x7F; + buf[1] = 0; + return 1; + } + if (codepoint <= 0x7FF) { + buf[0] = 0xC0 | ((codepoint >> 6) & 0x1F); + buf[1] = 0x80 | (codepoint & 0x3F); + buf[2] = 0; + return 2; + } + if (codepoint <= 0xFFFF) { + buf[0] = 0xE0 | ((codepoint >> 12) & 0x0F); + buf[1] = 0x80 | ((codepoint >> 6) & 0x3F); + buf[2] = 0x80 | (codepoint & 0x3F); + buf[3] = 0; + return 3; + } + buf[0] = 0xF0 | ((codepoint >> 18) & 0x07); + buf[1] = 0x80 | ((codepoint >> 12) & 0x3F); + buf[2] = 0x80 | ((codepoint >> 6) & 0x3F); + buf[3] = 0x80 | (codepoint & 0x3F); + buf[4] = 0; + return 4; +} -- cgit v1.1