From 50031440a3b7ab2623e9468bd20e837250250cd9 Mon Sep 17 00:00:00 2001 From: Petri Lehtinen Date: Sat, 5 Dec 2009 22:55:30 +0200 Subject: Implement JSON_ENSURE_ASCII encoding flag With this flag, all Unicode characters outside the ASCII range are escaped. --- src/dump.c | 67 +++++++++++++++++++++++++++++++++++++++++++++-------------- src/jansson.h | 5 +++-- src/load.c | 2 +- src/utf.c | 33 +++++++++++++++++++++++++++-- src/utf.h | 3 ++- 5 files changed, 88 insertions(+), 22 deletions(-) (limited to 'src') diff --git a/src/dump.c b/src/dump.c index 8d2a82b..dc3fcbc 100644 --- a/src/dump.c +++ b/src/dump.c @@ -14,6 +14,7 @@ #include #include "jansson_private.h" #include "strbuffer.h" +#include "utf.h" #define MAX_INTEGER_STR_LENGTH 100 #define MAX_REAL_STR_LENGTH 100 @@ -65,34 +66,49 @@ static int dump_indent(unsigned long flags, int depth, int space, dump_func dump return 0; } -static int dump_string(const char *str, dump_func dump, void *data) +static int dump_string(const char *str, int ascii, dump_func dump, void *data) { - const char *end; + const char *pos, *end; + int32_t codepoint; if(dump("\"", 1, data)) return -1; - end = str; + end = pos = str; while(1) { const char *text; - char seq[7]; + char seq[13]; int length; - while(*end && *end != '\\' && *end != '"' && (unsigned char)*end > 0x1F) - end++; + while(*end) + { + end = utf8_iterate(pos, &codepoint); + if(!end) + return -1; - if(end != str) { - if(dump(str, end - str, data)) + /* mandatory escape or control char */ + if(codepoint == '\\' || codepoint == '"' || codepoint < 0x20) + break; + + /* non-ASCII */ + if(ascii && codepoint > 0x7F) + break; + + pos = end; + } + + if(pos != str) { + if(dump(str, pos - str, data)) return -1; } - if(!*end) + if(end == pos) break; /* handle \, ", and control codes */ length = 2; - switch(*end) + switch(codepoint) { case '\\': text = "\\\\"; break; case '\"': text = "\\\""; break; @@ -103,9 +119,27 @@ static int dump_string(const char *str, dump_func dump, void *data) case '\t': text = "\\t"; break; default: { - sprintf(seq, "\\u00%02x", *end); + /* codepoint is in BMP */ + if(codepoint < 0x10000) + { + sprintf(seq, "\\u%04x", codepoint); + length = 6; + } + + /* not in BMP -> construct a UTF-16 surrogate pair */ + else + { + int32_t first, last; + + codepoint -= 0x10000; + first = 0xD800 | ((codepoint & 0xffc00) >> 10); + last = 0xDC00 | (codepoint & 0x003ff); + + sprintf(seq, "\\u%04x\\u%04x", first, last); + length = 12; + } + text = seq; - length = 6; break; } } @@ -113,8 +147,7 @@ static int dump_string(const char *str, dump_func dump, void *data) if(dump(text, length, data)) return -1; - end++; - str = end; + str = pos = end; } return dump("\"", 1, data); @@ -123,6 +156,8 @@ static int dump_string(const char *str, dump_func dump, void *data) static int do_dump(const json_t *json, unsigned long flags, int depth, dump_func dump, void *data) { + int ascii = flags & JSON_ENSURE_ASCII ? 1 : 0; + switch(json_typeof(json)) { case JSON_NULL: return dump("null", 4, data); @@ -158,7 +193,7 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, } case JSON_STRING: - return dump_string(json_string_value(json), dump, data); + return dump_string(json_string_value(json), ascii, dump, data); case JSON_ARRAY: { @@ -238,7 +273,7 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, { void *next = json_object_iter_next((json_t *)json, iter); - dump_string(json_object_iter_key(iter), dump, data); + dump_string(json_object_iter_key(iter), ascii, dump, data); if(dump(separator, separator_length, data) || do_dump(json_object_iter_value(iter), flags, depth + 1, dump, data)) diff --git a/src/jansson.h b/src/jansson.h index c8a5a90..d59fe10 100644 --- a/src/jansson.h +++ b/src/jansson.h @@ -141,8 +141,9 @@ json_t *json_loads(const char *input, json_error_t *error); json_t *json_loadf(FILE *input, json_error_t *error); json_t *json_load_file(const char *path, json_error_t *error); -#define JSON_INDENT(n) (n & 0xFF) -#define JSON_COMPACT 0x100 +#define JSON_INDENT(n) (n & 0xFF) +#define JSON_COMPACT 0x100 +#define JSON_ENSURE_ASCII 0x200 char *json_dumps(const json_t *json, unsigned long flags); int json_dumpf(const json_t *json, FILE *output, unsigned long flags); diff --git a/src/load.c b/src/load.c index 32d6500..278f35e 100644 --- a/src/load.c +++ b/src/load.c @@ -149,7 +149,7 @@ static char stream_get(stream_t *stream, json_error_t *error) for(i = 1; i < count; i++) stream->buffer[i] = stream->get(stream->data); - if(!utf8_check_full(stream->buffer, count)) + if(!utf8_check_full(stream->buffer, count, NULL)) goto out; stream->stream_pos += count; diff --git a/src/utf.c b/src/utf.c index 2efcb68..dda80f0 100644 --- a/src/utf.c +++ b/src/utf.c @@ -80,7 +80,7 @@ int utf8_check_first(char byte) } } -int utf8_check_full(const char *buffer, int size) +int utf8_check_full(const char *buffer, int size, int32_t *codepoint) { int i; int32_t value = 0; @@ -130,9 +130,38 @@ int utf8_check_full(const char *buffer, int size) return 0; } + if(codepoint) + *codepoint = value; + return 1; } +const char *utf8_iterate(const char *buffer, int32_t *codepoint) +{ + int count; + int32_t value; + + if(!*buffer) + return buffer; + + count = utf8_check_first(buffer[0]); + if(count <= 0) + return NULL; + + if(count == 1) + value = (unsigned char)buffer[0]; + else + { + if(!utf8_check_full(buffer, count, &value)) + return NULL; + } + + if(codepoint) + *codepoint = value; + + return buffer + count; +} + int utf8_check_string(const char *string, int length) { int i; @@ -150,7 +179,7 @@ int utf8_check_string(const char *string, int length) if(i + count > length) return 0; - if(!utf8_check_full(&string[i], count)) + if(!utf8_check_full(&string[i], count, NULL)) return 0; i += count - 1; diff --git a/src/utf.h b/src/utf.h index 75d7b6e..03fba69 100644 --- a/src/utf.h +++ b/src/utf.h @@ -11,7 +11,8 @@ int utf8_encode(int codepoint, char *buffer, int *size); int utf8_check_first(char byte); -int utf8_check_full(const char *buffer, int size); +int utf8_check_full(const char *buffer, int size, int32_t *codepoint); +const char *utf8_iterate(const char *buffer, int32_t *codepoint); int utf8_check_string(const char *string, int length); -- cgit v1.1