From 9954432e309c8fddaec2fe53e601702a5c981624 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 7 Jan 2012 10:52:53 -0500 Subject: More char16_t and char32_t support It works now for UTF-8 locales --- iconv/gconv_builtin.h | 25 ++-- iconv/gconv_int.h | 8 +- iconv/gconv_simple.c | 340 ++++++++++++++++++++++++++++++++++++++++++++++++-- iconv/iconv_prog.c | 9 +- 4 files changed, 350 insertions(+), 32 deletions(-) (limited to 'iconv') diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h index fd736a4..6820f82 100644 --- a/iconv/gconv_builtin.h +++ b/iconv/gconv_builtin.h @@ -1,5 +1,5 @@ /* Builtin transformations. - Copyright (C) 1997-1999, 2000-2002, 2006, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-1999, 2000-2002, 2006, 2011, 2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. @@ -124,22 +124,15 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODEBIG//", 1, #endif -BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16//", 1, "=ascii->UTF-16", - __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1) +BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "CHAR16", 1, "=ascii->CHAR16", + __gconv_transform_ascii_char16, NULL, 1, 1, 2, 4) -BUILTIN_TRANSFORMATION ("UTF-16//", "ANSI_X3.4-1968//", 1, "=UTF-16->ascii", - __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1) +BUILTIN_TRANSFORMATION ("CHAR16", "ANSI_X3.4-1968//", 1, "=CHAR16->ascii", + __gconv_transform_char16_ascii, NULL, 2, 4, 1, 1) -#if BYTE_ORDER == BIG_ENDIAN -BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16BE//", 1, "=ascii->UTF-16BE", - __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1) -BUILTIN_TRANSFORMATION ("UTF-16BE//", "ANSI_X3.4-1968//", 1, "=UTF-16BE->ascii", - __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1) -#else -BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16LE//", 1, "=ascii->UTF-16LE", - __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1) +BUILTIN_TRANSFORMATION ("ISO-10646/UTF8/", "CHAR16", 1, "=utf8->CHAR16", + 
__gconv_transform_utf8_char16, NULL, 1, 6, 2, 4) -BUILTIN_TRANSFORMATION ("UTF-16LE//", "ANSI_X3.4-1968//", 1, "=UTF-16LE->ascii", - __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1) -#endif +BUILTIN_TRANSFORMATION ("CHAR16", "ISO-10646/UTF8/", 1, "=CHAR16->utf8", + __gconv_transform_char16_utf8, NULL, 2, 4, 1, 6) diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h index 80253dd..79de975 100644 --- a/iconv/gconv_int.h +++ b/iconv/gconv_int.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2005, 2006, 2007, 2011 Free Software Foundation, Inc. +/* Copyright (C) 1997-2005, 2006, 2007, 2011, 2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. @@ -303,8 +303,10 @@ __BUILTIN_TRANSFORM (__gconv_transform_internal_ucs4le); __BUILTIN_TRANSFORM (__gconv_transform_ucs4le_internal); __BUILTIN_TRANSFORM (__gconv_transform_internal_utf16); __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal); -__BUILTIN_TRANSFORM (__gconv_transform_ascii_utf16); -__BUILTIN_TRANSFORM (__gconv_transform_utf16_ascii); +__BUILTIN_TRANSFORM (__gconv_transform_ascii_char16); +__BUILTIN_TRANSFORM (__gconv_transform_char16_ascii); +__BUILTIN_TRANSFORM (__gconv_transform_utf8_char16); +__BUILTIN_TRANSFORM (__gconv_transform_char16_utf8); # undef __BUITLIN_TRANSFORM /* Specialized conversion function for a single byte to INTERNAL, recognizing diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c index b0ef3e6..d145a3e 100644 --- a/iconv/gconv_simple.c +++ b/iconv/gconv_simple.c @@ -1,5 +1,5 @@ /* Simple transformations functions. - Copyright (C) 1997-2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-2005, 2007, 2008, 2009, 2011, 2012 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. 
@@ -1065,6 +1065,7 @@ ucs4le_internal_loop_single (struct __gconv_step *step, \ state->__count = inend - *inptrp; \ \ + assert (ch != 0xc0 && ch != 0xc1); \ if (ch >= 0xc2 && ch < 0xe0) \ { \ /* We expect two bytes. The first byte cannot be 0xc0 or \ @@ -1322,15 +1323,15 @@ ucs4le_internal_loop_single (struct __gconv_step *step, #include <iconv/skeleton.c> -/* Convert from ISO 646-IRV to UTF-16. */ +/* Convert from ISO 646-IRV to the char16_t format. */ #define DEFINE_INIT 0 #define DEFINE_FINI 0 #define MIN_NEEDED_FROM 1 #define MIN_NEEDED_TO 2 #define FROM_DIRECTION 1 -#define FROM_LOOP ascii_utf16_loop -#define TO_LOOP ascii_utf16_loop /* This is not used. */ -#define FUNCTION_NAME __gconv_transform_ascii_utf16 +#define FROM_LOOP ascii_char16_loop +#define TO_LOOP ascii_char16_loop /* This is not used. */ +#define FUNCTION_NAME __gconv_transform_ascii_char16 #define ONE_DIRECTION 1 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM @@ -1358,15 +1359,15 @@ ucs4le_internal_loop_single (struct __gconv_step *step, #include <iconv/skeleton.c> -/* Convert from UTF-16 to ISO 646-IRV. */ +/* Convert from the char16_t format to ISO 646-IRV. */ #define DEFINE_INIT 0 #define DEFINE_FINI 0 #define MIN_NEEDED_FROM 2 #define MIN_NEEDED_TO 1 #define FROM_DIRECTION 1 -#define FROM_LOOP utf16_ascii_loop -#define TO_LOOP utf16_ascii_loop /* This is not used. */ -#define FUNCTION_NAME __gconv_transform_utf16_ascii +#define FROM_LOOP char16_ascii_loop +#define TO_LOOP char16_ascii_loop /* This is not used. */ +#define FUNCTION_NAME __gconv_transform_char16_ascii #define ONE_DIRECTION 1 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM @@ -1383,9 +1384,328 @@ ucs4le_internal_loop_single (struct __gconv_step *step, { \ /* It's an one byte sequence. */ \ *outptr++ = *((const uint16_t *) inptr); \ - inptr += sizeof (uint16_t); \ + inptr += 2; \ + } \ + } +#define LOOP_NEED_FLAGS +#include <iconv/loop.c> +#include <iconv/skeleton.c> + + +/* Convert from the char16_t format to UTF-8.
*/ +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 2 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 1 +#define MAX_NEEDED_TO 6 +#define FROM_DIRECTION 1 +#define FROM_LOOP char16_utf8_loop +#define TO_LOOP char16_utf8_loop /* This is not used. */ +#define FUNCTION_NAME __gconv_transform_char16_utf8 +#define ONE_DIRECTION 1 + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + /* Yes, reading a 16-bit number and storing it as 32-bit is correct. */ \ + uint32_t wc = *((const uint16_t *) inptr); \ + inptr += 2; \ + \ + if (__builtin_expect (wc < 0x80, 1)) \ + /* It's an one byte sequence. */ \ + *outptr++ = (unsigned char) wc; \ + else \ + { \ + size_t step; \ + \ + if (__builtin_expect (wc < 0xd800 || wc > 0xdfff, 1)) \ + step = wc < 0x800 ? 2 : 3; \ + else \ + { \ + if (__builtin_expect (inptr + 2 > inend, 0)) \ + { \ + /* We don't have enough input for another complete input \ + character. */ \ + inptr -= 2; \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + uint32_t sec = *((const uint16_t *) inptr); \ + if (__builtin_expect (sec < 0xdc00, 0) \ + || __builtin_expect (sec > 0xdfff, 0)) \ + { \ + /* This is no valid second word for a surrogate. */ \ + STANDARD_FROM_LOOP_ERR_HANDLER (2); \ + } \ + inptr += 2; \ + wc = ((wc - 0xd7c0) << 10) + (sec - 0xdc00); \ + \ + step = wc < 0x200000 ? 4 : 5; \ + } \ + \ + if (__builtin_expect (outptr + step > outend, 0)) \ + { \ + /* Too long. */ \ + result = __GCONV_FULL_OUTPUT; \ + inptr -= step >= 4 ? 
4 : 2; \ + break; \ + } \ + \ + unsigned char *start = outptr; \ + *outptr = (unsigned char) (~0xff >> step); \ + outptr += step; \ + do \ + { \ + start[--step] = 0x80 | (wc & 0x3f); \ + wc >>= 6; \ + } \ + while (step > 1); \ + start[0] |= wc; \ } \ } #define LOOP_NEED_FLAGS #include <iconv/loop.c> #include <iconv/skeleton.c> + + +/* Convert from UTF-8 to the char16_t format. */ +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 6 +#define MIN_NEEDED_TO 2 +#define MAX_NEEDED_TO 4 +#define FROM_DIRECTION 1 +#define FROM_LOOP utf8_char16_loop +#define TO_LOOP utf8_char16_loop /* This is not used. */ +#define FUNCTION_NAME __gconv_transform_utf8_char16 +#define ONE_DIRECTION 1 + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + /* Next input byte. */ \ + uint32_t ch = *inptr; \ + \ + if (__builtin_expect (ch < 0x80, 1)) \ + { \ + /* One byte sequence. */ \ + *((uint16_t *) outptr) = ch; \ + outptr += 2; \ + ++inptr; \ + } \ + else \ + { \ + uint_fast32_t cnt; \ + uint_fast32_t i; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or 0xc1, \ + otherwise the wide character could have been represented \ + using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + else \ + { \ + /* Search the end of this ill-formed UTF-8 character.
This \ + is the next byte with (x & 0xc0) != 0x80. */ \ + i = 0; \ + do \ + ++i; \ + while (inptr + i < inend \ + && (*(inptr + i) & 0xc0) == 0x80 \ + && i < 5); \ + \ + errout: \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + \ + if (__builtin_expect (inptr + cnt > inend, 0)) \ + { \ + /* We don't have enough input. But before we report that check \ + that all the bytes are correct. */ \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + goto errout; \ + } \ + \ + /* Read the possible remaining bytes. */ \ + for (i = 1; i < cnt; ++i) \ + { \ + uint32_t byte = inptr[i]; \ + \ + if ((byte & 0xc0) != 0x80) \ + /* This is an illegal encoding. */ \ + break; \ + \ + ch <<= 6; \ + ch |= byte & 0x3f; \ + } \ + \ + /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \ + If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \ + have been represented with fewer than cnt bytes. */ \ + if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \ + /* Do not accept UTF-16 surrogates. */ \ + || (ch >= 0xd800 && ch <= 0xdfff)) \ + { \ + /* This is an illegal encoding. */ \ + goto errout; \ + } \ + \ + /* Now adjust the pointers and store the result. */ \ + if (ch < 0x10000) \ + *((uint16_t *) outptr) = ch; \ + else \ + { \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + *((uint16_t *) outptr) = 0xd7c0 + (ch >> 10); \ + outptr += 2; \ + *((uint16_t *) outptr) = 0xdc00 + (ch & 0x3ff); \ + } \ + \ + outptr += 2; \ + inptr += cnt; \ + } \ + } +#define LOOP_NEED_FLAGS + +#define STORE_REST \ + { \ + /* We store the remaining bytes while converting them into the UCS4 \ + format. We can assume that the first byte in the buffer is \ + correct and that it requires a larger number of bytes than there \ + are in the input buffer. 
*/ \ + wint_t ch = **inptrp; \ + size_t cnt, r; \ + \ + state->__count = inend - *inptrp; \ + \ + assert (ch != 0xc0 && ch != 0xc1); \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or \ + 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + \ + /* The first byte is already consumed. */ \ + r = cnt - 1; \ + while (++(*inptrp) < inend) \ + { \ + ch <<= 6; \ + ch |= **inptrp & 0x3f; \ + --r; \ + } \ + \ + /* Shift for the so far missing bytes. */ \ + ch <<= r * 6; \ + \ + /* Store the number of bytes expected for the entire sequence. */ \ + state->__count |= cnt << 8; \ + \ + /* Store the value. 
*/ \ + state->__value.__wch = ch; \ + } + +#define UNPACK_BYTES \ + { \ + static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \ + wint_t wch = state->__value.__wch; \ + size_t ntotal = state->__count >> 8; \ + \ + inlen = state->__count & 255; \ + \ + bytebuf[0] = inmask[ntotal - 2]; \ + \ + do \ + { \ + if (--ntotal < inlen) \ + bytebuf[ntotal] = 0x80 | (wch & 0x3f); \ + wch >>= 6; \ + } \ + while (ntotal > 1); \ + \ + bytebuf[0] |= wch; \ + } + +#define CLEAR_STATE \ + state->__count = 0 + + +#include <iconv/loop.c> +#include <iconv/skeleton.c> diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c index 59c6402..13facc8 100644 --- a/iconv/iconv_prog.c +++ b/iconv/iconv_prog.c @@ -719,10 +719,12 @@ add_known_names (struct gconv_module *node) add_known_names (node->right); do { - if (strcmp (node->from_string, "INTERNAL")) + if (strcmp (node->from_string, "INTERNAL") != 0 + && strcmp (node->from_string, "CHAR16") != 0) tsearch (node->from_string, &printlist, (__compar_fn_t) strverscmp); - if (strcmp (node->to_string, "INTERNAL") != 0) + if (strcmp (node->to_string, "INTERNAL") != 0 + && strcmp (node->to_string, "CHAR16") != 0) tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp); node = node->same; @@ -748,7 +750,8 @@ insert_cache (void) { const char *str = strtab + hashtab[cnt].string_offset; - if (strcmp (str, "INTERNAL") != 0) + if (strcmp (str, "INTERNAL") != 0 + && strcmp (str, "CHAR16") != 0) tsearch (str, &printlist, (__compar_fn_t) strverscmp); } } -- cgit v1.1