aboutsummaryrefslogtreecommitdiff
path: root/iconv
diff options
context:
space:
mode:
author Ulrich Drepper <drepper@gmail.com> 2012-01-07 10:52:53 -0500
committer Ulrich Drepper <drepper@gmail.com> 2012-01-07 10:52:53 -0500
commit 9954432e309c8fddaec2fe53e601702a5c981624 (patch)
tree 3eb7513694e25391b3393afbb847dbd85ebf097a /iconv
parent c3a87236702cb73be1dada3438bbd3c3934e83f8 (diff)
downloadglibc-9954432e309c8fddaec2fe53e601702a5c981624.zip
glibc-9954432e309c8fddaec2fe53e601702a5c981624.tar.gz
glibc-9954432e309c8fddaec2fe53e601702a5c981624.tar.bz2
More char16_t and char32_t support
It works now for UTF-8 locales
Diffstat (limited to 'iconv')
-rw-r--r-- iconv/gconv_builtin.h 25
-rw-r--r-- iconv/gconv_int.h 8
-rw-r--r-- iconv/gconv_simple.c 340
-rw-r--r-- iconv/iconv_prog.c 9
4 files changed, 350 insertions, 32 deletions
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index fd736a4..6820f82 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -1,5 +1,5 @@
/* Builtin transformations.
- Copyright (C) 1997-1999, 2000-2002, 2006, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-1999, 2000-2002, 2006, 2011, 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
@@ -124,22 +124,15 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODEBIG//", 1,
#endif
-BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16//", 1, "=ascii->UTF-16",
- __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1)
+BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "CHAR16", 1, "=ascii->CHAR16",
+ __gconv_transform_ascii_char16, NULL, 1, 1, 2, 4)
-BUILTIN_TRANSFORMATION ("UTF-16//", "ANSI_X3.4-1968//", 1, "=UTF-16->ascii",
- __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1)
+BUILTIN_TRANSFORMATION ("CHAR16", "ANSI_X3.4-1968//", 1, "=CHAR16->ascii",
+ __gconv_transform_char16_ascii, NULL, 2, 4, 1, 1)
-#if BYTE_ORDER == BIG_ENDIAN
-BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16BE//", 1, "=ascii->UTF-16BE",
- __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1)
-BUILTIN_TRANSFORMATION ("UTF-16BE//", "ANSI_X3.4-1968//", 1, "=UTF-16BE->ascii",
- __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1)
-#else
-BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "UTF-16LE//", 1, "=ascii->UTF-16LE",
- __gconv_transform_ascii_utf16, NULL, 2, 2, 1, 1)
+BUILTIN_TRANSFORMATION ("ISO-10646/UTF8/", "CHAR16", 1, "=utf8->CHAR16",
+ __gconv_transform_utf8_char16, NULL, 1, 6, 2, 4)
-BUILTIN_TRANSFORMATION ("UTF-16LE//", "ANSI_X3.4-1968//", 1, "=UTF-16LE->ascii",
- __gconv_transform_utf16_ascii, NULL, 2, 2, 1, 1)
-#endif
+BUILTIN_TRANSFORMATION ("CHAR16", "ISO-10646/UTF8/", 1, "=CHAR16->utf8",
+ __gconv_transform_char16_utf8, NULL, 2, 4, 1, 6)
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 80253dd..79de975 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2005, 2006, 2007, 2011 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2005, 2006, 2007, 2011, 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
@@ -303,8 +303,10 @@ __BUILTIN_TRANSFORM (__gconv_transform_internal_ucs4le);
__BUILTIN_TRANSFORM (__gconv_transform_ucs4le_internal);
__BUILTIN_TRANSFORM (__gconv_transform_internal_utf16);
__BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
-__BUILTIN_TRANSFORM (__gconv_transform_ascii_utf16);
-__BUILTIN_TRANSFORM (__gconv_transform_utf16_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_ascii_char16);
+__BUILTIN_TRANSFORM (__gconv_transform_char16_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_utf8_char16);
+__BUILTIN_TRANSFORM (__gconv_transform_char16_utf8);
# undef __BUITLIN_TRANSFORM
/* Specialized conversion function for a single byte to INTERNAL, recognizing
diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c
index b0ef3e6..d145a3e 100644
--- a/iconv/gconv_simple.c
+++ b/iconv/gconv_simple.c
@@ -1,5 +1,5 @@
/* Simple transformations functions.
- Copyright (C) 1997-2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-2005, 2007, 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
@@ -1065,6 +1065,7 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
\
state->__count = inend - *inptrp; \
\
+ assert (ch != 0xc0 && ch != 0xc1); \
if (ch >= 0xc2 && ch < 0xe0) \
{ \
/* We expect two bytes. The first byte cannot be 0xc0 or \
@@ -1322,15 +1323,15 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
#include <iconv/skeleton.c>
-/* Convert from ISO 646-IRV to UTF-16. */
+/* Convert from ISO 646-IRV to the char16_t format. */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 1
#define MIN_NEEDED_TO 2
#define FROM_DIRECTION 1
-#define FROM_LOOP ascii_utf16_loop
-#define TO_LOOP ascii_utf16_loop /* This is not used. */
-#define FUNCTION_NAME __gconv_transform_ascii_utf16
+#define FROM_LOOP ascii_char16_loop
+#define TO_LOOP ascii_char16_loop /* This is not used. */
+#define FUNCTION_NAME __gconv_transform_ascii_char16
#define ONE_DIRECTION 1
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
@@ -1358,15 +1359,15 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
#include <iconv/skeleton.c>
-/* Convert from UTF-16 to ISO 646-IRV. */
+/* Convert from the char16_t format to ISO 646-IRV. */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 2
#define MIN_NEEDED_TO 1
#define FROM_DIRECTION 1
-#define FROM_LOOP utf16_ascii_loop
-#define TO_LOOP utf16_ascii_loop /* This is not used. */
-#define FUNCTION_NAME __gconv_transform_utf16_ascii
+#define FROM_LOOP char16_ascii_loop
+#define TO_LOOP char16_ascii_loop /* This is not used. */
+#define FUNCTION_NAME __gconv_transform_char16_ascii
#define ONE_DIRECTION 1
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
@@ -1383,9 +1384,328 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
{ \
/* It's an one byte sequence. */ \
*outptr++ = *((const uint16_t *) inptr); \
- inptr += sizeof (uint16_t); \
+ inptr += 2; \
+ } \
+ }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the char16_t format (16-bit words read in native byte order) to UTF-8. */
+#define DEFINE_INIT 0
+#define DEFINE_FINI 0
+#define MIN_NEEDED_FROM 2 /* One 16-bit word. */
+#define MAX_NEEDED_FROM 4 /* Two words: a surrogate pair. */
+#define MIN_NEEDED_TO 1
+#define MAX_NEEDED_TO 6 /* NOTE(review): BODY writes at most 5 bytes; confirm 6 is intended. */
+#define FROM_DIRECTION 1
+#define FROM_LOOP char16_utf8_loop
+#define TO_LOOP char16_utf8_loop /* This is not used. */
+#define FUNCTION_NAME __gconv_transform_char16_utf8
+#define ONE_DIRECTION 1
+
+#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
+#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
+#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
+#define LOOPFCT FROM_LOOP
+#define BODY \
+  { \
+    /* Convert one character (a single word or a surrogate pair) to UTF-8. \
+       Reading a 16-bit number and storing it as 32-bit is intended.  */ \
+    uint32_t wc = *((const uint16_t *) inptr); \
+    inptr += 2; \
+    if (__builtin_expect (wc < 0x80, 1)) \
+      /* It's a one-byte sequence.  */ \
+      *outptr++ = (unsigned char) wc; \
+    else \
+      { \
+        size_t step; \
+ \
+        if (__builtin_expect (wc < 0xd800 || wc > 0xdfff, 1)) \
+          step = wc < 0x800 ? 2 : 3; \
+        else \
+          { \
+            /* An isolated low surrogate is ill-formed; rewind to it first.  */ \
+            if (__builtin_expect (wc > 0xdbff, 0)) \
+              { inptr -= 2; STANDARD_FROM_LOOP_ERR_HANDLER (2); } \
+            if (__builtin_expect (inptr + 2 > inend, 0)) \
+              { \
+                /* The second word of the pair is not available yet.  */ \
+                inptr -= 2; \
+                result = __GCONV_INCOMPLETE_INPUT; \
+                break; \
+              } \
+            uint32_t sec = *((const uint16_t *) inptr); \
+            if (__builtin_expect (sec < 0xdc00, 0) \
+                || __builtin_expect (sec > 0xdfff, 0)) \
+              { \
+                /* This is no valid second word for a surrogate.  */ \
+                STANDARD_FROM_LOOP_ERR_HANDLER (2); \
+              } \
+            inptr += 2; \
+            wc = ((wc - 0xd7c0) << 10) + (sec - 0xdc00); \
+            step = 4; /* A valid pair is U+10000..U+10FFFF: four bytes.  */ \
+          } \
+ \
+        if (__builtin_expect (outptr + step > outend, 0)) \
+          { \
+            /* Too long.  Give back the one or two words consumed above.  */ \
+            result = __GCONV_FULL_OUTPUT; \
+            inptr -= step >= 4 ? 4 : 2; \
+            break; \
+          } \
+        /* Set the length bits of the lead byte, then fill back to front.  */ \
+        unsigned char *start = outptr; \
+        *outptr = (unsigned char) (~0xff >> step); \
+        outptr += step; \
+        do \
+          { \
+            start[--step] = 0x80 | (wc & 0x3f); \
+            wc >>= 6; \
+          } \
+        while (step > 1); \
+        start[0] |= wc; \
      } \
  }
#define LOOP_NEED_FLAGS
#include <iconv/loop.c>
#include <iconv/skeleton.c>
+
+
+/* Convert from UTF-8 to the char16_t format (16-bit words stored in native byte order). */
+#define DEFINE_INIT 0
+#define DEFINE_FINI 0
+#define MIN_NEEDED_FROM 1 /* A single byte below 0x80. */
+#define MAX_NEEDED_FROM 6 /* Longest lead-byte form BODY recognizes. */
+#define MIN_NEEDED_TO 2 /* One 16-bit word. */
+#define MAX_NEEDED_TO 4 /* Two words: a surrogate pair. */
+#define FROM_DIRECTION 1
+#define FROM_LOOP utf8_char16_loop
+#define TO_LOOP utf8_char16_loop /* This is not used. */
+#define FUNCTION_NAME __gconv_transform_utf8_char16
+#define ONE_DIRECTION 1
+/* NOTE(review): MAX_NEEDED_OUTPUT is not defined below, unlike in the char16_t to UTF-8 unit -- confirm this is intended. */
+#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
+#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
+#define LOOPFCT FROM_LOOP
+#define BODY \
+  { \
+    /* Decode one UTF-8 sequence into one char16_t word, or into a \
+       surrogate pair for code points above U+FFFF.  */ \
+    uint32_t ch = *inptr; \
+    if (__builtin_expect (ch < 0x80, 1)) \
+      { \
+        /* One byte sequence.  */ \
+        *((uint16_t *) outptr) = ch; \
+        outptr += 2; \
+        ++inptr; \
+      } \
+    else \
+      { \
+        uint_fast32_t cnt; \
+        uint_fast32_t i; \
+ \
+        if (ch >= 0xc2 && ch < 0xe0) \
+          { \
+            /* We expect two bytes.  The first byte cannot be 0xc0 or 0xc1, \
+               otherwise the wide character could have been represented \
+               using a single byte.  */ \
+            cnt = 2; \
+            ch &= 0x1f; \
+          } \
+        else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \
+          { \
+            /* We expect three bytes.  */ \
+            cnt = 3; \
+            ch &= 0x0f; \
+          } \
+        else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \
+          { \
+            /* We expect four bytes.  */ \
+            cnt = 4; \
+            ch &= 0x07; \
+          } \
+        else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \
+          { \
+            /* We expect five bytes (always out of range, rejected below).  */ \
+            cnt = 5; \
+            ch &= 0x03; \
+          } \
+        else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \
+          { \
+            /* We expect six bytes (always out of range, rejected below).  */ \
+            cnt = 6; \
+            ch &= 0x01; \
+          } \
+        else \
+          { \
+            /* Search the end of this ill-formed UTF-8 character.  This \
+               is the next byte with (x & 0xc0) != 0x80.  */ \
+            i = 0; \
+            do \
+              ++i; \
+            while (inptr + i < inend \
+                   && (*(inptr + i) & 0xc0) == 0x80 \
+                   && i < 5); \
+ \
+          errout: \
+            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
+          } \
+ \
+        if (__builtin_expect (inptr + cnt > inend, 0)) \
+          { \
+            /* We don't have enough input.  But before we report that check \
+               that all the bytes are correct.  */ \
+            for (i = 1; inptr + i < inend; ++i) \
+              if ((inptr[i] & 0xc0) != 0x80) \
+                break; \
+ \
+            if (__builtin_expect (inptr + i == inend, 1)) \
+              { \
+                result = __GCONV_INCOMPLETE_INPUT; \
+                break; \
+              } \
+ \
+            goto errout; \
+          } \
+ \
+        /* Read the possible remaining bytes.  */ \
+        for (i = 1; i < cnt; ++i) \
+          { \
+            uint32_t byte = inptr[i]; \
+ \
+            if ((byte & 0xc0) != 0x80) \
+              /* This is an illegal encoding.  */ \
+              break; \
+ \
+            ch <<= 6; \
+            ch |= byte & 0x3f; \
+          } \
+ \
+        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
+           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
+           have been represented with fewer than cnt bytes.  */ \
+        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
+            /* Accept neither surrogates nor values UTF-16 cannot encode.  */ \
+            || (ch >= 0xd800 && ch <= 0xdfff) || ch > 0x10ffff) \
+          { \
+            /* This is an illegal encoding.  */ \
+            goto errout; \
+          } \
+ \
+        /* Now adjust the pointers and store the result.  */ \
+        if (ch < 0x10000) \
+          *((uint16_t *) outptr) = ch; \
+        else \
+          { \
+            if (__builtin_expect (outptr + 4 > outend, 0)) \
+              { \
+                result = __GCONV_FULL_OUTPUT; \
+                break; \
+              } \
+ \
+            *((uint16_t *) outptr) = 0xd7c0 + (ch >> 10); \
+            outptr += 2; \
+            *((uint16_t *) outptr) = 0xdc00 + (ch & 0x3ff); \
+          } \
+ \
+        outptr += 2; \
+        inptr += cnt; \
+      } \
+  }
+#define LOOP_NEED_FLAGS
+
+#define STORE_REST \
+ { \
+ /* Save an incomplete trailing sequence in STATE: the bytes seen so far \
+ are folded into a partial UCS4 value. We can assume that the first \
+ byte in the buffer is correct and that it requires a larger number \
+ of bytes than there are in the input buffer. */ \
+ wint_t ch = **inptrp; \
+ size_t cnt, r; \
+ \
+ state->__count = inend - *inptrp; /* Number of bytes available now. */ \
+ \
+ assert (ch != 0xc0 && ch != 0xc1); \
+ if (ch >= 0xc2 && ch < 0xe0) \
+ { \
+ /* We expect two bytes. The first byte cannot be 0xc0 or \
+ 0xc1, otherwise the wide character could have been \
+ represented using a single byte. */ \
+ cnt = 2; \
+ ch &= 0x1f; \
+ } \
+ else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \
+ { \
+ /* We expect three bytes. */ \
+ cnt = 3; \
+ ch &= 0x0f; \
+ } \
+ else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \
+ { \
+ /* We expect four bytes. */ \
+ cnt = 4; \
+ ch &= 0x07; \
+ } \
+ else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \
+ { \
+ /* We expect five bytes. */ \
+ cnt = 5; \
+ ch &= 0x03; \
+ } \
+ else \
+ { \
+ /* We expect six bytes. */ \
+ cnt = 6; \
+ ch &= 0x01; \
+ } \
+ \
+ /* The first byte is already consumed; fold in every remaining byte. */ \
+ r = cnt - 1; \
+ while (++(*inptrp) < inend) \
+ { \
+ ch <<= 6; \
+ ch |= **inptrp & 0x3f; \
+ --r; \
+ } \
+ \
+ /* Shift left as if the r missing trail bytes were all zero. */ \
+ ch <<= r * 6; \
+ \
+ /* Store the number of bytes expected for the entire sequence above the low byte of __count. */ \
+ state->__count |= cnt << 8; \
+ \
+ /* Store the value. */ \
+ state->__value.__wch = ch; \
+ }
+
+#define UNPACK_BYTES \
+ { \
+ static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
+ wint_t wch = state->__value.__wch; \
+ size_t ntotal = state->__count >> 8; \
+ /* Inverse of STORE_REST: rebuild the saved raw bytes in bytebuf. */ \
+ inlen = state->__count & 255; \
+ /* Lead-byte marker bits for a sequence of ntotal bytes. */ \
+ bytebuf[0] = inmask[ntotal - 2]; \
+ /* Peel six bits per trail position, last first; positions >= inlen were never read. */ \
+ do \
+ { \
+ if (--ntotal < inlen) \
+ bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
+ wch >>= 6; \
+ } \
+ while (ntotal > 1); \
+ /* The bits left in wch are the payload of the lead byte. */ \
+ bytebuf[0] |= wch; \
+ }
+
+#define CLEAR_STATE \
+ state->__count = 0 /* Zero the byte counts that STORE_REST packs into __count. */
+
+
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
index 59c6402..13facc8 100644
--- a/iconv/iconv_prog.c
+++ b/iconv/iconv_prog.c
@@ -719,10 +719,12 @@ add_known_names (struct gconv_module *node)
add_known_names (node->right);
do
{
- if (strcmp (node->from_string, "INTERNAL"))
+ if (strcmp (node->from_string, "INTERNAL") != 0
+ && strcmp (node->from_string, "CHAR16") != 0)
tsearch (node->from_string, &printlist,
(__compar_fn_t) strverscmp);
- if (strcmp (node->to_string, "INTERNAL") != 0)
+ if (strcmp (node->to_string, "INTERNAL") != 0
+ && strcmp (node->to_string, "CHAR16") != 0)
tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
node = node->same;
@@ -748,7 +750,8 @@ insert_cache (void)
{
const char *str = strtab + hashtab[cnt].string_offset;
- if (strcmp (str, "INTERNAL") != 0)
+ if (strcmp (str, "INTERNAL") != 0
+ && strcmp (str, "CHAR16") != 0)
tsearch (str, &printlist, (__compar_fn_t) strverscmp);
}
}