aboutsummaryrefslogtreecommitdiff
path: root/newlib/libc/stdlib/mbtowc_r.c
diff options
context:
space:
mode:
author: Corinna Vinschen <corinna@vinschen.de> 2009-02-25 09:10:09 +0000
committer: Corinna Vinschen <corinna@vinschen.de> 2009-02-25 09:10:09 +0000
commit: 8d8bf5a5e292a2b436ad4d5dedd4595ecf15f9ee (patch)
tree: f1e9eafe5ee6277ac116cc680ff322d2ffe9366d /newlib/libc/stdlib/mbtowc_r.c
parent: 56eafaf6e399ac17343e845b4f6bbcf93be61f94 (diff)
download: newlib-8d8bf5a5e292a2b436ad4d5dedd4595ecf15f9ee.zip
download: newlib-8d8bf5a5e292a2b436ad4d5dedd4595ecf15f9ee.tar.gz
download: newlib-8d8bf5a5e292a2b436ad4d5dedd4595ecf15f9ee.tar.bz2
* mbtowc_r.c (_mbtowc_r): Remove conversion of 5 and 6 byte UTF-8
sequences since they are invalid in the Unicode standard.
Handle surrogate pairs in case of wchar_t == UTF-16.
* wctomb_r.c (_wctomb_r): Don't convert invalid Unicode wchar_t
values beyond 0x10ffff into UTF-8 chars.  Handle surrogate pairs in
case of wchar_t == UTF-16.
Diffstat (limited to 'newlib/libc/stdlib/mbtowc_r.c')
-rw-r--r-- newlib/libc/stdlib/mbtowc_r.c 153
1 files changed, 32 insertions, 121 deletions
diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c
index 71bbf85..00021be 100644
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@@ -75,6 +75,18 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
if (s == NULL)
return 0; /* UTF-8 character encodings are not state-dependent */
+ if (state->__count == 4)
+ {
+ /* Create the second half of the surrogate pair. For a description
+ see the comment below. */
+ wint_t tmp = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
+ | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
+ | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
+ | (wchar_t)(state->__value.__wchb[3] & 0x3f);
+ state->__count = 0;
+ *pwc = 0xdc00 | ((tmp - 0x10000) & 0x3ff);
+ return 2;
+ }
if (state->__count == 0)
ch = t[i++];
else
@@ -153,8 +165,7 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
else if (ch >= 0xf0 && ch <= 0xf7)
{
/* four-byte sequence */
- if (sizeof(wchar_t) < 4)
- return -1; /* we can't store such a value */
+ wint_t tmp;
state->__value.__wchb[0] = ch;
if (state->__count == 0)
state->__count = 1;
@@ -185,125 +196,25 @@ _DEFUN (_mbtowc_r, (r, pwc, s, n, state),
ch = t[i++];
if (ch < 0x80 || ch > 0xbf)
return -1;
- *pwc = (wchar_t)((state->__value.__wchb[0] & 0x07) << 18)
- | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 12)
- | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 6)
- | (wchar_t)(ch & 0x3f);
-
- state->__count = 0;
- return i;
- }
- else if (ch >= 0xf8 && ch <= 0xfb)
- {
- /* five-byte sequence */
- if (sizeof(wchar_t) < 4)
- return -1; /* we can't store such a value */
- state->__value.__wchb[0] = ch;
- if (state->__count == 0)
- state->__count = 1;
- else if (n < (size_t)-1)
- ++n;
- if (n < 2)
- return -2;
- ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
- if (state->__value.__wchb[0] == 0xf8 && ch < 0x88)
- /* overlong UTF-8 sequence */
- return -1;
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[1] = ch;
- if (state->__count == 1)
- state->__count = 2;
- else if (n < (size_t)-1)
- ++n;
- if (n < 3)
- return -2;
- ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[2] = ch;
- if (state->__count == 2)
- state->__count = 3;
- else if (n < (size_t)-1)
- ++n;
- if (n < 4)
- return -2;
- ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[3] = ch;
- state->__count = 4;
- if (n < 5)
- return -2;
- ch = t[i++];
- *pwc = (wchar_t)((state->__value.__wchb[0] & 0x03) << 24)
- | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 18)
- | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 12)
- | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 6)
- | (wchar_t)(ch & 0x3f);
-
- state->__count = 0;
- return i;
- }
- else if (ch >= 0xfc && ch <= 0xfd)
- {
- /* six-byte sequence */
- int ch2;
- if (sizeof(wchar_t) < 4)
- return -1; /* we can't store such a value */
- state->__value.__wchb[0] = ch;
- if (state->__count == 0)
- state->__count = 1;
- else if (n < (size_t)-1)
- ++n;
- if (n < 2)
- return -2;
- ch = (state->__count == 1) ? t[i++] : state->__value.__wchb[1];
- if (state->__value.__wchb[0] == 0xfc && ch < 0x84)
- /* overlong UTF-8 sequence */
- return -1;
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[1] = ch;
- if (state->__count == 1)
- state->__count = 2;
- else if (n < (size_t)-1)
- ++n;
- if (n < 3)
- return -2;
- ch = (state->__count == 2) ? t[i++] : state->__value.__wchb[2];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[2] = ch;
- if (state->__count == 2)
- state->__count = 3;
- else if (n < (size_t)-1)
- ++n;
- if (n < 4)
- return -2;
- ch = (state->__count == 3) ? t[i++] : state->__value.__wchb[3];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- state->__value.__wchb[3] = ch;
- if (state->__count == 3)
- state->__count = 4;
- else if (n < (size_t)-1)
- ++n;
- if (n < 5)
- return -2;
- if (n == 5)
- return -1; /* at this point we can't save enough to restart */
- ch = t[i++];
- if (ch < 0x80 || ch > 0xbf)
- return -1;
- ch2 = t[i++];
- *pwc = (wchar_t)((state->__value.__wchb[0] & 0x01) << 30)
- | (wchar_t)((state->__value.__wchb[1] & 0x3f) << 24)
- | (wchar_t)((state->__value.__wchb[2] & 0x3f) << 18)
- | (wchar_t)((state->__value.__wchb[3] & 0x3f) << 12)
- | (wchar_t)((ch & 0x3f) << 6)
- | (wchar_t)(ch2 & 0x3f);
-
+ tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
+ | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
+ | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6)
+ | (wint_t)(ch & 0x3f);
+ if (tmp > 0xffff && sizeof(wchar_t) == 2)
+ {
+ /* On systems which have wchar_t being UTF-16 values, the value
+ doesn't fit into a single wchar_t in this case. So what we
+ do here is to store the state with a special value of __count
+ and return the first half of a surrogate pair. As return
+ value we choose to return the half of the actual UTF-8 char.
+ The second half is returned in case we recognize the special
+ __count value above. */
+ state->__value.__wchb[3] = ch;
+ state->__count = 4;
+ *pwc = 0xd800 | (((tmp - 0x10000) >> 10) & 0x3ff);
+ return 2;
+ }
+ *pwc = tmp;
state->__count = 0;
return i;
}