diff options
author | Corinna Vinschen <corinna@vinschen.de> | 2025-07-22 14:54:47 +0200 |
---|---|---|
committer | Corinna Vinschen <corinna@vinschen.de> | 2025-07-24 10:12:48 +0200 |
commit | ba962ee04543855cfc6e2dc79a7369a78218815a (patch) | |
tree | 28bdde9be20eab092da833d4e5abca868e1f7c7f /newlib/libc | |
parent | 9e0162a18d7db74f8692789bf726aa753540fb51 (diff) | |
download | newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.zip newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.tar.gz newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.tar.bz2 |
Revert "mbrtowc: fix handling invalid UTF-8 4 byte sequences if wchar_t == UTF-16"
This reverts commit b374973d14ac7969b10ba719feedc709f6971c0d.
Turns out this patch breaks mbrtowc. Example:
--- SNIP ---
/* Feed the single byte C to mbrtowc() and print one line containing the
   input byte, the wide character value, and mbrtowc()'s return value.

   A null mbstate_t pointer is passed on purpose: mbrtowc() then uses its
   own internal conversion state, so consecutive calls of mb() continue
   the same multibyte sequence. */
void mb(unsigned char c)
{
  /* Start from 0 so the printed value is well-defined when mbrtowc()
     returns -2 (incomplete sequence) and stores nothing in WC. */
  wchar_t wc = 0;
  /* mbrtowc() takes a const char * and returns size_t; the casts make
     both conversions explicit.  Narrowing to int is what lets the
     (size_t)-1 / (size_t)-2 results print as -1 / -2. */
  int ret = (int) mbrtowc(&wc, (const char *) &c, 1, NULL);

  /* %X expects unsigned int arguments. */
  printf("%02X -> %04X : %d\n", (unsigned) c, (unsigned) wc, ret);
}
/* Push the four bytes F0 9F 98 8E through mb() one at a time.  Together
   they are the UTF-8 encoding of U+1F60E, whose UTF-16 form is the
   surrogate pair D83D DE0E.

   NOTE(review): the expected results presumably require the environment's
   LC_CTYPE to select a UTF-8 locale and wchar_t to be 16 bits wide --
   confirm on the target system. */
int main (void)
{
  /* Use the locale from the environment instead of the default "C". */
  setlocale (LC_CTYPE, "");
  mb (0xF0);
  mb (0x9F);
  mb (0x98);
  mb (0x8E);
  return 0;
}
--- SNAP ---
Output before commit b374973d14ac:
F0 -> 0000 : -2
9F -> 0000 : -2
98 -> D83D : 1
8E -> DE0E : 1
Output after commit b374973d14ac:
F0 -> 0000 : -2
9F -> 0000 : -2
98 -> 0000 : -2
8E -> D83D : 3
By using mbrtowc(), the high surrogate is only emitted after byte 4, and
there's no way to recover the low surrogate. The byte count is also incorrect:
3 is returned although each call passes in only a single byte.
Conclusion: We have to emit the high surrogate already after byte 3
to be able to emit the low surrogate after byte 4.
Reported-by: Thomas Wolff <towo@towo.net>
Addresses: https://cygwin.com/pipermail/cygwin/2025-July/258513.html
Fixes: b374973d14ac ("mbrtowc: fix handling invalid UTF-8 4 byte sequences if wchar_t == UTF-16")
Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
Diffstat (limited to 'newlib/libc')
-rw-r--r-- | newlib/libc/stdlib/mbtowc_r.c | 25 |
1 files changed, 9 insertions, 16 deletions
diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c
index 6c3bd3d..cab8333 100644
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@@ -677,21 +677,6 @@ __utf8_mbtowc (struct _reent *r,
         state->__count = 3;
       else if (n < (size_t)-1)
         ++n;
-      if (n < 4)
-        return -2;
-      ch = t[i++];
-      if (ch < 0x80 || ch > 0xbf)
-        {
-          _REENT_ERRNO(r) = EILSEQ;
-          return -1;
-        }
-      /* Note: Originally we created the low surrogate pair on systems with
-         wchar_t == UTF-16 *before* checking the 4th byte. This was utterly
-         wrong, because this failed to check the last byte for being a valid
-         value for a complete UTF-8 4 byte sequence. As a result, calling
-         functions happily digested the low surrogate and then got an entirely
-         different character and handled this separately, thus generating
-         invalid UTF-16 values. */
       if (state->__count == 3 && sizeof(wchar_t) == 2)
         {
           /* On systems which have wchar_t being UTF-16 values, the value
@@ -710,7 +695,15 @@ __utf8_mbtowc (struct _reent *r,
             | (wint_t)((state->__value.__wchb[2] & 0x3f) << 6);
           state->__count = 4;
           *pwc = 0xd800 | ((tmp - 0x10000) >> 10);
-          return 3;
+          return i;
+        }
+      if (n < 4)
+        return -2;
+      ch = t[i++];
+      if (ch < 0x80 || ch > 0xbf)
+        {
+          _REENT_ERRNO(r) = EILSEQ;
+          return -1;
         }
       tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
         | (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)