Revert "mbrtowc: fix handling invalid UTF-8 4 byte sequences if wchar_t == UTF-16"

This reverts commit b374973d14ac7969b10ba719feedc709f6971c0d.

Turns out this patch breaks mbrtowc. Example:

--- SNIP ---
void mb(unsigned char c)
{
  wchar_t wc;
  int ret = mbrtowc(&wc, &c, 1, 0);
  printf("%02X -> %04X : %d\n", c, wc, ret);
}

void main ()
{
  setlocale (LC_CTYPE, "");
  mb(0xF0);
  mb(0x9F);
  mb(0x98);
  mb(0x8E);
}
--- SNAP ---

Output before commit b374973d14ac:

  F0 -> 0000 : -2
  9F -> 0000 : -2
  98 -> D83D : 1
  8E -> DE0E : 1

Output after commit b374973d14ac:

  F0 -> 0000 : -2
  9F -> 0000 : -2
  98 -> 0000 : -2
  8E -> D83D : 3

By using mbrtowc(), the high surrogate is only emitted after byte 4, and there's no way to recover the low surrogate. The byte count is also incorrect.

Conclusion: We have to emit the high surrogate already after byte 3 to be able to emit the low surrogate after byte 4.

Reported-by: Thomas Wolff <towo@towo.net>
Addresses: https://cygwin.com/pipermail/cygwin/2025-July/258513.html
Fixes: b374973d14ac ("mbrtowc: fix handling invalid UTF-8 4 byte sequences if wchar_t == UTF-16")
Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
author: Corinna Vinschen <corinna@vinschen.de> 2025-07-22 14:54:47 +0200
committer: Corinna Vinschen <corinna@vinschen.de> 2025-07-24 10:12:48 +0200
commit: ba962ee04543855cfc6e2dc79a7369a78218815a (patch)
tree: 28bdde9be20eab092da833d4e5abca868e1f7c7f /newlib/libc
parent: 9e0162a18d7db74f8692789bf726aa753540fb51 (diff)
download: newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.zip
newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.tar.gz
newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.tar.bz2
1 files changed, 9 insertions, 16 deletions
diff --git a/newlib/libc/stdlib/mbtowc_r.c b/newlib/libc/stdlib/mbtowc_r.c
index 6c3bd3d..cab8333 100644
--- a/newlib/libc/stdlib/mbtowc_r.c
+++ b/newlib/libc/stdlib/mbtowc_r.c
@@ -677,21 +677,6 @@ __utf8_mbtowc (struct _reent *r,
 	state->__count = 3;
       else if (n < (size_t)-1)
 	++n;
-      if (n < 4)
-	return -2;
-      ch = t[i++];
-      if (ch < 0x80 || ch > 0xbf)
-	{
-	  _REENT_ERRNO(r) = EILSEQ;
-	  return -1;
-	}
-      /* Note: Originally we created the low surrogate pair on systems with
-	 wchar_t == UTF-16 *before* checking the 4th byte.  This was utterly
-	 wrong, because this failed to check the last byte for being a valid
-	 value for a complete UTF-8 4 byte sequence.  As a result, calling
-	 functions happily digested the low surrogate and then got an entirely
-	 different character and handled this separately, thus generating
-	 invalid UTF-16 values. */
       if (state->__count == 3 && sizeof(wchar_t) == 2)
 	{
 	  /* On systems which have wchar_t being UTF-16 values, the value
@@ -710,7 +695,15 @@ __utf8_mbtowc (struct _reent *r,
 	    |   (wint_t)((state->__value.__wchb[2] & 0x3f) << 6);
 	  state->__count = 4;
 	  *pwc = 0xd800 | ((tmp - 0x10000) >> 10);
-	  return 3;
+	  return i;
+	}
+      if (n < 4)
+	return -2;
+      ch = t[i++];
+      if (ch < 0x80 || ch > 0xbf)
+	{
+	  _REENT_ERRNO(r) = EILSEQ;
+	  return -1;
 	}
       tmp = (wint_t)((state->__value.__wchb[0] & 0x07) << 18)
 	|   (wint_t)((state->__value.__wchb[1] & 0x3f) << 12)
author	Corinna Vinschen <corinna@vinschen.de>	2025-07-22 14:54:47 +0200
committer	Corinna Vinschen <corinna@vinschen.de>	2025-07-24 10:12:48 +0200
commit	ba962ee04543855cfc6e2dc79a7369a78218815a (patch)
tree	28bdde9be20eab092da833d4e5abca868e1f7c7f /newlib/libc
parent	9e0162a18d7db74f8692789bf726aa753540fb51 (diff)
download	newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.zip newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.tar.gz newlib-ba962ee04543855cfc6e2dc79a7369a78218815a.tar.bz2