aboutsummaryrefslogtreecommitdiff
path: root/libcxx/src
diff options
context:
space:
mode:
authorDimitrij Mijoski <dmjpp@hotmail.com>2023-11-27 18:31:03 +0100
committerGitHub <noreply@github.com>2023-11-27 18:31:03 +0100
commit390840fbdf280b68a5498af952926ffcd678954e (patch)
treeb6d69f6537b2d8914c7172075b7f932d92895db9 /libcxx/src
parentd79aee9f7339e80facedb140492181a4bba3c5b7 (diff)
downloadllvm-390840fbdf280b68a5498af952926ffcd678954e.zip
llvm-390840fbdf280b68a5498af952926ffcd678954e.tar.gz
llvm-390840fbdf280b68a5498af952926ffcd678954e.tar.bz2
[libc++] Fix UTF-8 decoding in codecvts (#68442)
This patch fixes one case where the decoding member function `in()` was returning `partial` instead of `error`. Additionally, it adds large testsuite that tests all `codecvt` facets that were added in C++11 and in C++20. The testsuite covers this bug. Fixes #60177.
Diffstat (limited to 'libcxx/src')
-rw-r--r--libcxx/src/locale.cpp72
1 files changed, 49 insertions, 23 deletions
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 2cb75f8..663f412 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -1972,10 +1972,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
}
else if (c1 < 0xF0)
{
- if (frm_end-frm_nxt < 3)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
switch (c1)
{
case 0xE0:
@@ -1991,6 +1990,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
return codecvt_base::error;
break;
}
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
if ((c3 & 0xC0) != 0x80)
return codecvt_base::error;
uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)
@@ -2003,11 +2005,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
}
else if (c1 < 0xF5)
{
- if (frm_end-frm_nxt < 4)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
- uint8_t c4 = frm_nxt[3];
switch (c1)
{
case 0xF0:
@@ -2023,8 +2023,16 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
return codecvt_base::error;
break;
}
- if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
- return codecvt_base::error;
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
+ if ((c3 & 0xC0) != 0x80)
+ return codecvt_base::error;
+ if (frm_end-frm_nxt < 4)
+ return codecvt_base::partial;
+ uint8_t c4 = frm_nxt[3];
+ if ((c4 & 0xC0) != 0x80)
+ return codecvt_base::error;
if (to_end-to_nxt < 2)
return codecvt_base::partial;
if ((((c1 & 7UL) << 18) +
@@ -2093,10 +2101,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
}
else if (c1 < 0xF0)
{
- if (frm_end-frm_nxt < 3)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
switch (c1)
{
case 0xE0:
@@ -2112,6 +2119,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
return codecvt_base::error;
break;
}
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
if ((c3 & 0xC0) != 0x80)
return codecvt_base::error;
uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)
@@ -2124,11 +2134,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
}
else if (c1 < 0xF5)
{
- if (frm_end-frm_nxt < 4)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
- uint8_t c4 = frm_nxt[3];
switch (c1)
{
case 0xF0:
@@ -2144,8 +2152,16 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
return codecvt_base::error;
break;
}
- if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
- return codecvt_base::error;
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
+ if ((c3 & 0xC0) != 0x80)
+ return codecvt_base::error;
+ if (frm_end-frm_nxt < 4)
+ return codecvt_base::partial;
+ uint8_t c4 = frm_nxt[3];
+ if ((c4 & 0xC0) != 0x80)
+ return codecvt_base::error;
if (to_end-to_nxt < 2)
return codecvt_base::partial;
if ((((c1 & 7UL) << 18) +
@@ -2371,10 +2387,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
}
else if (c1 < 0xF0)
{
- if (frm_end-frm_nxt < 3)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
switch (c1)
{
case 0xE0:
@@ -2390,6 +2405,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
return codecvt_base::error;
break;
}
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
if ((c3 & 0xC0) != 0x80)
return codecvt_base::error;
uint32_t t = static_cast<uint32_t>(((c1 & 0x0F) << 12)
@@ -2402,11 +2420,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
}
else if (c1 < 0xF5)
{
- if (frm_end-frm_nxt < 4)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
- uint8_t c4 = frm_nxt[3];
switch (c1)
{
case 0xF0:
@@ -2422,8 +2438,16 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
return codecvt_base::error;
break;
}
- if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
- return codecvt_base::error;
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
+ if ((c3 & 0xC0) != 0x80)
+ return codecvt_base::error;
+ if (frm_end-frm_nxt < 4)
+ return codecvt_base::partial;
+ uint8_t c4 = frm_nxt[3];
+ if ((c4 & 0xC0) != 0x80)
+ return codecvt_base::error;
uint32_t t = static_cast<uint32_t>(((c1 & 0x07) << 18)
| ((c2 & 0x3F) << 12)
| ((c3 & 0x3F) << 6)
@@ -2629,10 +2653,9 @@ utf8_to_ucs2(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
}
else if (c1 < 0xF0)
{
- if (frm_end-frm_nxt < 3)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
switch (c1)
{
case 0xE0:
@@ -2648,6 +2671,9 @@ utf8_to_ucs2(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
return codecvt_base::error;
break;
}
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
if ((c3 & 0xC0) != 0x80)
return codecvt_base::error;
uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)