aboutsummaryrefslogtreecommitdiff
path: root/libstdc++-v3/src
diff options
context:
space:
mode:
authorJonathan Wakely <jwakely@redhat.com>2015-03-04 17:19:55 +0000
committerJonathan Wakely <redi@gcc.gnu.org>2015-03-04 17:19:55 +0000
commitb6584a72ac8d305731e1771a05c117dc11a3d553 (patch)
tree51d24fd829c29cb8633b241a2c12b9fdd9596f15 /libstdc++-v3/src
parentd50a26f2bad59ba73c52694190aac02e90423bbd (diff)
downloadgcc-b6584a72ac8d305731e1771a05c117dc11a3d553.zip
gcc-b6584a72ac8d305731e1771a05c117dc11a3d553.tar.gz
gcc-b6584a72ac8d305731e1771a05c117dc11a3d553.tar.bz2
re PR libstdc++/64797 (22_locale/conversions/string/2.cc FAILs)
PR libstdc++/64797 * include/bits/locale_conv.h (wstring_convert::_M_conv): Handle incomplete multibyte sequences correctly. * include/std/codecvt (codecvt_utf8, codecvt_utf16, codecvt_utf8_utf16): Limit _Maxcode to maximum Unicode code point. * src/c++11/codecvt.cc (invalid_mb_sequence, incomplete_mb_character): Define constants. (is_high_surrogate, is_low_surrogate, surrogate_pair_to_code_point): Define convenience functions. (read_utf8_code_point): Return relevant constant to distinguish incomplete characters from invalid sequences. (read_utf16_code_point): Likewise. Check for invalid sequences. (ucs4_in, utf16_in): Use incomplete_mb_character constant. (utf16_out): Check for invalid sequences. (utf16_span): Fix condition. (ucs2_out): Use is_high_surrogate. (ucs2_in): Use incomplete_mb_character constant and fix condition. * testsuite/22_locale/codecvt/char16_t.cc: Fix whitespace. * testsuite/22_locale/conversions/buffer/1.cc: New. * testsuite/22_locale/conversions/string/2.cc: Use char16_t and char32_t instead of wchar_t. * testsuite/22_locale/conversions/string/3.cc: New. From-SVN: r221189
Diffstat (limited to 'libstdc++-v3/src')
-rw-r--r--libstdc++-v3/src/c++11/codecvt.cc113
1 files changed, 73 insertions, 40 deletions
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index aebd3f3..83ee6e0 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -35,8 +35,14 @@ namespace
{
// Largest code point that fits in a single UTF-16 code unit.
const char32_t max_single_utf16_unit = 0xFFFF;
+
const char32_t max_code_point = 0x10FFFF;
+ // The functions below rely on maxcode < incomplete_mb_character
+ // (which is enforced by the codecvt_utf* classes on construction).
+ const char32_t incomplete_mb_character = char32_t(-2);
+ const char32_t invalid_mb_sequence = char32_t(-1);
+
template<typename Elem>
struct range
{
@@ -131,13 +137,13 @@ namespace
// Read a codepoint from a UTF-8 multibyte sequence.
// Updates from.next if the codepoint is not greater than maxcode.
- // Returns -1 if there is an invalid or incomplete multibyte character.
+ // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
char32_t
read_utf8_code_point(range<const char>& from, unsigned long maxcode)
{
- size_t avail = from.size();
+ const size_t avail = from.size();
if (avail == 0)
- return -1;
+ return incomplete_mb_character;
unsigned char c1 = from.next[0];
// https://en.wikipedia.org/wiki/UTF-8#Sample_code
if (c1 < 0x80)
@@ -146,14 +152,14 @@ namespace
return c1;
}
else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
- return -1;
+ return invalid_mb_sequence;
else if (c1 < 0xE0) // 2-byte sequence
{
if (avail < 2)
- return -1;
+ return incomplete_mb_character;
unsigned char c2 = from.next[1];
if ((c2 & 0xC0) != 0x80)
- return -1;
+ return invalid_mb_sequence;
char32_t c = (c1 << 6) + c2 - 0x3080;
if (c <= maxcode)
from.next += 2;
@@ -162,15 +168,15 @@ namespace
else if (c1 < 0xF0) // 3-byte sequence
{
if (avail < 3)
- return -1;
+ return incomplete_mb_character;
unsigned char c2 = from.next[1];
if ((c2 & 0xC0) != 0x80)
- return -1;
+ return invalid_mb_sequence;
if (c1 == 0xE0 && c2 < 0xA0) // overlong
- return -1;
+ return invalid_mb_sequence;
unsigned char c3 = from.next[2];
if ((c3 & 0xC0) != 0x80)
- return -1;
+ return invalid_mb_sequence;
char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
if (c <= maxcode)
from.next += 3;
@@ -179,27 +185,27 @@ namespace
else if (c1 < 0xF5) // 4-byte sequence
{
if (avail < 4)
- return -1;
+ return incomplete_mb_character;
unsigned char c2 = from.next[1];
if ((c2 & 0xC0) != 0x80)
- return -1;
+ return invalid_mb_sequence;
if (c1 == 0xF0 && c2 < 0x90) // overlong
- return -1;
+ return invalid_mb_sequence;
if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
- return -1;
+ return invalid_mb_sequence;
unsigned char c3 = from.next[2];
if ((c3 & 0xC0) != 0x80)
- return -1;
+ return invalid_mb_sequence;
unsigned char c4 = from.next[3];
if ((c4 & 0xC0) != 0x80)
- return -1;
+ return invalid_mb_sequence;
char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
if (c <= maxcode)
from.next += 4;
return c;
}
else // > U+10FFFF
- return -1;
+ return invalid_mb_sequence;
}
bool
@@ -250,27 +256,54 @@ namespace
#endif
}
+ // Return true if c is a high-surrogate (aka leading) code point.
+ inline bool
+ is_high_surrogate(char32_t c)
+ {
+ return c >= 0xD800 && c <= 0xDBFF;
+ }
+
+ // Return true if c is a low-surrogate (aka trailing) code point.
+ inline bool
+ is_low_surrogate(char32_t c)
+ {
+ return c >= 0xDC00 && c <= 0xDFFF;
+ }
+
+ inline char32_t
+ surrogate_pair_to_code_point(char32_t high, char32_t low)
+ {
+ return (high << 10) + low - 0x35FDC00;
+ }
+
// Read a codepoint from a UTF-16 multibyte sequence.
// The sequence's endianness is indicated by (mode & little_endian).
// Updates from.next if the codepoint is not greater than maxcode.
- // Returns -1 if there is an incomplete multibyte character.
+ // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
char32_t
read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode,
codecvt_mode mode)
{
+ const size_t avail = from.size();
+ if (avail == 0)
+ return incomplete_mb_character;
int inc = 1;
char32_t c = adjust_byte_order(from.next[0], mode);
- if (c >= 0xD800 && c <= 0xDBFF)
+ if (is_high_surrogate(c))
{
- if (from.size() < 2)
- return -1;
+ if (avail < 2)
+ return incomplete_mb_character;
const char16_t c2 = adjust_byte_order(from.next[1], mode);
- if (c2 >= 0xDC00 && c2 <= 0xDFFF)
+ if (is_low_surrogate(c2))
{
- c = (c << 10) + c2 - 0x35FDC00;
+ c = surrogate_pair_to_code_point(c, c2);
inc = 2;
}
+ else
+ return invalid_mb_sequence;
}
+ else if (is_low_surrogate(c))
+ return invalid_mb_sequence;
if (c <= maxcode)
from.next += inc;
return c;
@@ -314,8 +347,8 @@ namespace
while (from.size() && to.size())
{
const char32_t codepoint = read_utf8_code_point(from, maxcode);
- if (codepoint == char32_t(-1))
- break;
+ if (codepoint == incomplete_mb_character)
+ return codecvt_base::partial;
if (codepoint > maxcode)
return codecvt_base::error;
*to.next++ = codepoint;
@@ -352,8 +385,8 @@ namespace
while (from.size() && to.size())
{
const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
- if (codepoint == char32_t(-1))
- break;
+ if (codepoint == incomplete_mb_character)
+ return codecvt_base::partial;
if (codepoint > maxcode)
return codecvt_base::error;
*to.next++ = codepoint;
@@ -389,11 +422,9 @@ namespace
read_utf8_bom(from, mode);
while (from.size() && to.size())
{
- const char* first = from.next;
- if ((unsigned char)*first >= 0xF0 && to.size() < 2)
- return codecvt_base::partial;
+ const char* const first = from.next;
const char32_t codepoint = read_utf8_code_point(from, maxcode);
- if (codepoint == char32_t(-1))
+ if (codepoint == incomplete_mb_character)
return codecvt_base::partial;
if (codepoint > maxcode)
return codecvt_base::error;
@@ -418,20 +449,22 @@ namespace
{
char32_t c = from.next[0];
int inc = 1;
- if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair
+ if (is_high_surrogate(c))
{
if (from.size() < 2)
return codecvt_base::ok; // stop converting at this point
const char32_t c2 = from.next[1];
- if (c2 >= 0xDC00 && c2 <= 0xDFFF)
+ if (is_low_surrogate(c2))
{
+ c = surrogate_pair_to_code_point(c, c2);
inc = 2;
- c = (c << 10) + c2 - 0x35FDC00;
}
else
return codecvt_base::error;
}
+ else if (is_low_surrogate(c))
+ return codecvt_base::error;
if (c > maxcode)
return codecvt_base::error;
if (!write_utf8_code_point(to, c))
@@ -452,8 +485,8 @@ namespace
while (count+1 < max)
{
char32_t c = read_utf8_code_point(from, maxcode);
- if (c == char32_t(-1))
- break;
+ if (c > maxcode)
+ return from.next;
else if (c > max_single_utf16_unit)
++count;
++count;
@@ -489,7 +522,7 @@ namespace
while (from.size() && to.size())
{
char16_t c = from.next[0];
- if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair
+ if (is_high_surrogate(c))
return codecvt_base::error;
if (c > maxcode)
return codecvt_base::error;
@@ -510,9 +543,9 @@ namespace
while (from.size() && to.size())
{
const char32_t c = read_utf16_code_point(from, maxcode, mode);
- if (c == char32_t(-1))
- break;
- if (c >= maxcode)
+ if (c == incomplete_mb_character)
+ return codecvt_base::partial;
+ if (c > maxcode)
return codecvt_base::error;
*to.next++ = c;
}