diff options
author | Jonathan Wakely <jwakely@redhat.com> | 2017-03-16 15:27:57 +0000 |
---|---|---|
committer | Jonathan Wakely <redi@gcc.gnu.org> | 2017-03-16 15:27:57 +0000 |
commit | 516231de7317bbaf0f83a27047924bb690a217e5 (patch) | |
tree | b2ce70fd6aaa1d1beff946081aa9e670e1441d64 /libstdc++-v3 | |
parent | bcd682e1faed71fd861518ca43235706fc39a7cd (diff) | |
download | gcc-516231de7317bbaf0f83a27047924bb690a217e5.zip gcc-516231de7317bbaf0f83a27047924bb690a217e5.tar.gz gcc-516231de7317bbaf0f83a27047924bb690a217e5.tar.bz2 |
Fix encoding() and max_length() values for codecvt facets
* src/c++11/codecvt.cc (codecvt<char16_t, char, mbstate_t>)
(codecvt<char32_t, char, mbstate_t>, __codecvt_utf8_base<char16_t>)
(__codecvt_utf8_base<char32_t>, __codecvt_utf8_base<wchar_t>)
(__codecvt_utf16_base<char16_t>, __codecvt_utf16_base<char32_t>)
(__codecvt_utf16_base<wchar_t>, __codecvt_utf8_utf16_base<char16_t>)
(__codecvt_utf8_utf16_base<char32_t>)
(__codecvt_utf8_utf16_base<wchar_t>): Fix do_encoding() and
do_max_length() return values.
* testsuite/22_locale/codecvt/codecvt_utf16/members.cc: New test.
* testsuite/22_locale/codecvt/codecvt_utf8/members.cc: New test.
* testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc: New test.
From-SVN: r246201
Diffstat (limited to 'libstdc++-v3')
-rw-r--r-- | libstdc++-v3/ChangeLog | 12 | ||||
-rw-r--r-- | libstdc++-v3/src/c++11/codecvt.cc | 122 | ||||
-rw-r--r-- | libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc | 2 | ||||
-rw-r--r-- | libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc | 81 | ||||
-rw-r--r-- | libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc | 81 | ||||
-rw-r--r-- | libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc | 76 |
6 files changed, 341 insertions, 33 deletions
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 83f74ef..9facce8 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,5 +1,17 @@ 2017-03-16 Jonathan Wakely <jwakely@redhat.com> + * src/c++11/codecvt.cc (codecvt<char16_t, char, mbstate_t>) + (codecvt<char32_t, char, mbstate_t>, __codecvt_utf8_base<char16_t>) + (__codecvt_utf8_base<char32_t>, __codecvt_utf8_base<wchar_t>) + (__codecvt_utf16_base<char16_t>, __codecvt_utf16_base<char32_t>) + (__codecvt_utf16_base<wchar_t>, __codecvt_utf8_utf16_base<char16_t>) + (__codecvt_utf8_utf16_base<char32_t>) + (__codecvt_utf8_utf16_base<wchar_t>): Fix do_encoding() and + do_max_length() return values. + * testsuite/22_locale/codecvt/codecvt_utf16/members.cc: New test. + * testsuite/22_locale/codecvt/codecvt_utf8/members.cc: New test. + * testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc: New test. + PR libstdc++/79980 * include/bits/locale_conv.h (__do_str_codecvt): Set __count on error path. diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc index a50804c..9c91725 100644 --- a/libstdc++-v3/src/c++11/codecvt.cc +++ b/libstdc++-v3/src/c++11/codecvt.cc @@ -72,8 +72,8 @@ namespace // Multibyte sequences can have "header" consisting of Byte Order Mark const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF }; - const unsigned char utf16_bom[4] = { 0xFE, 0xFF }; - const unsigned char utf16le_bom[4] = { 0xFF, 0xFE }; + const unsigned char utf16_bom[2] = { 0xFE, 0xFF }; + const unsigned char utf16le_bom[2] = { 0xFF, 0xFE }; template<size_t N> inline bool @@ -695,7 +695,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int codecvt<char16_t, char, mbstate_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw() @@ -713,9 +713,9 @@ do_length(state_type&, const extern_type* __from, int codecvt<char16_t, char, mbstate_t>::do_max_length() const throw() { - // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit, - // whereas 4 byte sequences require two 16-bit code units. - return 3; + // A single character (one or two UTF-16 code units) requires + // up to four UTF-8 code units. + return 4; } // Define members of codecvt<char32_t, char, mbstate_t> specialization. @@ -766,7 +766,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int codecvt<char32_t, char, mbstate_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw() @@ -783,7 +783,11 @@ do_length(state_type&, const extern_type* __from, int codecvt<char32_t, char, mbstate_t>::do_max_length() const throw() -{ return 4; } +{ + // A single character (one UTF-32 code unit) requires + // up to 4 UTF-8 code units. + return 4; +} // Define members of codecvt_utf8<char16_t> base class implementation. // Converts from UTF-8 to UCS-2. @@ -835,7 +839,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_base<char16_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_base<char16_t>::do_always_noconv() const throw() @@ -852,7 +856,14 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_base<char16_t>::do_max_length() const throw() -{ return 3; } +{ + // A single UCS-2 character requires up to three UTF-8 code units. + // (UCS-2 cannot represent characters that use four UTF-8 code units). + int max = 3; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; +} // Define members of codecvt_utf8<char32_t> base class implementation. // Converts from UTF-8 to UTF-32 (aka UCS-4). @@ -900,7 +911,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_base<char32_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_base<char32_t>::do_always_noconv() const throw() @@ -917,7 +928,13 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_base<char32_t>::do_max_length() const throw() -{ return 4; } +{ + // A single UCS-4 character requires up to four UTF-8 code units. + int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; +} #ifdef _GLIBCXX_USE_WCHAR_T // Define members of codecvt_utf8<wchar_t> base class implementation. @@ -992,7 +1009,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_base<wchar_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw() @@ -1015,7 +1032,16 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_base<wchar_t>::do_max_length() const throw() -{ return 4; } +{ +#if __SIZEOF_WCHAR_T__ == 2 + int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length() +#else + int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length() +#endif + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; +} #endif // Define members of codecvt_utf16<char16_t> base class implementation. @@ -1070,7 +1096,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf16_base<char16_t>::do_encoding() const throw() -{ return 1; } +{ return 0; } // UTF-16 is not a fixed-width encoding bool __codecvt_utf16_base<char16_t>::do_always_noconv() const throw() @@ -1089,7 +1115,14 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf16_base<char16_t>::do_max_length() const throw() -{ return 3; } +{ + // A single UCS-2 character requires one UTF-16 code unit (so two chars). + // (UCS-2 cannot represent characters that use multiple UTF-16 code units). + int max = 2; + if (_M_mode & consume_header) + max += sizeof(utf16_bom); + return max; +} // Define members of codecvt_utf16<char32_t> base class implementation. // Converts from UTF-16 to UTF-32 (aka UCS-4). @@ -1143,7 +1176,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf16_base<char32_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-16 is not a fixed-width encoding bool __codecvt_utf16_base<char32_t>::do_always_noconv() const throw() @@ -1162,7 +1195,14 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf16_base<char32_t>::do_max_length() const throw() -{ return 4; } +{ + // A single UCS-4 character requires one or two UTF-16 code units + // (so up to four chars). + int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf16_bom); + return max; +} #ifdef _GLIBCXX_USE_WCHAR_T // Define members of codecvt_utf16<wchar_t> base class implementation. @@ -1237,7 +1277,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf16_base<wchar_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-16 is not a fixed-width encoding bool __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw() @@ -1261,7 +1301,16 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf16_base<wchar_t>::do_max_length() const throw() -{ return 4; } +{ +#if __SIZEOF_WCHAR_T__ == 2 + int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length() +#else + int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length() +#endif + if (_M_mode & consume_header) + max += sizeof(utf16_bom); + return max; +} #endif // Define members of codecvt_utf8_utf16<char16_t> base class implementation. @@ -1314,7 +1363,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw() @@ -1332,9 +1381,12 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw() { - // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit, - // whereas 4 byte sequences require two 16-bit code units. - return 3; + // A single character can be 1 or 2 UTF-16 code units, + // requiring up to 4 UTF-8 code units. + int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; } // Define members of codecvt_utf8_utf16<char32_t> base class implementation. @@ -1387,7 +1439,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw() @@ -1405,9 +1457,12 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw() { - // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit, - // whereas 4 byte sequences require two 16-bit code units. - return 3; + // A single character can be 1 or 2 UTF-16 code units, + // requiring up to 4 UTF-8 code units. + int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; } #ifdef _GLIBCXX_USE_WCHAR_T @@ -1461,7 +1516,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, int __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw() -{ return 0; } +{ return 0; } // UTF-8 is not a fixed-width encoding bool __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw() @@ -1479,9 +1534,12 @@ do_length(state_type&, const extern_type* __from, int __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw() { - // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit, - // whereas 4 byte sequences require two 16-bit code units. - return 3; + // A single character can be 1 or 2 UTF-16 code units, + // requiring up to 4 UTF-8 code units. + int max = 4; + if (_M_mode & consume_header) + max += sizeof(utf8_bom); + return max; } #endif diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc index b40fc65..3288e77 100644 --- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc +++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc @@ -34,7 +34,7 @@ test01() const codecvt_c16* const cvt = &use_facet<codecvt_c16>(loc_c); VERIFY(!cvt->always_noconv()); - VERIFY(cvt->max_length() == 3); + VERIFY(cvt->max_length() == 4); VERIFY(cvt->encoding() == 0); const char u8dat[] = u8"H\U000000E4ll\U000000F6 \U0001F63F \U000056FD " diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc new file mode 100644 index 0000000..993c860 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc @@ -0,0 +1,81 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// { dg-do run { target c++11 } } + +#include <codecvt> +#include <testsuite_hooks.h> + +const int bomlen = 2; // UTF-16 BOM is 16 bits + +void +test01() +{ + const int maxlen = 2; + + std::codecvt_utf16<char16_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf16<char16_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test02() +{ + const int maxlen = 4; + + std::codecvt_utf16<char32_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf16<char32_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test03() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + const int maxlen = sizeof(wchar_t) == 4 ? 4 : 2; + + std::codecvt_utf16<wchar_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +#endif +} + +int +main() +{ + test01(); + test02(); + test03(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc new file mode 100644 index 0000000..baeb049 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc @@ -0,0 +1,81 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// { dg-do run { target c++11 } } + +#include <codecvt> +#include <testsuite_hooks.h> + +const int bomlen = 3; // UTF-8 BOM is 24 bits + +void +test01() +{ + const int maxlen = 3; + + std::codecvt_utf8<char16_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8<char16_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test02() +{ + const int maxlen = 4; + + std::codecvt_utf8<char32_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8<char32_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test03() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + const int maxlen = sizeof(wchar_t) == 4 ? 4 : 3; + + std::codecvt_utf8<wchar_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +#endif +} + +int +main() +{ + test01(); + test02(); + test03(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc new file mode 100644 index 0000000..8fcdfff --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc @@ -0,0 +1,76 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// { dg-do run { target c++11 } } + +#include <codecvt> +#include <testsuite_hooks.h> + +const int bomlen = 3; // UTF-8 BOM is 24 bits +const int maxlen = 4; + +void +test01() +{ + std::codecvt_utf8_utf16<char16_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8_utf16<char16_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test02() +{ + std::codecvt_utf8_utf16<char32_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8_utf16<char32_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +} + +void +test03() +{ +#ifdef _GLIBCXX_USE_WCHAR_T + std::codecvt_utf8_utf16<wchar_t> c; + VERIFY( c.always_noconv() == false ); + VERIFY( c.encoding() == 0 ); + VERIFY( c.max_length() == maxlen ); + + std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom; + VERIFY( c_bom.always_noconv() == false ); + VERIFY( c_bom.encoding() == 0 ); + VERIFY( c_bom.max_length() == (maxlen + bomlen) ); +#endif +} + +int +main() +{ + test01(); + test02(); + test03(); +} |