diff options
author | Dimitrij Mijoski <dmjpp@hotmail.com> | 2023-01-10 13:58:59 +0100 |
---|---|---|
committer | Jonathan Wakely <jwakely@redhat.com> | 2023-01-13 13:34:20 +0000 |
commit | 02dab998665dda0f6df31740e8897c42de3d740f (patch) | |
tree | 54183d7eb0819f3f09d06817991c223717deac8b /libstdc++-v3/testsuite/22_locale | |
parent | e2fc12a5dafadf15d804e1d2541528296e97a847 (diff) | |
download | gcc-02dab998665dda0f6df31740e8897c42de3d740f.zip gcc-02dab998665dda0f6df31740e8897c42de3d740f.tar.gz gcc-02dab998665dda0f6df31740e8897c42de3d740f.tar.bz2 |
libstdc++: Fix Unicode codecvt and add tests [PR86419]
Fixes the conversion from UTF-8 to UTF-16 to properly return partial
instead of ok.
Fixes the conversion from UTF-16 to UTF-8 to properly return partial
instead of ok.
Fixes the conversion from UTF-8 to UCS-2 to properly return partial
instead of error.
Fixes the conversion from UTF-8 to UCS-2 to treat 4-byte UTF-8 sequences
as an error just by seeing the leading byte.
Fixes UTF-8 decoding for all codecvts so that they detect an error at the
end of the input range even when the last code point is also incomplete.
libstdc++-v3/ChangeLog:
PR libstdc++/86419
* src/c++11/codecvt.cc (read_utf8_code_point): Correctly detect
errors in incomplete multibyte sequences.
(utf16_in): Remove surrogates parameter. Fix conditions for
returning partial.
(utf16_out): Fix condition for returning partial.
(ucs2_in): Do not pass surrogates argument to utf16_in.
* testsuite/22_locale/codecvt/codecvt_unicode.cc: New test.
* testsuite/22_locale/codecvt/codecvt_unicode.h: New header for
tests.
* testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc: New
test.
Diffstat (limited to 'libstdc++-v3/testsuite/22_locale')
3 files changed, 1396 insertions, 0 deletions
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.cc new file mode 100644 index 0000000..ae4b6c8 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.cc @@ -0,0 +1,68 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. 
+ +// { dg-do run { target c++11 } } + +#include "codecvt_unicode.h" + +#include <codecvt> + +using namespace std; + +void +test_utf8_utf32_codecvts () +{ + using codecvt_c32 = codecvt<char32_t, char, mbstate_t>; + auto loc_c = locale::classic (); + VERIFY (has_facet<codecvt_c32> (loc_c)); + auto &cvt = use_facet<codecvt_c32> (loc_c); + test_utf8_utf32_codecvts (cvt); + + auto cvt_ptr = to_unique_ptr (new codecvt_utf8<char32_t> ()); + test_utf8_utf32_codecvts (*cvt_ptr); +} + +void +test_utf8_utf16_codecvts () +{ + using codecvt_c16 = codecvt<char16_t, char, mbstate_t>; + auto loc_c = locale::classic (); + VERIFY (has_facet<codecvt_c16> (loc_c)); + auto &cvt = use_facet<codecvt_c16> (loc_c); + test_utf8_utf16_cvts (cvt); + + auto cvt_ptr = to_unique_ptr (new codecvt_utf8_utf16<char16_t> ()); + test_utf8_utf16_cvts (*cvt_ptr); + + auto cvt_ptr2 = to_unique_ptr (new codecvt_utf8_utf16<char32_t> ()); + test_utf8_utf16_cvts (*cvt_ptr2); +} + +void +test_utf8_ucs2_codecvts () +{ + auto cvt_ptr = to_unique_ptr (new codecvt_utf8<char16_t> ()); + test_utf8_ucs2_cvts (*cvt_ptr); +} + +int +main () +{ + test_utf8_utf32_codecvts (); + test_utf8_utf16_codecvts (); + test_utf8_ucs2_codecvts (); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h new file mode 100644 index 0000000..99d1a468 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h @@ -0,0 +1,1269 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. 
+ +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +#include <locale> +#include <string> +#include <memory> +#include <testsuite_hooks.h> + +template <typename T> +std::unique_ptr<T> +to_unique_ptr (T *ptr) +{ + return std::unique_ptr<T> (ptr); +} + +struct test_offsets_ok +{ + size_t in_size, out_size; +}; +struct test_offsets_partial +{ + size_t in_size, out_size, expected_in_next, expected_out_next; +}; + +template <class CharT> struct test_offsets_error +{ + size_t in_size, out_size, expected_in_next, expected_out_next; + CharT replace_char; + size_t replace_pos; +}; + +template <class T, size_t N> +auto constexpr array_size (const T (&)[N]) -> size_t +{ + return N; +} + +template <class CharT> +void +utf8_to_utf32_in_ok (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + std::copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 11, ""); + static_assert (array_size (exp_literal) == 5, ""); + static_assert (array_size (exp) == 5, ""); + VERIFY (char_traits<char>::length (in) == 10); + VERIFY (char_traits<char32_t>::length (exp_literal) == 4); + VERIFY (char_traits<CharT>::length (exp) == 4); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}}; + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto 
state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<CharT>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } + + for (auto t : offsets) + { + CharT out[array_size (exp)] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res + = cvt.in (state, in, in + t.in_size, in_next, out, end (out), out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<CharT>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template <class CharT> +void +utf8_to_utf32_in_partial (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + std::copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 11, ""); + static_assert (array_size (exp_literal) == 5, ""); + static_assert (array_size (exp) == 5, ""); + VERIFY (char_traits<char>::length (in) == 10); + VERIFY (char_traits<char32_t>::length (exp_literal) == 4); + VERIFY (char_traits<CharT>::length (exp) == 4); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // 
incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + + {10, 3, 6, 3}, // no space for fourth CP + {7, 4, 6, 3}, // incomplete fourth CP + {8, 4, 6, 3}, // incomplete fourth CP + {9, 4, 6, 3}, // incomplete fourth CP + {7, 3, 6, 3}, // incomplete fourth CP, and no space for it + {8, 3, 6, 3}, // incomplete fourth CP, and no space for it + {9, 3, 6, 3}, // incomplete fourth CP, and no space for it + }; + + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<CharT>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf8_to_utf32_in_error (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + std::copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (valid_in) == 11, ""); + static_assert (array_size (exp_literal) == 5, ""); + static_assert (array_size (exp) == 5, ""); + VERIFY 
(char_traits<char>::length (valid_in) == 10); + VERIFY (char_traits<char32_t>::length (exp_literal) == 4); + VERIFY (char_traits<CharT>::length (exp) == 4); + + test_offsets_error<char> offsets[] = { + + // replace leading byte with invalid byte + {1, 4, 0, 0, '\xFF', 0}, + {3, 4, 1, 1, '\xFF', 1}, + {6, 4, 3, 2, '\xFF', 3}, + {10, 4, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 4, 1, 1, 'z', 2}, + {6, 4, 3, 2, 'z', 4}, + {10, 4, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 4, 1, 1, '\xFF', 2}, + {6, 4, 3, 2, '\xFF', 4}, + {10, 4, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 4, 3, 2, 'z', 5}, + {10, 4, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 4, 3, 2, '\xFF', 5}, + {10, 4, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 4, 6, 3, 'z', 9}, + {10, 4, 6, 3, '\xFF', 9}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 4, 3, 2, 'z', 4}, + {8, 4, 6, 3, 'z', 7}, + {9, 4, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 4, 3, 2, '\xFF', 4}, + {8, 4, 6, 3, '\xFF', 7}, + {9, 4, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 4, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 4, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) + { + char in[array_size (valid_in)] = {}; + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + char_traits<char>::copy (in, valid_in, array_size (valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + 
t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<CharT>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf8_to_utf32_in (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + utf8_to_utf32_in_ok (cvt); + utf8_to_utf32_in_partial (cvt); + utf8_to_utf32_in_error (cvt); +} + +template <class CharT> +void +utf32_to_utf8_out_ok (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char32_t in_literal[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size (in_literal)] = {}; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 5, ""); + static_assert (array_size (in) == 5, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits<char32_t>::length (in_literal) == 4); + VERIFY (char_traits<CharT>::length (in) == 4); + VERIFY (char_traits<char>::length (exp) == 10); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}}; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<char>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template <class CharT> 
+void +utf32_to_utf8_out_partial (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char32_t in_literal[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size (in_literal)] = {}; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 5, ""); + static_assert (array_size (in) == 5, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits<char32_t>::length (in_literal) == 4); + VERIFY (char_traits<CharT>::length (in) == 4); + VERIFY (char_traits<char>::length (exp) == 10); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + + {4, 6, 3, 6}, // no space for fourth CP + {4, 7, 3, 6}, // no space for fourth CP + {4, 8, 3, 6}, // no space for fourth CP + {4, 9, 3, 6}, // no space for fourth CP + }; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<char>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf32_to_utf8_out_error (const 
std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + const char32_t valid_in[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert (array_size (valid_in) == 5, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits<char32_t>::length (valid_in) == 4); + VERIFY (char_traits<char>::length (exp) == 10); + + test_offsets_error<CharT> offsets[] = {{4, 10, 0, 0, 0x00110000, 0}, + {4, 10, 1, 1, 0x00110000, 1}, + {4, 10, 2, 3, 0x00110000, 2}, + {4, 10, 3, 6, 0x00110000, 3}}; + + for (auto t : offsets) + { + CharT in[array_size (valid_in)] = {}; + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + copy (begin (valid_in), end (valid_in), begin (in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<char>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf32_to_utf8_out (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + utf32_to_utf8_out_ok (cvt); + utf32_to_utf8_out_partial (cvt); + utf32_to_utf8_out_error (cvt); +} + +template <class CharT> +void +test_utf8_utf32_codecvts (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + utf8_to_utf32_in (cvt); + utf32_to_utf8_out (cvt); +} + +template <class CharT> +void +utf8_to_utf16_in_ok (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 
3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 11, ""); + static_assert (array_size (exp_literal) == 6, ""); + static_assert (array_size (exp) == 6, ""); + VERIFY (char_traits<char>::length (in) == 10); + VERIFY (char_traits<char16_t>::length (exp_literal) == 5); + VERIFY (char_traits<CharT>::length (exp) == 5); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}}; + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<CharT>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } + + for (auto t : offsets) + { + CharT out[array_size (exp)] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res + = cvt.in (state, in, in + t.in_size, in_next, out, end (out), out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<CharT>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template <class CharT> +void +utf8_to_utf16_in_partial (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace 
std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 11, ""); + static_assert (array_size (exp_literal) == 6, ""); + static_assert (array_size (exp) == 6, ""); + VERIFY (char_traits<char>::length (in) == 10); + VERIFY (char_traits<char16_t>::length (exp_literal) == 5); + VERIFY (char_traits<CharT>::length (exp) == 5); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + + {10, 3, 6, 3}, // no space for fourth CP + {10, 4, 6, 3}, // no space for fourth CP + {7, 5, 6, 3}, // incomplete fourth CP + {8, 5, 6, 3}, // incomplete fourth CP + {9, 5, 6, 3}, // incomplete fourth CP + {7, 3, 6, 3}, // incomplete fourth CP, and no space for it + {8, 3, 6, 3}, // incomplete fourth CP, and no space for it + {9, 3, 6, 3}, // incomplete fourth CP, and no space for it + {7, 4, 6, 3}, // incomplete fourth CP, and no space for it + {8, 4, 6, 3}, // incomplete fourth CP, and no space for it + {9, 4, 6, 3}, // incomplete fourth CP, and no space for it + + }; + + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = 
codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<CharT>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf8_to_utf16_in_error (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (valid_in) == 11, ""); + static_assert (array_size (exp_literal) == 6, ""); + static_assert (array_size (exp) == 6, ""); + VERIFY (char_traits<char>::length (valid_in) == 10); + VERIFY (char_traits<char16_t>::length (exp_literal) == 5); + VERIFY (char_traits<CharT>::length (exp) == 5); + + test_offsets_error<char> offsets[] = { + + // replace leading byte with invalid byte + {1, 5, 0, 0, '\xFF', 0}, + {3, 5, 1, 1, '\xFF', 1}, + {6, 5, 3, 2, '\xFF', 3}, + {10, 5, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 5, 1, 1, 'z', 2}, + {6, 5, 3, 2, 'z', 4}, + {10, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 5, 1, 1, '\xFF', 2}, + {6, 5, 3, 2, '\xFF', 4}, + {10, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 5, 3, 2, 'z', 5}, + {10, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 5, 3, 2, '\xFF', 5}, + {10, 5, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 5, 6, 3, 'z', 9}, + {10, 5, 6, 3, '\xFF', 9}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 5, 3, 2, 'z', 4}, + {8, 5, 6, 3, 'z', 7}, + {9, 5, 6, 3, 'z', 7}, + + // replace first 
trailing byte with invalid byte, also incomplete at end + {5, 5, 3, 2, '\xFF', 4}, + {8, 5, 6, 3, '\xFF', 7}, + {9, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 5, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) + { + char in[array_size (valid_in)] = {}; + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + char_traits<char>::copy (in, valid_in, array_size (valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<CharT>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf8_to_utf16_in (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + utf8_to_utf16_in_ok (cvt); + utf8_to_utf16_in_partial (cvt); + utf8_to_utf16_in_error (cvt); +} + +template <class CharT> +void +utf16_to_utf8_out_ok (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char16_t in_literal[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size (in_literal)]; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 6, ""); + static_assert (array_size (exp) == 11, ""); + static_assert (array_size (in) == 6, 
""); + VERIFY (char_traits<char16_t>::length (in_literal) == 5); + VERIFY (char_traits<char>::length (exp) == 10); + VERIFY (char_traits<CharT>::length (in) == 5); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}}; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<char>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template <class CharT> +void +utf16_to_utf8_out_partial (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char16_t in_literal[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size (in_literal)]; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 6, ""); + static_assert (array_size (exp) == 11, ""); + static_assert (array_size (in) == 6, ""); + VERIFY (char_traits<char16_t>::length (in_literal) == 5); + VERIFY (char_traits<char>::length (exp) == 10); + VERIFY (char_traits<CharT>::length (in) == 5); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + + {5, 6, 3, 6}, // no space for fourth CP + {5, 7, 3, 6}, // no space for fourth CP + {5, 8, 3, 6}, // no space for 
fourth CP + {5, 9, 3, 6}, // no space for fourth CP + + {4, 10, 3, 6}, // incomplete fourth CP + + {4, 6, 3, 6}, // incomplete fourth CP, and no space for it + {4, 7, 3, 6}, // incomplete fourth CP, and no space for it + {4, 8, 3, 6}, // incomplete fourth CP, and no space for it + {4, 9, 3, 6}, // incomplete fourth CP, and no space for it + }; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<char>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf16_to_utf8_out_error (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + const char16_t valid_in[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert (array_size (valid_in) == 6, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits<char16_t>::length (valid_in) == 5); + VERIFY (char_traits<char>::length (exp) == 10); + + test_offsets_error<CharT> offsets[] = { + {5, 10, 0, 0, 0xD800, 0}, + {5, 10, 0, 0, 0xDBFF, 0}, + {5, 10, 0, 0, 0xDC00, 0}, + {5, 10, 0, 0, 0xDFFF, 0}, + + {5, 10, 1, 1, 0xD800, 1}, + {5, 10, 1, 1, 0xDBFF, 1}, + {5, 10, 1, 1, 0xDC00, 1}, + {5, 10, 1, 1, 0xDFFF, 1}, + + {5, 10, 2, 3, 0xD800, 2}, + {5, 10, 2, 3, 0xDBFF, 2}, + {5, 10, 2, 3, 0xDC00, 2}, + {5, 10, 2, 3, 0xDFFF, 2}, + + // make the leading surrogate a trailing one + {5, 10, 
3, 6, 0xDC00, 3}, + {5, 10, 3, 6, 0xDFFF, 3}, + + // make the trailing surrogate a leading one + {5, 10, 3, 6, 0xD800, 4}, + {5, 10, 3, 6, 0xDBFF, 4}, + + // make the trailing surrogate a BMP char + {5, 10, 3, 6, u'z', 4}, + }; + + for (auto t : offsets) + { + CharT in[array_size (valid_in)] = {}; + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + copy (begin (valid_in), end (valid_in), begin (in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<char>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf16_to_utf8_out (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + utf16_to_utf8_out_ok (cvt); + utf16_to_utf8_out_partial (cvt); + utf16_to_utf8_out_error (cvt); +} + +template <class CharT> +void +test_utf8_utf16_cvts (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + utf8_to_utf16_in (cvt); + utf16_to_utf8_out (cvt); +} + +template <class CharT> +void +utf8_to_ucs2_in_ok (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char in[] = "bш\uAAAA"; + const char16_t exp_literal[] = u"bш\uAAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 7, ""); + static_assert (array_size (exp_literal) == 4, ""); + static_assert 
(array_size (exp) == 4, ""); + VERIFY (char_traits<char>::length (in) == 6); + VERIFY (char_traits<char16_t>::length (exp_literal) == 3); + VERIFY (char_traits<CharT>::length (exp) == 3); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}}; + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<CharT>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } + + for (auto t : offsets) + { + CharT out[array_size (exp)] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res + = cvt.in (state, in, in + t.in_size, in_next, out, end (out), out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<CharT>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template <class CharT> +void +utf8_to_ucs2_in_partial (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char in[] = "bш\uAAAA"; + const char16_t exp_literal[] = u"bш\uAAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 7, ""); + static_assert (array_size (exp_literal) == 4, ""); + 
static_assert (array_size (exp) == 4, ""); + VERIFY (char_traits<char>::length (in) == 6); + VERIFY (char_traits<char16_t>::length (exp_literal) == 3); + VERIFY (char_traits<CharT>::length (exp) == 3); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + }; + + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<CharT>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf8_to_ucs2_in_error (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (valid_in) == 11, ""); + static_assert (array_size (exp_literal) == 6, ""); + static_assert (array_size (exp) == 6, ""); + VERIFY (char_traits<char>::length (valid_in) == 10); 
+ VERIFY (char_traits<char16_t>::length (exp_literal) == 5); + VERIFY (char_traits<CharT>::length (exp) == 5); + + test_offsets_error<char> offsets[] = { + + // replace leading byte with invalid byte + {1, 5, 0, 0, '\xFF', 0}, + {3, 5, 1, 1, '\xFF', 1}, + {6, 5, 3, 2, '\xFF', 3}, + {10, 5, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 5, 1, 1, 'z', 2}, + {6, 5, 3, 2, 'z', 4}, + {10, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 5, 1, 1, '\xFF', 2}, + {6, 5, 3, 2, '\xFF', 4}, + {10, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 5, 3, 2, 'z', 5}, + {10, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 5, 3, 2, '\xFF', 5}, + {10, 5, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 5, 6, 3, 'z', 9}, + {10, 5, 6, 3, '\xFF', 9}, + + // When we see a leading byte of 4-byte CP, we should return error, no + // matter if it is incomplete at the end or has errors in the trailing + // bytes. 
+ + // Don't replace anything, show full 4-byte CP + {10, 4, 6, 3, 'b', 0}, + {10, 5, 6, 3, 'b', 0}, + + // Don't replace anything, show incomplete 4-byte CP at the end + {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP + {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP + {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 5, 3, 2, 'z', 4}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 5, 3, 2, '\xFF', 4}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {8, 5, 6, 3, 'z', 7}, + {9, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {8, 5, 6, 3, '\xFF', 7}, + {9, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 5, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) + { + char in[array_size (valid_in)] = {}; + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + char_traits<char>::copy (in, valid_in, array_size (valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<CharT>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + 
VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +utf8_to_ucs2_in (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + utf8_to_ucs2_in_ok (cvt); + utf8_to_ucs2_in_partial (cvt); + utf8_to_ucs2_in_error (cvt); +} + +template <class CharT> +void +ucs2_to_utf8_out_ok (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char16_t in_literal[] = u"bш\uAAAA"; + const char exp[] = "bш\uAAAA"; + CharT in[array_size (in_literal)] = {}; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 4, ""); + static_assert (array_size (exp) == 7, ""); + static_assert (array_size (in) == 4, ""); + VERIFY (char_traits<char16_t>::length (in_literal) == 3); + VERIFY (char_traits<char>::length (exp) == 6); + VERIFY (char_traits<CharT>::length (in) == 3); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}}; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits<char>::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template <class CharT> +void +ucs2_to_utf8_out_partial (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char16_t in_literal[] = u"bш\uAAAA"; + const char exp[] = "bш\uAAAA"; + CharT in[array_size (in_literal)] = {}; + copy (begin (in_literal), end (in_literal), begin (in)); + + 
static_assert (array_size (in_literal) == 4, ""); + static_assert (array_size (exp) == 7, ""); + static_assert (array_size (in) == 4, ""); + VERIFY (char_traits<char16_t>::length (in_literal) == 3); + VERIFY (char_traits<char>::length (exp) == 6); + VERIFY (char_traits<CharT>::length (in) == 3); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + }; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<char>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +ucs2_to_utf8_out_error (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + using namespace std; + const char16_t valid_in[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert (array_size (valid_in) == 6, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits<char16_t>::length (valid_in) == 5); + VERIFY (char_traits<char>::length (exp) == 10); + + test_offsets_error<CharT> offsets[] = { + {5, 10, 0, 0, 0xD800, 0}, + {5, 10, 0, 0, 0xDBFF, 0}, + {5, 10, 0, 0, 0xDC00, 0}, + {5, 10, 0, 0, 0xDFFF, 0}, + + {5, 10, 1, 1, 0xD800, 1}, + 
{5, 10, 1, 1, 0xDBFF, 1}, + {5, 10, 1, 1, 0xDC00, 1}, + {5, 10, 1, 1, 0xDFFF, 1}, + + {5, 10, 2, 3, 0xD800, 2}, + {5, 10, 2, 3, 0xDBFF, 2}, + {5, 10, 2, 3, 0xDC00, 2}, + {5, 10, 2, 3, 0xDFFF, 2}, + + // dont replace anything, just show the surrogate pair + {5, 10, 3, 6, u'b', 0}, + + // make the leading surrogate a trailing one + {5, 10, 3, 6, 0xDC00, 3}, + {5, 10, 3, 6, 0xDFFF, 3}, + + // make the trailing surrogate a leading one + {5, 10, 3, 6, 0xD800, 4}, + {5, 10, 3, 6, 0xDBFF, 4}, + + // make the trailing surrogate a BMP char + {5, 10, 3, 6, u'z', 4}, + + {5, 7, 3, 6, u'b', 0}, // no space for fourth CP + {5, 8, 3, 6, u'b', 0}, // no space for fourth CP + {5, 9, 3, 6, u'b', 0}, // no space for fourth CP + + {4, 10, 3, 6, u'b', 0}, // incomplete fourth CP + {4, 7, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + {4, 8, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + {4, 9, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + + }; + + for (auto t : offsets) + { + CharT in[array_size (valid_in)] = {}; + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + copy (begin (valid_in), end (valid_in), begin (in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits<char>::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template <class CharT> +void +ucs2_to_utf8_out (const std::codecvt<CharT, char, mbstate_t> &cvt) 
+{ + ucs2_to_utf8_out_ok (cvt); + ucs2_to_utf8_out_partial (cvt); + ucs2_to_utf8_out_error (cvt); +} + +template <class CharT> +void +test_utf8_ucs2_cvts (const std::codecvt<CharT, char, mbstate_t> &cvt) +{ + utf8_to_ucs2_in (cvt); + ucs2_to_utf8_out (cvt); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc new file mode 100644 index 0000000..1695049 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc @@ -0,0 +1,59 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. 
+ +// { dg-do run { target c++11 } } + +#include "codecvt_unicode.h" + +#include <codecvt> + +using namespace std; + +void +test_utf8_utf32_codecvts () +{ +#if __SIZEOF_WCHAR_T__ == 4 + auto cvt_ptr = to_unique_ptr (new codecvt_utf8<wchar_t> ()); + test_utf8_utf32_codecvts (*cvt_ptr); +#endif +} + +void +test_utf8_utf16_codecvts () +{ +#if __SIZEOF_WCHAR_T__ >= 2 + auto cvt_ptr = to_unique_ptr (new codecvt_utf8_utf16<wchar_t> ()); + test_utf8_utf16_cvts (*cvt_ptr); +#endif +} + +void +test_utf8_ucs2_codecvts () +{ +#if __SIZEOF_WCHAR_T__ == 2 + auto cvt_ptr = to_unique_ptr (new codecvt_utf8<wchar_t> ()); + test_utf8_ucs2_cvts (*cvt_ptr); +#endif +} + +int +main () +{ + test_utf8_utf32_codecvts (); + test_utf8_utf16_codecvts (); + test_utf8_ucs2_codecvts (); +} |