From 3b33d792cf1e4d2ea3d36d3ad403cbb452243cd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Kami=C5=84ski?= Date: Wed, 2 Apr 2025 14:19:26 +0200 Subject: libstdc++: Implement debug format for strings and characters formatters [PR109162] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements the part of P2286R8 that specifies the debug (escaped) format for strings and character sequences. This includes both handling of the '?' format specifier and the set_debug_format member. To indicate partial support we define the __glibcxx_format_ranges macro with value 1, without defining __cpp_lib_format_ranges. We provide two separate escaping routines depending on the literal encoding for the corresponding character types. If the character encoding is Unicode, we follow the specification from the standard (__format::__write_escaped_unicode). For other encodings, we escape only characters in range [0x00, 0x80), interpreting them as ASCII values: [0x00, 0x20), 0x7f and '\t', '\r', '\n', '\\', '"', '\'' are escaped. We assume every character outside this range is printable (__format::__write_escaped_ascii). In particular we do not yet implement special handling of shift sequences. For Unicode escaping a new __unicode::__escape_edges table is introduced, which encodes whether a character belongs to a General_Category that is escaped by the standard (Control or Other). This table is generated from DerivedGeneralCategory.txt provided by Unicode. Only a boolean flag is preserved to reduce the number of entries. The additional rules for escaping are handled by __format::__should_escape_unicode. When width or precision is specified, we emit the escaped string to a temporary buffer and format the resulting string according to the format spec. For characters we use a fixed size stack buffer, for which a new _Fixedbuf_sink is introduced. 
For strings, we use _Str_sink and, to avoid allocations, we compute the estimated size of the (possibly truncated) input, and if it is larger than the width field we print directly. PR libstdc++/109162 contrib/ChangeLog: * unicode/README: Mentioned DerivedGeneralCategory.txt. * unicode/gen_libstdcxx_unicode_data.py: Generate __escape_edges table from DerivedGeneralCategory.txt. Update file name in comments. * unicode/DerivedGeneralCategory.txt: Copy of file distributed by Unicode Consortium. libstdc++-v3/ChangeLog: * include/bits/chrono_io.h (__detail::_Widen): Moved to std/format file. * include/bits/unicode-data.h: Regenerate. * include/bits/unicode.h (__unicode::_Utf_iterator::_M_units) (__unicode::__should_escape_category): Define. * include/std/format (_GLIBCXX_WIDEN_, _GLIBCXX_WIDEN): Copied from include/bits/chrono_io.h. (__format::_Widen): Moved from include/bits/chrono_io.h. (__format::_Term_char, __format::_Escapes, __format::_Separators) (__format::__should_escape_ascii, __format::__should_escape_unicode) (__format::__write_escape_seq, __format::__write_escaped_char) (__format::__write_escaped_ascii, __format::__write_escaped_unicode) (__format::__write_escaped): Define. (__formatter_str::_S_trunc): Extracted truncation of character sequences. (__formatter_str::format): Handle _Pres_esc. (__formatter_int::_M_do_parse) [__glibcxx_format_ranges]: Parse '?'. (__formatter_int::_M_format_character_escaped): Define. (formatter<_CharT, _CharT>::format, formatter::format): Handle _Pres_esc. (__formatter_str::set_debug_format, formatter<...>::set_debug_format): Guard with __glibcxx_format_ranges. (__format::_Fixedbuf_sink): Define. * testsuite/23_containers/vector/bool/format.cc: Use __format::_Widen and remove unnecessary include. * testsuite/std/format/debug.cc: New test. * testsuite/std/format/debug_nonunicode.cc: New test. * testsuite/std/format/parse_ctx.cc (escaped_strings_supported): Define to true if __glibcxx_format_ranges is defined. 
* testsuite/std/format/string.cc (escaped_strings_supported): Define to true if __glibcxx_format_ranges is defined. Reviewed-by: Jonathan Wakely Signed-off-by: Tomasz Kamiński --- libstdc++-v3/testsuite/std/format/debug.cc | 454 +++++++++++++++++++++ .../testsuite/std/format/debug_nonunicode.cc | 5 + libstdc++-v3/testsuite/std/format/parse_ctx.cc | 2 +- libstdc++-v3/testsuite/std/format/string.cc | 2 +- 4 files changed, 461 insertions(+), 2 deletions(-) create mode 100644 libstdc++-v3/testsuite/std/format/debug.cc create mode 100644 libstdc++-v3/testsuite/std/format/debug_nonunicode.cc (limited to 'libstdc++-v3/testsuite/std') diff --git a/libstdc++-v3/testsuite/std/format/debug.cc b/libstdc++-v3/testsuite/std/format/debug.cc new file mode 100644 index 0000000..07cd1e0 --- /dev/null +++ b/libstdc++-v3/testsuite/std/format/debug.cc @@ -0,0 +1,454 @@ +// { dg-options "-fexec-charset=UTF-8 -fwide-exec-charset=UTF-32LE -DUNICODE_ENC" } +// { dg-do run { target c++23 } } +// { dg-add-options no_pch } + +#include +#include + +std::string +fdebug(char t) +{ return std::format("{:?}", t); } + +std::wstring +fdebug(wchar_t t) +{ return std::format(L"{:?}", t); } + +std::string +fdebug(std::string_view t) +{ return std::format("{:?}", t); } + +std::wstring +fdebug(std::wstring_view t) +{ return std::format(L"{:?}", t); } + + +#define WIDEN_(C, S) ::std::__format::_Widen(S, L##S) +#define WIDEN(S) WIDEN_(_CharT, S) + +template +void +test_basic_escapes() +{ + std::basic_string<_CharT> res; + + const auto tab = WIDEN("\t"); + res = fdebug(tab); + VERIFY( res == WIDEN(R"("\t")") ); + res = fdebug(tab[0]); + VERIFY( res == WIDEN(R"('\t')") ); + + const auto nline = WIDEN("\n"); + res = fdebug(nline); + VERIFY( res == WIDEN(R"("\n")") ); + res = fdebug(nline[0]); + VERIFY( res == WIDEN(R"('\n')") ); + + const auto carret = WIDEN("\r"); + res = fdebug(carret); + VERIFY( res == WIDEN(R"("\r")") ); + res = fdebug(carret[0]); + VERIFY( res == WIDEN(R"('\r')") ); + + const auto bslash 
= WIDEN("\\"); + res = fdebug(bslash); + VERIFY( res == WIDEN(R"("\\")") ); + res = fdebug(bslash[0]); + VERIFY( res == WIDEN(R"('\\')") ); + + const auto quote = WIDEN("\""); + res = fdebug(quote); + VERIFY( res == WIDEN(R"("\"")") ); + res = fdebug(quote[0]); + VERIFY( res == WIDEN(R"('"')") ); + + const auto apos = WIDEN("\'"); + res = fdebug(apos); + VERIFY( res == WIDEN(R"("'")") ); + res = fdebug(apos[0]); + VERIFY( res == WIDEN(R"('\'')") ); +} + +template +void +test_ascii_escapes() +{ + std::basic_string<_CharT> res; + + const auto in = WIDEN("\x10 abcde\x7f\t0123"); + res = fdebug(in); + VERIFY( res == WIDEN(R"("\u{10} abcde\u{7f}\t0123")") ); + res = fdebug(in[0]); + VERIFY( res == WIDEN(R"('\u{10}')") ); + res = fdebug(in[1]); + VERIFY( res == WIDEN(R"(' ')") ); + res = fdebug(in[2]); + VERIFY( res == WIDEN(R"('a')") ); +} + +template +void +test_extended_ascii() +{ + std::basic_string<_CharT> res; + + const auto in = WIDEN("Åëÿ"); + res = fdebug(in); + VERIFY( res == WIDEN(R"("Åëÿ")") ); + + static constexpr bool __test_characters +#if UNICODE_ENC + = sizeof(_CharT) >= 2; +#else // ISO8859-1 + = true; +#endif // UNICODE_ENC + + if constexpr (__test_characters) + { + res = fdebug(in[0]); + VERIFY( res == WIDEN(R"('Å')") ); + res = fdebug(in[1]); + VERIFY( res == WIDEN(R"('ë')") ); + res = fdebug(in[2]); + VERIFY( res == WIDEN(R"('ÿ')") ); + } +} + +#if UNICODE_ENC +template +void +test_unicode_escapes() +{ + std::basic_string<_CharT> res; + + const auto in = WIDEN( + "\u008a" // Cc, Control, Line Tabulation Set, + "\u00ad" // Cf, Format, Soft Hyphen + "\u1d3d" // Lm, Modifier letter, Modifier Letter Capital Ou + "\u00a0" // Zs, Space Separator, No-Break Space (NBSP) + "\u2029" // Zp, Paragraph Separator, Paragraph Separator + "\U0001f984" // So, Other Symbol, Unicorn Face + ); + const auto out = WIDEN("\"" + R"(\u{8a})" + R"(\u{ad})" + "\u1d3d" + R"(\u{a0})" + R"(\u{2029})" + "\U0001f984" + "\""); + + res = fdebug(in); + VERIFY( res == out ); + + if 
constexpr (sizeof(_CharT) >= 2) + { + res = fdebug(in[0]); + VERIFY( res == WIDEN(R"('\u{8a}')") ); + res = fdebug(in[1]); + VERIFY( res == WIDEN(R"('\u{ad}')") ); + res = fdebug(in[2]); + VERIFY( res == WIDEN("'\u1d3d'") ); + res = fdebug(in[3]); + VERIFY( res == WIDEN(R"('\u{a0}')") ); + res = fdebug(in[4]); + VERIFY( res == WIDEN(R"('\u{2029}')") ); + } + + if constexpr (sizeof(_CharT) >= 4) + { + res = fdebug(in[5]); + VERIFY( res == WIDEN("'\U0001f984'") ); + } +} + +template +void +test_grapheme_extend() +{ + std::basic_string<_CharT> res; + + const auto vin = WIDEN("o\u0302\u0323"); + res = fdebug(vin); + VERIFY( res == WIDEN("\"o\u0302\u0323\"") ); + + std::basic_string_view<_CharT> in = WIDEN("\t\u0302\u0323"); + res = fdebug(in); + VERIFY( res == WIDEN(R"("\t\u{302}\u{323}")") ); + + res = fdebug(in.substr(1)); + VERIFY( res == WIDEN(R"("\u{302}\u{323}")") ); + + if constexpr (sizeof(_CharT) >= 2) + { + res = fdebug(in[1]); + VERIFY( res == WIDEN(R"('\u{302}')") ); + } +} + +template +void +test_replacement_char() +{ + std::basic_string<_CharT> repl = WIDEN("\uFFFD"); + std::basic_string<_CharT> res = fdebug(repl); + VERIFY( res == WIDEN("\"\uFFFD\"") ); + + repl = WIDEN("\uFFFD\uFFFD"); + res = fdebug(repl); + VERIFY( res == WIDEN("\"\uFFFD\uFFFD\"") ); +} + +void +test_ill_formed_utf8_seq() +{ + std::string_view seq = "\xf0\x9f\xa6\x84"; // \U0001F984 + std::string res; + + res = fdebug(seq); + VERIFY( res == "\"\U0001F984\"" ); + + res = fdebug(seq.substr(1)); + VERIFY( res == R"("\x{9f}\x{a6}\x{84}")" ); + + res = fdebug(seq.substr(2)); + VERIFY( res == R"("\x{a6}\x{84}")" ); + + res = fdebug(seq[0]); + VERIFY( res == R"('\x{f0}')" ); + res = fdebug(seq.substr(0, 1)); + VERIFY( res == R"("\x{f0}")" ); + + res = fdebug(seq[1]); + VERIFY( res == R"('\x{9f}')" ); + res = fdebug(seq.substr(1, 1)); + VERIFY( res == R"("\x{9f}")" ); + + res = fdebug(seq[2]); + VERIFY( res == R"('\x{a6}')" ); + res = fdebug(seq.substr(2, 1)); + VERIFY( res == R"("\x{a6}")" 
); + + res = fdebug(seq[3]); + VERIFY( res == R"('\x{84}')" ); + res = fdebug(seq.substr(3, 1)); + VERIFY( res == R"("\x{84}")" ); +} + +void +test_ill_formed_utf32() +{ + std::wstring res; + + wchar_t ic1 = static_cast(0xff'ffff); + res = fdebug(ic1); + VERIFY( res == LR"('\x{ffffff}')" ); + + std::wstring is1(1, ic1); + res = fdebug(is1); + VERIFY( res == LR"("\x{ffffff}")" ); + + wchar_t ic2 = static_cast(0xffff'ffff); + res = fdebug(ic2); + VERIFY( res == LR"('\x{ffffffff}')" ); + + std::wstring is2(1, ic2); + res = fdebug(is2); + VERIFY( res == LR"("\x{ffffffff}")" ); +} +#endif // UNICODE_ENC + +template +void +test_fill() +{ + std::basic_string<_CharT> res; + + std::basic_string_view<_CharT> in = WIDEN("a\t\x10\u00ad"); + res = std::format(WIDEN("{:10?}"), in.substr(0, 1)); + VERIFY( res == WIDEN(R"("a" )") ); + + res = std::format(WIDEN("{:->10?}"), in.substr(1, 1)); + VERIFY( res == WIDEN(R"(------"\t")") ); + + res = std::format(WIDEN("{:+<10?}"), in.substr(2, 1)); + VERIFY( res == WIDEN(R"("\u{10}"++)") ); + + + res = std::format(WIDEN("{:10?}"), in[0]); + VERIFY( res == WIDEN(R"('a' )") ); + + res = std::format(WIDEN("{:->10?}"), in[1]); + VERIFY( res == WIDEN(R"(------'\t')") ); + + res = std::format(WIDEN("{:+<10?}"), in[2]); + VERIFY( res == WIDEN(R"('\u{10}'++)") ); + +#if UNICODE_ENC + res = std::format(WIDEN("{:=^10?}"), in.substr(3)); + VERIFY( res == WIDEN(R"(="\u{ad}"=)") ); + + // width is 2 + std::basic_string_view<_CharT> in2 = WIDEN("\u1100"); + res = std::format(WIDEN("{:*^10?}"), in2); + VERIFY( res == WIDEN("***\"\u1100\"***") ); + + if constexpr (sizeof(_CharT) >= 2) + { + res = std::format(WIDEN("{:=^10?}"), in[3]); + VERIFY( res == WIDEN(R"(='\u{ad}'=)") ); + + res = std::format(WIDEN("{:*^10?}"), in2[0]); + VERIFY( res == WIDEN("***'\u1100'***") ); + } +#endif // UNICODE_ENC +} + +template +void +test_prec() +{ + std::basic_string<_CharT> res; + // with ? 
escpaed presentation is copied to ouput, same as source + + std::basic_string_view<_CharT> in = WIDEN("a\t\x10\u00ad"); + res = std::format(WIDEN("{:.2?}"), in.substr(0, 1)); + VERIFY( res == WIDEN(R"("a)") ); + + res = std::format(WIDEN("{:.4?}"), in.substr(1, 1)); + VERIFY( res == WIDEN(R"("\t")") ); + + res = std::format(WIDEN("{:.5?}"), in.substr(2, 1)); + VERIFY( res == WIDEN(R"("\u{1)") ); + +#if UNICODE_ENC + res = std::format(WIDEN("{:.10?}"), in.substr(3)); + VERIFY( res == WIDEN(R"("\u{ad}")") ); + + std::basic_string_view<_CharT> in2 = WIDEN("\u1100"); + res = std::format(WIDEN("{:.3?}"), in2); + VERIFY( res == WIDEN("\"\u1100") ); +#endif // UNICODE_ENC +} + +void test_char_as_wchar() +{ + std::wstring res; + + res = std::format(L"{:?}", 'a'); + VERIFY( res == LR"('a')" ); + + res = std::format(L"{:?}", '\t'); + VERIFY( res == LR"('\t')" ); + + res = std::format(L"{:+<10?}", '\x10'); + VERIFY( res == LR"('\u{10}'++)" ); +} + +template +struct DebugWrapper +{ + T val; +}; + +template +struct std::formatter, CharT> +{ + constexpr std::basic_format_parse_context::iterator + parse(std::basic_format_parse_context& pc) + { + auto out = under.parse(pc); + under.set_debug_format(); + return out; + } + + template + Out format(DebugWrapper const& t, + std::basic_format_context& fc) const + { return under.format(t.val, fc); } + +private: + std::formatter under; +}; + +template +void +test_formatter_str() +{ + _CharT buf[]{ 'a', 'b', 'c', 0 }; + DebugWrapper in{ buf }; + std::basic_string<_CharT> res = std::format(WIDEN("{:?}"), in ); + VERIFY( res == WIDEN(R"("abc")") ); +} + +template +void +test_formatter_arr() +{ + std::basic_string<_CharT> res; + + DebugWrapper<_CharT[3]> in3{ 'a', 'b', 'c' }; + res = std::format(WIDEN("{:?}"), in3 ); + VERIFY( res == WIDEN(R"("abc")") ); + + // We print all characters, including null-terminator + DebugWrapper<_CharT[4]> in4{ 'a', 'b', 'c', 0 }; + res = std::format(WIDEN("{:?}"), in4 ); + VERIFY( res == WIDEN(R"("abc\u{0}")") 
); +} + +template +void +test_formatter_char() +{ + DebugWrapper in{ 'a' }; + std::basic_string<_CharT> res = std::format(WIDEN("{:?}"), in); + VERIFY( res == WIDEN(R"('a')") ); +} + +template +void +test_formatters() +{ + test_formatter_char(); + test_formatter_str(); + test_formatter_str(); + test_formatter_str>(); + test_formatter_str>(); + test_formatter_arr(); +} + +void +test_formatters_c() +{ + test_formatters(); + test_formatters(); + test_formatter_char(); +} + +int main() +{ + test_basic_escapes(); + test_basic_escapes(); + test_ascii_escapes(); + test_ascii_escapes(); + test_extended_ascii(); + test_extended_ascii(); + +#if UNICODE_ENC + test_unicode_escapes(); + test_unicode_escapes(); + test_grapheme_extend(); + test_grapheme_extend(); + test_replacement_char(); + test_replacement_char(); + test_ill_formed_utf8_seq(); + test_ill_formed_utf32(); +#endif // UNICODE_ENC + + test_fill(); + test_fill(); + test_prec(); + test_prec(); + + test_formatters_c(); +} diff --git a/libstdc++-v3/testsuite/std/format/debug_nonunicode.cc b/libstdc++-v3/testsuite/std/format/debug_nonunicode.cc new file mode 100644 index 0000000..5c03171 --- /dev/null +++ b/libstdc++-v3/testsuite/std/format/debug_nonunicode.cc @@ -0,0 +1,5 @@ +// { dg-options "-fexec-charset=ISO8859-1 -fwide-exec-charset=UTF-32LE" } +// { dg-do run { target c++23 } } +// { dg-add-options no_pch } + +#include "debug.cc" diff --git a/libstdc++-v3/testsuite/std/format/parse_ctx.cc b/libstdc++-v3/testsuite/std/format/parse_ctx.cc index b5dd7cd..b338ac7 100644 --- a/libstdc++-v3/testsuite/std/format/parse_ctx.cc +++ b/libstdc++-v3/testsuite/std/format/parse_ctx.cc @@ -108,7 +108,7 @@ is_std_format_spec_for(std::string_view spec) } } -#if __cpp_lib_format_ranges +#if __glibcxx_format_ranges constexpr bool escaped_strings_supported = true; #else constexpr bool escaped_strings_supported = false; diff --git a/libstdc++-v3/testsuite/std/format/string.cc b/libstdc++-v3/testsuite/std/format/string.cc index 
ee987a1..76614d4 100644 --- a/libstdc++-v3/testsuite/std/format/string.cc +++ b/libstdc++-v3/testsuite/std/format/string.cc @@ -62,7 +62,7 @@ test_indexing() VERIFY( ! is_format_string_for("{} {0}", 1) ); } -#if __cpp_lib_format_ranges +#if __glibcxx_format_ranges constexpr bool escaped_strings_supported = true; #else constexpr bool escaped_strings_supported = false; -- cgit v1.1