diff options
author | Corentin Jabot <corentinjabot@gmail.com> | 2021-08-20 17:52:28 +0200 |
---|---|---|
committer | Corentin Jabot <corentinjabot@gmail.com> | 2022-06-28 22:26:00 +0200 |
commit | 870b6d21839707a3e4c40a29b526995f065a220f (patch) | |
tree | baa86806a0dcc922f5507837e9c41c5505da2202 /clang/lib/Basic | |
parent | 04235d07ad5b44f639f329840b5ffec48bf96b44 (diff) | |
download | llvm-870b6d21839707a3e4c40a29b526995f065a220f.zip llvm-870b6d21839707a3e4c40a29b526995f065a220f.tar.gz llvm-870b6d21839707a3e4c40a29b526995f065a220f.tar.bz2 |
Improve handling of static assert messages.
Instead of dumping the string literal (which
quotes it and escapes every non-ASCII symbol),
we can use the content of the string when it is an
8-bit string.
Wide, UTF-8/UTF-16/32 strings are still completely
escaped, until we clarify how these entities should
behave (cf https://wg21.link/p2361).
`FormatDiagnostic` is modified to escape
non-printable characters and invalid UTF-8.
This ensures that unicode characters, spaces and new
lines are properly rendered in static messages.
This makes clang more consistent with other implementations
and fixes this tweet
https://twitter.com/jfbastien/status/1298307325443231744 :)
Of note, `PaddingChecker` did print out new lines that were
later removed by the diagnostic printing code.
To be consistent with its tests, the new lines are removed
from the diagnostic.
The Unicode tables are updated to use both the Unicode
definitions and the Unicode 14.0 data.
U+00AD SOFT HYPHEN is still considered a printable character
to match existing practices in terminals, in addition to
being considered a formatting character as per Unicode.
Reviewed By: aaron.ballman, #clang-language-wg
Differential Revision: https://reviews.llvm.org/D108469
Diffstat (limited to 'clang/lib/Basic')
-rw-r--r-- | clang/lib/Basic/Diagnostic.cpp | 58 |
1 files changed, 49 insertions, 9 deletions
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index deb3987..dbe62ec 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -25,8 +25,9 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/CrashRecoveryContext.h" -#include "llvm/Support/Locale.h" +#include "llvm/Support/Unicode.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> @@ -803,6 +804,50 @@ FormatDiagnostic(SmallVectorImpl<char> &OutStr) const { FormatDiagnostic(Diag.begin(), Diag.end(), OutStr); } +/// pushEscapedString - Append Str to the diagnostic buffer, +/// escaping non-printable characters and ill-formed code unit sequences. +static void pushEscapedString(StringRef Str, SmallVectorImpl<char> &OutStr) { + OutStr.reserve(OutStr.size() + Str.size()); + auto *Begin = reinterpret_cast<const unsigned char *>(Str.data()); + llvm::raw_svector_ostream OutStream(OutStr); + const unsigned char *End = Begin + Str.size(); + while (Begin != End) { + // ASCII case + if (isPrintable(*Begin) || isWhitespace(*Begin)) { + OutStream << *Begin; + ++Begin; + continue; + } + if (llvm::isLegalUTF8Sequence(Begin, End)) { + llvm::UTF32 CodepointValue; + llvm::UTF32 *CpPtr = &CodepointValue; + const unsigned char *CodepointBegin = Begin; + const unsigned char *CodepointEnd = + Begin + llvm::getNumBytesForUTF8(*Begin); + llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32( + &Begin, CodepointEnd, &CpPtr, CpPtr + 1, llvm::strictConversion); + (void)Res; + assert( + llvm::conversionOK == Res && + "the sequence is legal UTF-8 but we couldn't convert it to UTF-32"); + assert(Begin == CodepointEnd && + "we must be further along in the string now"); + if (llvm::sys::unicode::isPrintable(CodepointValue) || + llvm::sys::unicode::isFormatting(CodepointValue)) { + OutStr.append(CodepointBegin, CodepointEnd); + continue; + } + // 
Unprintable code point. + OutStream << "<U+" << llvm::format_hex_no_prefix(CodepointValue, 4, true) + << ">"; + continue; + } + // Invalid code unit. + OutStream << "<" << llvm::format_hex_no_prefix(*Begin, 2, true) << ">"; + ++Begin; + } +} + void Diagnostic:: FormatDiagnostic(const char *DiagStr, const char *DiagEnd, SmallVectorImpl<char> &OutStr) const { @@ -813,11 +858,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd, StringRef(DiagStr, DiagEnd - DiagStr).equals("%0") && getArgKind(0) == DiagnosticsEngine::ak_std_string) { const std::string &S = getArgStdStr(0); - for (char c : S) { - if (llvm::sys::locale::isPrint(c) || c == '\t') { - OutStr.push_back(c); - } - } + pushEscapedString(S, OutStr); return; } @@ -924,7 +965,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd, case DiagnosticsEngine::ak_std_string: { const std::string &S = getArgStdStr(ArgNo); assert(ModifierLen == 0 && "No modifiers for strings yet"); - OutStr.append(S.begin(), S.end()); + pushEscapedString(S, OutStr); break; } case DiagnosticsEngine::ak_c_string: { @@ -934,8 +975,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd, // Don't crash if get passed a null pointer by accident. if (!S) S = "(null)"; - - OutStr.append(S, S + strlen(S)); + pushEscapedString(S, OutStr); break; } // ---- INTEGERS ---- |