aboutsummaryrefslogtreecommitdiff
path: root/clang/lib/Basic
diff options
context:
space:
mode:
authorCorentin Jabot <corentinjabot@gmail.com>2021-08-20 17:52:28 +0200
committerCorentin Jabot <corentinjabot@gmail.com>2022-06-28 22:26:00 +0200
commit870b6d21839707a3e4c40a29b526995f065a220f (patch)
treebaa86806a0dcc922f5507837e9c41c5505da2202 /clang/lib/Basic
parent04235d07ad5b44f639f329840b5ffec48bf96b44 (diff)
downloadllvm-870b6d21839707a3e4c40a29b526995f065a220f.zip
llvm-870b6d21839707a3e4c40a29b526995f065a220f.tar.gz
llvm-870b6d21839707a3e4c40a29b526995f065a220f.tar.bz2
Improve handling of static assert messages.
Instead of dumping the string literal (which quotes it and escapes every non-ASCII symbol), we can use the content of the string when it is an 8-bit string. Wide and UTF-8/UTF-16/UTF-32 strings are still completely escaped until we clarify how these entities should behave (cf. https://wg21.link/p2361). `FormatDiagnostic` is modified to escape non-printable characters and invalid UTF-8. This ensures that Unicode characters, spaces, and new lines are properly rendered in static assert messages. This makes clang more consistent with other implementations and fixes this tweet https://twitter.com/jfbastien/status/1298307325443231744 :) Of note, `PaddingChecker` did print out new lines that were later removed by the diagnostic printing code. To be consistent with its tests, the new lines are removed from the diagnostic. Unicode tables are updated to use both the Unicode definitions and the Unicode 14.0 data. U+00AD SOFT HYPHEN is still considered a printable character to match existing practice in terminals, in addition to being considered a formatting character as per Unicode. Reviewed By: aaron.ballman, #clang-language-wg Differential Revision: https://reviews.llvm.org/D108469
Diffstat (limited to 'clang/lib/Basic')
-rw-r--r--clang/lib/Basic/Diagnostic.cpp58
1 files changed, 49 insertions, 9 deletions
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index deb3987..dbe62ec 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -25,8 +25,9 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/CrashRecoveryContext.h"
-#include "llvm/Support/Locale.h"
+#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -803,6 +804,50 @@ FormatDiagnostic(SmallVectorImpl<char> &OutStr) const {
FormatDiagnostic(Diag.begin(), Diag.end(), OutStr);
}
+/// pushEscapedString - Append Str to the diagnostic buffer OutStr,
+/// escaping non-printable characters and ill-formed code unit sequences: bytes accepted by isPrintable/isWhitespace and code points that are printable or formatting are copied unchanged, other code points are written as "<U+XXXX>", and bytes that do not begin a legal UTF-8 sequence are written as "<XX>".
+static void pushEscapedString(StringRef Str, SmallVectorImpl<char> &OutStr) {
+ OutStr.reserve(OutStr.size() + Str.size()); // Lower bound only: escapes can make the output longer than Str.
+ auto *Begin = reinterpret_cast<const unsigned char *>(Str.data());
+ llvm::raw_svector_ostream OutStream(OutStr); // Stream view over OutStr, used for the formatted escapes below.
+ const unsigned char *End = Begin + Str.size();
+ while (Begin != End) {
+ // ASCII case: a byte accepted by isPrintable or isWhitespace is copied through as-is.
+ if (isPrintable(*Begin) || isWhitespace(*Begin)) {
+ OutStream << *Begin;
+ ++Begin;
+ continue;
+ }
+ if (llvm::isLegalUTF8Sequence(Begin, End)) { // Well-formed UTF-8 sequence at Begin: decode it to decide how to print it.
+ llvm::UTF32 CodepointValue;
+ llvm::UTF32 *CpPtr = &CodepointValue;
+ const unsigned char *CodepointBegin = Begin;
+ const unsigned char *CodepointEnd =
+ Begin + llvm::getNumBytesForUTF8(*Begin);
+ llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32(
+ &Begin, CodepointEnd, &CpPtr, CpPtr + 1, llvm::strictConversion);
+ (void)Res; // Res is otherwise only read by the assert below.
+ assert(
+ llvm::conversionOK == Res &&
+ "the sequence is legal UTF-8 but we couldn't convert it to UTF-32");
+ assert(Begin == CodepointEnd &&
+ "we must be further along in the string now");
+ if (llvm::sys::unicode::isPrintable(CodepointValue) ||
+ llvm::sys::unicode::isFormatting(CodepointValue)) {
+ OutStr.append(CodepointBegin, CodepointEnd); // Emit the original encoded bytes unchanged.
+ continue;
+ }
+ // Unprintable code point: write its value in hex as <U+XXXX>.
+ OutStream << "<U+" << llvm::format_hex_no_prefix(CodepointValue, 4, true)
+ << ">";
+ continue;
+ }
+ // Invalid code unit: write this single byte in hex as <XX> and resume at the next byte.
+ OutStream << "<" << llvm::format_hex_no_prefix(*Begin, 2, true) << ">";
+ ++Begin;
+ }
+}
+
void Diagnostic::
FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
SmallVectorImpl<char> &OutStr) const {
@@ -813,11 +858,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
StringRef(DiagStr, DiagEnd - DiagStr).equals("%0") &&
getArgKind(0) == DiagnosticsEngine::ak_std_string) {
const std::string &S = getArgStdStr(0);
- for (char c : S) {
- if (llvm::sys::locale::isPrint(c) || c == '\t') {
- OutStr.push_back(c);
- }
- }
+ pushEscapedString(S, OutStr);
return;
}
@@ -924,7 +965,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
case DiagnosticsEngine::ak_std_string: {
const std::string &S = getArgStdStr(ArgNo);
assert(ModifierLen == 0 && "No modifiers for strings yet");
- OutStr.append(S.begin(), S.end());
+ pushEscapedString(S, OutStr);
break;
}
case DiagnosticsEngine::ak_c_string: {
@@ -934,8 +975,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd,
// Don't crash if get passed a null pointer by accident.
if (!S)
S = "(null)";
-
- OutStr.append(S, S + strlen(S));
+ pushEscapedString(S, OutStr);
break;
}
// ---- INTEGERS ----