diff options
author | Naveen Seth Hanig <naveen.hanig@outlook.com> | 2025-06-25 09:13:00 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-06-25 18:13:00 +0200 |
commit | dd47b845a62cdaf4a1b0aba354cd80a4eabd9570 (patch) | |
tree | db42d57fff968c972928473d7d48271ebe2c65e0 /clang/lib/Format/FormatTokenLexer.cpp | |
parent | 9a7720ad2f96fc5911be3ed2c53ec2bdf6fbd9a6 (diff) | |
download | llvm-dd47b845a62cdaf4a1b0aba354cd80a4eabd9570.zip llvm-dd47b845a62cdaf4a1b0aba354cd80a4eabd9570.tar.gz llvm-dd47b845a62cdaf4a1b0aba354cd80a4eabd9570.tar.bz2 |
[clang-format] Handle Trailing Whitespace After Line Continuation (P2223R2) (#145243)
Fixes #145226.
Implement
[P2223R2](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p2223r2.pdf)
in `clang-format` to correctly handle cases where a backslash (`\`) is
followed by trailing whitespace before the newline.
Previously, `clang-format` failed to properly detect and handle such
cases, leading to misformatted code.
With this, `clang-format` matches the behavior already implemented in
Clang's lexer and `DependencyDirectivesScanner.cpp`, which allow
trailing whitespace after a line continuation in any C++ standard.
Diffstat (limited to 'clang/lib/Format/FormatTokenLexer.cpp')
-rw-r--r-- | clang/lib/Format/FormatTokenLexer.cpp | 30 |
1 files changed, 21 insertions, 9 deletions
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 4cc4f5f..06f68ec 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -14,6 +14,7 @@ #include "FormatTokenLexer.h" #include "FormatToken.h" +#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" @@ -1203,16 +1204,22 @@ static size_t countLeadingWhitespace(StringRef Text) { const unsigned char *const End = Text.bytes_end(); const unsigned char *Cur = Begin; while (Cur < End) { - if (isspace(Cur[0])) { + if (isWhitespace(Cur[0])) { ++Cur; - } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) { - // A '\' followed by a newline always escapes the newline, regardless - // of whether there is another '\' before it. + } else if (Cur[0] == '\\') { + // A backslash followed by optional horizontal whitespaces (P22232R2) and + // then a newline always escapes the newline. // The source has a null byte at the end. So the end of the entire input // isn't reached yet. Also the lexer doesn't break apart an escaped // newline. - assert(End - Cur >= 2); - Cur += 2; + const auto *Lookahead = Cur + 1; + while (isHorizontalWhitespace(*Lookahead)) + ++Lookahead; + // No line splice found; the backslash is a token. + if (!isVerticalWhitespace(*Lookahead)) + break; + // Splice found, consume it. + Cur = Lookahead + 1; } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' && (Cur[3] == '\n' || Cur[3] == '\r')) { // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the @@ -1295,13 +1302,18 @@ FormatToken *FormatTokenLexer::getNextToken() { case '/': // The text was entirely whitespace when this loop was entered. Thus // this has to be an escape sequence. 
- assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" || - Text.substr(i, 4) == "\?\?/\r" || + assert(Text.substr(i, 4) == "\?\?/\r" || Text.substr(i, 4) == "\?\?/\n" || (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" || Text.substr(i - 1, 4) == "\?\?/\n")) || (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" || - Text.substr(i - 2, 4) == "\?\?/\n"))); + Text.substr(i - 2, 4) == "\?\?/\n")) || + (Text[i] == '\\' && [&]() -> bool { + size_t j = i + 1; + while (j < Text.size() && isHorizontalWhitespace(Text[j])) + ++j; + return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r'); + }())); InEscape = true; break; default: |