diff options
Diffstat (limited to 'clang-tools-extra/clangd/support/Markup.cpp')
-rw-r--r-- | clang-tools-extra/clangd/support/Markup.cpp | 357 |
1 files changed, 318 insertions, 39 deletions
diff --git a/clang-tools-extra/clangd/support/Markup.cpp b/clang-tools-extra/clangd/support/Markup.cpp index 63aff96..a130830 100644 --- a/clang-tools-extra/clangd/support/Markup.cpp +++ b/clang-tools-extra/clangd/support/Markup.cpp @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// #include "support/Markup.h" +#include "clang/Basic/CharInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include <cstddef> #include <iterator> @@ -64,8 +64,8 @@ bool looksLikeTag(llvm::StringRef Contents) { // It's always safe to escape punctuation, but want minimal escaping. // The strategy is to escape the first character of anything that might start // a markdown grammar construct. -bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After, - bool StartsLine) { +bool needsLeadingEscapePlaintext(char C, llvm::StringRef Before, + llvm::StringRef After, bool StartsLine) { assert(Before.take_while(llvm::isSpace).empty()); auto RulerLength = [&]() -> /*Length*/ unsigned { if (!StartsLine || !Before.empty()) @@ -151,16 +151,94 @@ bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After, } } -/// Escape a markdown text block. Ensures the punctuation will not introduce +/// \brief Tests whether \p C should be backslash-escaped in markdown. +/// +/// The MarkupContent LSP specification defines that `markdown` content needs to +/// follow GFM (GitHub Flavored Markdown) rules. And we can assume that markdown +/// is rendered on the client side. This means we do not need to escape any +/// markdown constructs. +/// The only exception is when the client does not support HTML rendering in +/// markdown. In that case, we need to escape HTML tags and HTML entities. +/// +/// **FIXME:** handle the case when the client does support HTML rendering in +/// markdown. For this, the LSP server needs to check the +/// [supportsHtml +/// capability](https://github.com/microsoft/language-server-protocol/issues/1344) +/// of the client. +/// +/// \param C The character to check. +/// \param After The string that follows \p C . +/// This is used to determine if \p C is part of a tag or an entity reference. +/// +/// \returns true if \p C should be escaped, false otherwise. +bool needsLeadingEscapeMarkdown(char C, llvm::StringRef After) { + switch (C) { + case '<': // HTML tag (or autolink, which we choose not to escape) + return looksLikeTag(After); + case '&': { // HTML entity reference + auto End = After.find(';'); + if (End == llvm::StringRef::npos) + return false; + llvm::StringRef Content = After.substr(0, End); + if (Content.consume_front("#")) { + if (Content.consume_front("x") || Content.consume_front("X")) + return llvm::all_of(Content, llvm::isHexDigit); + return llvm::all_of(Content, llvm::isDigit); + } + return llvm::all_of(Content, llvm::isAlpha); + } + default: + return false; + } +} + +bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After, + bool StartsLine, bool EscapeMarkdown) { + if (EscapeMarkdown) + return needsLeadingEscapePlaintext(C, Before, After, StartsLine); + return needsLeadingEscapeMarkdown(C, After); +} + +/// Escape a markdown text block. +/// If \p EscapeMarkdown is true it ensures the punctuation will not introduce /// any of the markdown constructs. -std::string renderText(llvm::StringRef Input, bool StartsLine) { +/// Else, markdown syntax is not escaped, only HTML tags and entities. +std::string renderText(llvm::StringRef Input, bool StartsLine, + bool EscapeMarkdown) { std::string R; - for (unsigned I = 0; I < Input.size(); ++I) { - if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1), - StartsLine)) - R.push_back('\\'); - R.push_back(Input[I]); + R.reserve(Input.size()); + + // split the input into lines, and escape each line separately. + llvm::StringRef Line, Rest; + + bool IsFirstLine = true; + + for (std::tie(Line, Rest) = Input.split('\n'); + !(Line.empty() && Rest.empty()); + std::tie(Line, Rest) = Rest.split('\n')) { + + bool StartsLineIntern = IsFirstLine ? StartsLine : true; + + // Ignore leading spaces for the escape logic, but preserve them in the + // output. + StringRef LeadingSpaces = Line.take_while(llvm::isSpace); + if (!LeadingSpaces.empty()) { + R.append(LeadingSpaces); + } + + for (unsigned I = LeadingSpaces.size(); I < Line.size(); ++I) { + if (needsLeadingEscape(Line[I], Line.substr(LeadingSpaces.size(), I), + Line.substr(I + 1), StartsLineIntern, + EscapeMarkdown)) + R.push_back('\\'); + R.push_back(Line[I]); + } + + IsFirstLine = false; + if (!Rest.empty()) + R.push_back('\n'); } + return R; } @@ -168,6 +246,7 @@ std::string renderText(llvm::StringRef Input, bool StartsLine) { /// is surrounded by backticks and the inner contents are properly escaped. std::string renderInlineBlock(llvm::StringRef Input) { std::string R; + R.reserve(Input.size()); // Double all backticks to make sure we don't close the inline block early. for (size_t From = 0; From < Input.size();) { size_t Next = Input.find("`", From); @@ -261,6 +340,9 @@ std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children, // https://github.com/microsoft/vscode/issues/88416 for details. class Ruler : public Block { public: + void renderEscapedMarkdown(llvm::raw_ostream &OS) const override { + renderMarkdown(OS); + } void renderMarkdown(llvm::raw_ostream &OS) const override { // Note that we need an extra new line before the ruler, otherwise we might // make previous block a title instead of introducing a ruler. @@ -275,6 +357,9 @@ public: class CodeBlock : public Block { public: + void renderEscapedMarkdown(llvm::raw_ostream &OS) const override { + renderMarkdown(OS); + } void renderMarkdown(llvm::raw_ostream &OS) const override { std::string Marker = getMarkerForCodeBlock(Contents); // No need to pad from previous blocks, as they should end with a new line. @@ -303,11 +388,13 @@ private: std::string indentLines(llvm::StringRef Input) { assert(!Input.ends_with("\n") && "Input should've been trimmed."); std::string IndentedR; - // We'll add 2 spaces after each new line. + // We'll add 2 spaces after each new line which is not followed by another new + // line. IndentedR.reserve(Input.size() + Input.count('\n') * 2); - for (char C : Input) { + for (size_t I = 0; I < Input.size(); ++I) { + char C = Input[I]; IndentedR += C; - if (C == '\n') + if (C == '\n' && (((I + 1) < Input.size()) && (Input[I + 1] != '\n'))) IndentedR.append(" "); } return IndentedR; @@ -316,17 +403,34 @@ std::string indentLines(llvm::StringRef Input) { class Heading : public Paragraph { public: Heading(size_t Level) : Level(Level) {} + + void renderEscapedMarkdown(llvm::raw_ostream &OS) const override { + insertHeadingMarkers(OS); + Paragraph::renderEscapedMarkdown(OS); + } + void renderMarkdown(llvm::raw_ostream &OS) const override { - OS << std::string(Level, '#') << ' '; + insertHeadingMarkers(OS); Paragraph::renderMarkdown(OS); } private: size_t Level; + + void insertHeadingMarkers(llvm::raw_ostream &OS) const { + OS << std::string(Level, '#') << ' '; + } }; } // namespace +std::string Block::asEscapedMarkdown() const { + std::string R; + llvm::raw_string_ostream OS(R); + renderEscapedMarkdown(OS); + return llvm::StringRef(OS.str()).trim().str(); +} + std::string Block::asMarkdown() const { std::string R; llvm::raw_string_ostream OS(R); @@ -341,6 +445,35 @@ std::string Block::asPlainText() const { return llvm::StringRef(OS.str()).trim().str(); } +void Paragraph::renderEscapedMarkdown(llvm::raw_ostream &OS) const { + bool NeedsSpace = false; + bool HasChunks = false; + for (auto &C : Chunks) { + if (C.SpaceBefore || NeedsSpace) + OS << " "; + switch (C.Kind) { + case ChunkKind::PlainText: + OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/true); + break; + case ChunkKind::InlineCode: + OS << renderInlineBlock(C.Contents); + break; + case ChunkKind::Bold: + OS << renderText("**" + C.Contents + "**", !HasChunks, + /*EscapeMarkdown=*/true); + break; + case ChunkKind::Emphasized: + OS << renderText("*" + C.Contents + "*", !HasChunks, + /*EscapeMarkdown=*/true); + break; + } + HasChunks = true; + NeedsSpace = C.SpaceAfter; + } + // A paragraph in markdown is separated by a blank line. + OS << "\n\n"; +} + void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const { bool NeedsSpace = false; bool HasChunks = false; @@ -348,20 +481,26 @@ void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const { if (C.SpaceBefore || NeedsSpace) OS << " "; switch (C.Kind) { - case Chunk::PlainText: - OS << renderText(C.Contents, !HasChunks); + case ChunkKind::PlainText: + OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false); break; - case Chunk::InlineCode: + case ChunkKind::InlineCode: OS << renderInlineBlock(C.Contents); break; + case ChunkKind::Bold: + OS << "**" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false) + << "**"; + break; + case ChunkKind::Emphasized: + OS << "*" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false) + << "*"; + break; } HasChunks = true; NeedsSpace = C.SpaceAfter; } - // Paragraphs are translated into markdown lines, not markdown paragraphs. - // Therefore it only has a single linebreak afterwards. - // VSCode requires two spaces at the end of line to start a new one. - OS << " \n"; + // A paragraph in markdown is separated by a blank line. + OS << "\n\n"; } std::unique_ptr<Block> Paragraph::clone() const { @@ -370,8 +509,8 @@ std::unique_ptr<Block> Paragraph::clone() const { /// Choose a marker to delimit `Text` from a prioritized list of options. /// This is more readable than escaping for plain-text. -llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, - llvm::StringRef Text) { +llvm::StringRef Paragraph::chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, + llvm::StringRef Text) const { // Prefer a delimiter whose characters don't appear in the text. for (llvm::StringRef S : Options) if (Text.find_first_of(S) == llvm::StringRef::npos) @@ -379,31 +518,147 @@ llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, return Options.front(); } +bool Paragraph::punctuationIndicatesLineBreak(llvm::StringRef Line) const { + constexpr llvm::StringLiteral Punctuation = R"txt(.:,;!?)txt"; + + Line = Line.rtrim(); + return !Line.empty() && Punctuation.contains(Line.back()); +} + +bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest) const { + // '-'/'*' md list, '@'/'\' documentation command, '>' md blockquote, + // '#' headings, '`' code blocks, two spaces (markdown force newline) + constexpr llvm::StringLiteral LinebreakIndicators = R"txt(-*@\>#`)txt"; + + Rest = Rest.ltrim(" \t"); + if (Rest.empty()) + return false; + + if (LinebreakIndicators.contains(Rest.front())) + return true; + + if (llvm::isDigit(Rest.front())) { + llvm::StringRef AfterDigit = Rest.drop_while(llvm::isDigit); + if (AfterDigit.starts_with(".") || AfterDigit.starts_with(")")) + return true; + } + return false; +} + +bool Paragraph::isHardLineBreakAfter(llvm::StringRef Line, + llvm::StringRef Rest) const { + // In Markdown, 2 spaces before a line break forces a line break. + // Add a line break for plaintext in this case too. + // Should we also consider whether Line is short? + return Line.ends_with(" ") || punctuationIndicatesLineBreak(Line) || + isHardLineBreakIndicator(Rest); +} + void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { bool NeedsSpace = false; + std::string ConcatenatedText; + ConcatenatedText.reserve(EstimatedStringSize); + + llvm::raw_string_ostream ConcatenatedOS(ConcatenatedText); + for (auto &C : Chunks) { + + if (C.Kind == ChunkKind::PlainText) { + if (C.SpaceBefore || NeedsSpace) + ConcatenatedOS << ' '; + + ConcatenatedOS << C.Contents; + NeedsSpace = llvm::isSpace(C.Contents.back()) || C.SpaceAfter; + continue; + } + if (C.SpaceBefore || NeedsSpace) - OS << " "; + ConcatenatedOS << ' '; llvm::StringRef Marker = ""; - if (C.Preserve && C.Kind == Chunk::InlineCode) + if (C.Preserve && C.Kind == ChunkKind::InlineCode) Marker = chooseMarker({"`", "'", "\""}, C.Contents); - OS << Marker << C.Contents << Marker; + else if (C.Kind == ChunkKind::Bold) + Marker = "**"; + else if (C.Kind == ChunkKind::Emphasized) + Marker = "*"; + ConcatenatedOS << Marker << C.Contents << Marker; NeedsSpace = C.SpaceAfter; } - OS << '\n'; + + // We go through the contents line by line to handle the newlines + // and required spacing correctly. + // + // Newlines are added if: + // - the line ends with 2 spaces and a newline follows + // - the line ends with punctuation that indicates a line break (.:,;!?) + // - the next line starts with a hard line break indicator (-@>#`, or a digit + // followed by '.' or ')'), ignoring leading whitespace. + // + // Otherwise, newlines in the input are replaced with a single space. + // + // Multiple spaces are collapsed into a single space. + // + // Lines containing only whitespace are ignored. + llvm::StringRef Line, Rest; + + for (std::tie(Line, Rest) = + llvm::StringRef(ConcatenatedText).trim().split('\n'); + !(Line.empty() && Rest.empty()); + std::tie(Line, Rest) = Rest.split('\n')) { + + // Remove lines which only contain whitespace. + // + // Note: this also handles the case when there are multiple newlines + // in a row, since all leading newlines are removed. + // + // The documentation parsing treats multiple newlines as paragraph + // separators, hence it will create a new Paragraph instead of adding + // multiple newlines to the same Paragraph. + // Therfore multiple newlines are never added to a paragraph + // except if the user explicitly adds them using + // e.g. appendText("user text\n\nnext text"). + Line = Line.ltrim(); + if (Line.empty()) + continue; + + OS << canonicalizeSpaces(Line); + + if (isHardLineBreakAfter(Line, Rest)) + OS << '\n'; + else if (!Rest.empty()) + // Since we removed any trailing whitespace from the input using trim(), + // we know that the next line contains non-whitespace characters. + // Therefore, we can add a space without worrying about trailing spaces. + OS << ' '; + } + + // Paragraphs are separated by a blank line. + OS << "\n\n"; } BulletList::BulletList() = default; BulletList::~BulletList() = default; +void BulletList::renderEscapedMarkdown(llvm::raw_ostream &OS) const { + for (auto &D : Items) { + std::string M = D.asEscapedMarkdown(); + // Instead of doing this we might prefer passing Indent to children to get + // rid of the copies, if it turns out to be a bottleneck. + OS << "- " << indentLines(M) << '\n'; + } + // We add 2 newlines after list to terminate it in markdown. + OS << "\n\n"; +} + void BulletList::renderMarkdown(llvm::raw_ostream &OS) const { for (auto &D : Items) { + std::string M = D.asMarkdown(); // Instead of doing this we might prefer passing Indent to children to get // rid of the copies, if it turns out to be a bottleneck. - OS << "- " << indentLines(D.asMarkdown()) << '\n'; + OS << "- " << indentLines(M) << '\n'; } - // We need a new line after list to terminate it in markdown. - OS << '\n'; + // We add 2 newlines after list to terminate it in markdown. + OS << "\n\n"; } void BulletList::renderPlainText(llvm::raw_ostream &OS) const { @@ -412,6 +667,7 @@ void BulletList::renderPlainText(llvm::raw_ostream &OS) const { // rid of the copies, if it turns out to be a bottleneck. OS << "- " << indentLines(D.asPlainText()) << '\n'; } + OS << '\n'; } Paragraph &Paragraph::appendSpace() { @@ -420,32 +676,51 @@ Paragraph &Paragraph::appendSpace() { return *this; } -Paragraph &Paragraph::appendText(llvm::StringRef Text) { - std::string Norm = canonicalizeSpaces(Text); - if (Norm.empty()) +Paragraph &Paragraph::appendChunk(llvm::StringRef Contents, ChunkKind K) { + if (Contents.empty()) return *this; Chunks.emplace_back(); Chunk &C = Chunks.back(); - C.Contents = std::move(Norm); - C.Kind = Chunk::PlainText; - C.SpaceBefore = llvm::isSpace(Text.front()); - C.SpaceAfter = llvm::isSpace(Text.back()); + C.Contents = Contents; + C.Kind = K; + + EstimatedStringSize += Contents.size(); return *this; } +Paragraph &Paragraph::appendText(llvm::StringRef Text) { + if (!Chunks.empty() && Chunks.back().Kind == ChunkKind::PlainText) { + Chunks.back().Contents += std::move(Text); + return *this; + } + + return appendChunk(std::move(Text), ChunkKind::PlainText); +} + +Paragraph &Paragraph::appendEmphasizedText(llvm::StringRef Text) { + return appendChunk(canonicalizeSpaces(std::move(Text)), + ChunkKind::Emphasized); +} + +Paragraph &Paragraph::appendBoldText(llvm::StringRef Text) { + return appendChunk(canonicalizeSpaces(std::move(Text)), ChunkKind::Bold); +} + Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) { bool AdjacentCode = - !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode; + !Chunks.empty() && Chunks.back().Kind == ChunkKind::InlineCode; std::string Norm = canonicalizeSpaces(std::move(Code)); if (Norm.empty()) return *this; + EstimatedStringSize += Norm.size(); Chunks.emplace_back(); Chunk &C = Chunks.back(); C.Contents = std::move(Norm); - C.Kind = Chunk::InlineCode; + C.Kind = ChunkKind::InlineCode; C.Preserve = Preserve; // Disallow adjacent code spans without spaces, markdown can't render them. C.SpaceBefore = AdjacentCode; + return *this; } @@ -482,6 +757,10 @@ void Document::addCodeBlock(std::string Code, std::string Language) { std::make_unique<CodeBlock>(std::move(Code), std::move(Language))); } +std::string Document::asEscapedMarkdown() const { + return renderBlocks(Children, &Block::renderEscapedMarkdown); +} + std::string Document::asMarkdown() const { return renderBlocks(Children, &Block::renderMarkdown); } |