1 files changed, 318 insertions, 39 deletions
diff --git a/clang-tools-extra/clangd/support/Markup.cpp b/clang-tools-extra/clangd/support/Markup.cpp
index 63aff96..a130830 100644
--- a/clang-tools-extra/clangd/support/Markup.cpp
+++ b/clang-tools-extra/clangd/support/Markup.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 #include "support/Markup.h"
+#include "clang/Basic/CharInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstddef>
 #include <iterator>
@@ -64,8 +64,8 @@ bool looksLikeTag(llvm::StringRef Contents) {
 // It's always safe to escape punctuation, but want minimal escaping.
 // The strategy is to escape the first character of anything that might start
 // a markdown grammar construct.
-bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
-                        bool StartsLine) {
+bool needsLeadingEscapePlaintext(char C, llvm::StringRef Before,
+                                 llvm::StringRef After, bool StartsLine) {
   assert(Before.take_while(llvm::isSpace).empty());
   auto RulerLength = [&]() -> /*Length*/ unsigned {
     if (!StartsLine || !Before.empty())
@@ -151,16 +151,94 @@ bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
   }
 }
 
-/// Escape a markdown text block. Ensures the punctuation will not introduce
+/// \brief Tests whether \p C should be backslash-escaped in markdown.
+///
+/// The MarkupContent LSP specification defines that `markdown` content needs to
+/// follow GFM (GitHub Flavored Markdown) rules. And we can assume that markdown
+/// is rendered on the client side. This means we do not need to escape any
+/// markdown constructs.
+/// The only exception is when the client does not support HTML rendering in
+/// markdown. In that case, we need to escape HTML tags and HTML entities.
+///
+/// **FIXME:** handle the case when the client does support HTML rendering in
+/// markdown. For this, the LSP server needs to check the
+/// [supportsHtml
+/// capability](https://github.com/microsoft/language-server-protocol/issues/1344)
+/// of the client.
+///
+/// \param C The character to check.
+/// \param After The string that follows \p C .
+/// This is used to determine if \p C is part of a tag or an entity reference.
+///
+/// \returns true if \p C should be escaped, false otherwise.
+bool needsLeadingEscapeMarkdown(char C, llvm::StringRef After) {
+  switch (C) {
+  case '<': // HTML tag (or autolink, which we choose not to escape)
+    return looksLikeTag(After);
+  case '&': { // HTML entity reference
+    auto End = After.find(';');
+    if (End == llvm::StringRef::npos)
+      return false;
+    llvm::StringRef Content = After.substr(0, End);
+    if (Content.consume_front("#")) {
+      if (Content.consume_front("x") || Content.consume_front("X"))
+        return llvm::all_of(Content, llvm::isHexDigit);
+      return llvm::all_of(Content, llvm::isDigit);
+    }
+    return llvm::all_of(Content, llvm::isAlpha);
+  }
+  default:
+    return false;
+  }
+}
+
+bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
+                        bool StartsLine, bool EscapeMarkdown) {
+  if (EscapeMarkdown)
+    return needsLeadingEscapePlaintext(C, Before, After, StartsLine);
+  return needsLeadingEscapeMarkdown(C, After);
+}
+
+/// Escape a markdown text block.
+/// If \p EscapeMarkdown is true it ensures the punctuation will not introduce
 /// any of the markdown constructs.
-std::string renderText(llvm::StringRef Input, bool StartsLine) {
+/// Else, markdown syntax is not escaped, only HTML tags and entities.
+std::string renderText(llvm::StringRef Input, bool StartsLine,
+                       bool EscapeMarkdown) {
   std::string R;
-  for (unsigned I = 0; I < Input.size(); ++I) {
-    if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1),
-                           StartsLine))
-      R.push_back('\\');
-    R.push_back(Input[I]);
+  R.reserve(Input.size());
+
+  // Split the input into lines, and escape each line separately.
+  llvm::StringRef Line, Rest;
+
+  bool IsFirstLine = true;
+
+  for (std::tie(Line, Rest) = Input.split('\n');
+       !(Line.empty() && Rest.empty());
+       std::tie(Line, Rest) = Rest.split('\n')) {
+
+    bool StartsLineIntern = IsFirstLine ? StartsLine : true;
+
+    // Ignore leading spaces for the escape logic, but preserve them in the
+    // output.
+    StringRef LeadingSpaces = Line.take_while(llvm::isSpace);
+    if (!LeadingSpaces.empty()) {
+      R.append(LeadingSpaces);
+    }
+    // Before = [first non-space, I). slice() takes an end index, not a length.
+    for (unsigned I = LeadingSpaces.size(); I < Line.size(); ++I) {
+      if (needsLeadingEscape(Line[I], Line.slice(LeadingSpaces.size(), I),
+                             Line.substr(I + 1), StartsLineIntern,
+                             EscapeMarkdown))
+        R.push_back('\\');
+      R.push_back(Line[I]);
+    }
+
+    IsFirstLine = false;
+    if (!Rest.empty())
+      R.push_back('\n');
   }
+
   return R;
 }
 
@@ -168,6 +246,7 @@ std::string renderText(llvm::StringRef Input, bool StartsLine) {
 /// is surrounded by backticks and the inner contents are properly escaped.
 std::string renderInlineBlock(llvm::StringRef Input) {
   std::string R;
+  R.reserve(Input.size());
   // Double all backticks to make sure we don't close the inline block early.
   for (size_t From = 0; From < Input.size();) {
     size_t Next = Input.find("`", From);
@@ -261,6 +340,9 @@ std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children,
 // https://github.com/microsoft/vscode/issues/88416 for details.
 class Ruler : public Block {
 public:
+  void renderEscapedMarkdown(llvm::raw_ostream &OS) const override {
+    renderMarkdown(OS);
+  }
   void renderMarkdown(llvm::raw_ostream &OS) const override {
     // Note that we need an extra new line before the ruler, otherwise we might
     // make previous block a title instead of introducing a ruler.
@@ -275,6 +357,9 @@ public:
 
 class CodeBlock : public Block {
 public:
+  void renderEscapedMarkdown(llvm::raw_ostream &OS) const override {
+    renderMarkdown(OS);
+  }
   void renderMarkdown(llvm::raw_ostream &OS) const override {
     std::string Marker = getMarkerForCodeBlock(Contents);
     // No need to pad from previous blocks, as they should end with a new line.
@@ -303,11 +388,13 @@ private:
 std::string indentLines(llvm::StringRef Input) {
   assert(!Input.ends_with("\n") && "Input should've been trimmed.");
   std::string IndentedR;
-  // We'll add 2 spaces after each new line.
+  // We'll add 2 spaces after each new line which is not followed by another new
+  // line.
   IndentedR.reserve(Input.size() + Input.count('\n') * 2);
-  for (char C : Input) {
+  for (size_t I = 0; I < Input.size(); ++I) {
+    char C = Input[I];
     IndentedR += C;
-    if (C == '\n')
+    if (C == '\n' && (((I + 1) < Input.size()) && (Input[I + 1] != '\n')))
       IndentedR.append("  ");
   }
   return IndentedR;
@@ -316,17 +403,34 @@ std::string indentLines(llvm::StringRef Input) {
 class Heading : public Paragraph {
 public:
   Heading(size_t Level) : Level(Level) {}
+
+  void renderEscapedMarkdown(llvm::raw_ostream &OS) const override {
+    insertHeadingMarkers(OS);
+    Paragraph::renderEscapedMarkdown(OS);
+  }
+
   void renderMarkdown(llvm::raw_ostream &OS) const override {
-    OS << std::string(Level, '#') << ' ';
+    insertHeadingMarkers(OS);
     Paragraph::renderMarkdown(OS);
   }
 
 private:
   size_t Level;
+
+  void insertHeadingMarkers(llvm::raw_ostream &OS) const {
+    OS << std::string(Level, '#') << ' ';
+  }
 };
 
 } // namespace
 
+std::string Block::asEscapedMarkdown() const {
+  std::string R;
+  llvm::raw_string_ostream OS(R);
+  renderEscapedMarkdown(OS);
+  return llvm::StringRef(OS.str()).trim().str();
+}
+
 std::string Block::asMarkdown() const {
   std::string R;
   llvm::raw_string_ostream OS(R);
@@ -341,6 +445,35 @@ std::string Block::asPlainText() const {
   return llvm::StringRef(OS.str()).trim().str();
 }
 
+void Paragraph::renderEscapedMarkdown(llvm::raw_ostream &OS) const {
+  bool NeedsSpace = false;
+  bool HasChunks = false;
+  for (auto &C : Chunks) {
+    if (C.SpaceBefore || NeedsSpace)
+      OS << " ";
+    switch (C.Kind) {
+    case ChunkKind::PlainText:
+      OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/true);
+      break;
+    case ChunkKind::InlineCode:
+      OS << renderInlineBlock(C.Contents);
+      break;
+    case ChunkKind::Bold:
+      OS << renderText("**" + C.Contents + "**", !HasChunks,
+                       /*EscapeMarkdown=*/true);
+      break;
+    case ChunkKind::Emphasized:
+      OS << renderText("*" + C.Contents + "*", !HasChunks,
+                       /*EscapeMarkdown=*/true);
+      break;
+    }
+    HasChunks = true;
+    NeedsSpace = C.SpaceAfter;
+  }
+  // A paragraph in markdown is separated by a blank line.
+  OS << "\n\n";
+}
+
 void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
   bool NeedsSpace = false;
   bool HasChunks = false;
@@ -348,20 +481,26 @@ void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
     if (C.SpaceBefore || NeedsSpace)
       OS << " ";
     switch (C.Kind) {
-    case Chunk::PlainText:
-      OS << renderText(C.Contents, !HasChunks);
+    case ChunkKind::PlainText:
+      OS << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false);
       break;
-    case Chunk::InlineCode:
+    case ChunkKind::InlineCode:
       OS << renderInlineBlock(C.Contents);
       break;
+    case ChunkKind::Bold:
+      OS << "**" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false)
+         << "**";
+      break;
+    case ChunkKind::Emphasized:
+      OS << "*" << renderText(C.Contents, !HasChunks, /*EscapeMarkdown=*/false)
+         << "*";
+      break;
     }
     HasChunks = true;
     NeedsSpace = C.SpaceAfter;
   }
-  // Paragraphs are translated into markdown lines, not markdown paragraphs.
-  // Therefore it only has a single linebreak afterwards.
-  // VSCode requires two spaces at the end of line to start a new one.
-  OS << "  \n";
+  // A paragraph in markdown is separated by a blank line.
+  OS << "\n\n";
 }
 
 std::unique_ptr<Block> Paragraph::clone() const {
@@ -370,8 +509,8 @@ std::unique_ptr<Block> Paragraph::clone() const {
 
 /// Choose a marker to delimit `Text` from a prioritized list of options.
 /// This is more readable than escaping for plain-text.
-llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
-                             llvm::StringRef Text) {
+llvm::StringRef Paragraph::chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
+                                        llvm::StringRef Text) const {
   // Prefer a delimiter whose characters don't appear in the text.
   for (llvm::StringRef S : Options)
     if (Text.find_first_of(S) == llvm::StringRef::npos)
@@ -379,31 +518,147 @@ llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
   return Options.front();
 }
 
+bool Paragraph::punctuationIndicatesLineBreak(llvm::StringRef Line) const {
+  constexpr llvm::StringLiteral Punctuation = R"txt(.:,;!?)txt";
+
+  Line = Line.rtrim();
+  return !Line.empty() && Punctuation.contains(Line.back());
+}
+
+bool Paragraph::isHardLineBreakIndicator(llvm::StringRef Rest) const {
+  // '-'/'*' md list, '@'/'\' documentation command, '>' md blockquote,
+  // '#' headings, '`' code blocks. Numbered list markers are checked below.
+  constexpr llvm::StringLiteral LinebreakIndicators = R"txt(-*@\>#`)txt";
+
+  Rest = Rest.ltrim(" \t");
+  if (Rest.empty())
+    return false;
+
+  if (LinebreakIndicators.contains(Rest.front()))
+    return true;
+
+  if (llvm::isDigit(Rest.front())) {
+    llvm::StringRef AfterDigit = Rest.drop_while(llvm::isDigit);
+    if (AfterDigit.starts_with(".") || AfterDigit.starts_with(")"))
+      return true;
+  }
+  return false;
+}
+
+bool Paragraph::isHardLineBreakAfter(llvm::StringRef Line,
+                                     llvm::StringRef Rest) const {
+  // In Markdown, 2 spaces before a line break forces a line break.
+  // Add a line break for plaintext in this case too.
+  // Should we also consider whether Line is short?
+  return Line.ends_with("  ") || punctuationIndicatesLineBreak(Line) ||
+         isHardLineBreakIndicator(Rest);
+}
+
 void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
   bool NeedsSpace = false;
+  std::string ConcatenatedText;
+  ConcatenatedText.reserve(EstimatedStringSize);
+
+  llvm::raw_string_ostream ConcatenatedOS(ConcatenatedText);
+
   for (auto &C : Chunks) {
+
+    if (C.Kind == ChunkKind::PlainText) {
+      if (C.SpaceBefore || NeedsSpace)
+        ConcatenatedOS << ' ';
+
+      ConcatenatedOS << C.Contents;
+      NeedsSpace = llvm::isSpace(C.Contents.back()) || C.SpaceAfter;
+      continue;
+    }
+
     if (C.SpaceBefore || NeedsSpace)
-      OS << " ";
+      ConcatenatedOS << ' ';
     llvm::StringRef Marker = "";
-    if (C.Preserve && C.Kind == Chunk::InlineCode)
+    if (C.Preserve && C.Kind == ChunkKind::InlineCode)
       Marker = chooseMarker({"`", "'", "\""}, C.Contents);
-    OS << Marker << C.Contents << Marker;
+    else if (C.Kind == ChunkKind::Bold)
+      Marker = "**";
+    else if (C.Kind == ChunkKind::Emphasized)
+      Marker = "*";
+    ConcatenatedOS << Marker << C.Contents << Marker;
     NeedsSpace = C.SpaceAfter;
   }
-  OS << '\n';
+
+  // We go through the contents line by line to handle the newlines
+  // and required spacing correctly.
+  //
+  // Newlines are added if:
+  // - the line ends with 2 spaces and a newline follows
+  // - the line ends with punctuation that indicates a line break (.:,;!?)
+  // - the next line starts with a hard line break indicator (-*@\>#`, or a
+  //   digit followed by '.' or ')'), ignoring leading whitespace.
+  //
+  // Otherwise, newlines in the input are replaced with a single space.
+  //
+  // Multiple spaces are collapsed into a single space.
+  //
+  // Lines containing only whitespace are ignored.
+  llvm::StringRef Line, Rest;
+
+  for (std::tie(Line, Rest) =
+           llvm::StringRef(ConcatenatedText).trim().split('\n');
+       !(Line.empty() && Rest.empty());
+       std::tie(Line, Rest) = Rest.split('\n')) {
+
+    // Remove lines which only contain whitespace.
+    //
+    // Note: this also handles the case when there are multiple newlines
+    // in a row, since all leading newlines are removed.
+    //
+    // The documentation parsing treats multiple newlines as paragraph
+    // separators, hence it will create a new Paragraph instead of adding
+    // multiple newlines to the same Paragraph.
+    // Therefore multiple newlines are never added to a paragraph
+    // except if the user explicitly adds them using
+    // e.g. appendText("user text\n\nnext text").
+    Line = Line.ltrim();
+    if (Line.empty())
+      continue;
+
+    OS << canonicalizeSpaces(Line);
+
+    if (isHardLineBreakAfter(Line, Rest))
+      OS << '\n';
+    else if (!Rest.empty())
+      // Since we removed any trailing whitespace from the input using trim(),
+      // we know that the next line contains non-whitespace characters.
+      // Therefore, we can add a space without worrying about trailing spaces.
+      OS << ' ';
+  }
+
+  // Paragraphs are separated by a blank line.
+  OS << "\n\n";
 }
 
 BulletList::BulletList() = default;
 BulletList::~BulletList() = default;
 
+void BulletList::renderEscapedMarkdown(llvm::raw_ostream &OS) const {
+  for (auto &D : Items) {
+    std::string M = D.asEscapedMarkdown();
+    // Instead of doing this we might prefer passing Indent to children to get
+    // rid of the copies, if it turns out to be a bottleneck.
+    OS << "- " << indentLines(M) << '\n';
+  }
+  // We add 2 newlines after list to terminate it in markdown.
+  OS << "\n\n";
+}
+
 void BulletList::renderMarkdown(llvm::raw_ostream &OS) const {
   for (auto &D : Items) {
+    std::string M = D.asMarkdown();
     // Instead of doing this we might prefer passing Indent to children to get
     // rid of the copies, if it turns out to be a bottleneck.
-    OS << "- " << indentLines(D.asMarkdown()) << '\n';
+    OS << "- " << indentLines(M) << '\n';
   }
-  // We need a new line after list to terminate it in markdown.
-  OS << '\n';
+  // We add 2 newlines after list to terminate it in markdown.
+  OS << "\n\n";
 }
 
 void BulletList::renderPlainText(llvm::raw_ostream &OS) const {
@@ -412,6 +667,7 @@ void BulletList::renderPlainText(llvm::raw_ostream &OS) const {
     // rid of the copies, if it turns out to be a bottleneck.
     OS << "- " << indentLines(D.asPlainText()) << '\n';
   }
+  OS << '\n';
 }
 
 Paragraph &Paragraph::appendSpace() {
@@ -420,32 +676,51 @@ Paragraph &Paragraph::appendSpace() {
   return *this;
 }
 
-Paragraph &Paragraph::appendText(llvm::StringRef Text) {
-  std::string Norm = canonicalizeSpaces(Text);
-  if (Norm.empty())
+Paragraph &Paragraph::appendChunk(llvm::StringRef Contents, ChunkKind K) {
+  if (Contents.empty())
     return *this;
   Chunks.emplace_back();
   Chunk &C = Chunks.back();
-  C.Contents = std::move(Norm);
-  C.Kind = Chunk::PlainText;
-  C.SpaceBefore = llvm::isSpace(Text.front());
-  C.SpaceAfter = llvm::isSpace(Text.back());
+  C.Contents = Contents;
+  C.Kind = K;
+
+  EstimatedStringSize += Contents.size();
   return *this;
 }
 
+Paragraph &Paragraph::appendText(llvm::StringRef Text) {
+  if (!Chunks.empty() && Chunks.back().Kind == ChunkKind::PlainText) {
+    Chunks.back().Contents += std::move(Text);
+    return *this;
+  }
+
+  return appendChunk(std::move(Text), ChunkKind::PlainText);
+}
+
+Paragraph &Paragraph::appendEmphasizedText(llvm::StringRef Text) {
+  return appendChunk(canonicalizeSpaces(std::move(Text)),
+                     ChunkKind::Emphasized);
+}
+
+Paragraph &Paragraph::appendBoldText(llvm::StringRef Text) {
+  return appendChunk(canonicalizeSpaces(std::move(Text)), ChunkKind::Bold);
+}
+
 Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) {
   bool AdjacentCode =
-      !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode;
+      !Chunks.empty() && Chunks.back().Kind == ChunkKind::InlineCode;
   std::string Norm = canonicalizeSpaces(std::move(Code));
   if (Norm.empty())
     return *this;
+  EstimatedStringSize += Norm.size();
   Chunks.emplace_back();
   Chunk &C = Chunks.back();
   C.Contents = std::move(Norm);
-  C.Kind = Chunk::InlineCode;
+  C.Kind = ChunkKind::InlineCode;
   C.Preserve = Preserve;
   // Disallow adjacent code spans without spaces, markdown can't render them.
   C.SpaceBefore = AdjacentCode;
+
   return *this;
 }
 
@@ -482,6 +757,10 @@ void Document::addCodeBlock(std::string Code, std::string Language) {
       std::make_unique<CodeBlock>(std::move(Code), std::move(Language)));
 }
 
+std::string Document::asEscapedMarkdown() const {
+  return renderBlocks(Children, &Block::renderEscapedMarkdown);
+}
+
 std::string Document::asMarkdown() const {
   return renderBlocks(Children, &Block::renderMarkdown);
 }