[clang-doc] markdown parser (branch: users/evelez7/clang-doc-markdown-parser)

author: Erick Velez <erickvelez7@gmail.com> 2025-08-11 10:21:35 -0700
committer: Erick Velez <erickvelez7@gmail.com> 2025-08-28 10:08:06 -0700
commit: 5a66d8962440f61848dd5d7b4054b6c7f4a97a6b (patch)
tree: 49af4fdbc2aaddde28ac08b7c2edb91697fdb100
parent: 790bee99ded0a7142f435028d8a3bf2b098a8553 (diff)
download: llvm-users/evelez7/clang-doc-markdown-parser.zip
llvm-users/evelez7/clang-doc-markdown-parser.tar.gz
llvm-users/evelez7/clang-doc-markdown-parser.tar.bz2
6 files changed, 369 insertions, 0 deletions
diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt
index 5989e5f..f86272b 100644
--- a/clang-tools-extra/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/CMakeLists.txt
@@ -18,6 +18,7 @@ add_clang_library(clangDoc STATIC
   YAMLGenerator.cpp
   HTMLMustacheGenerator.cpp
   JSONGenerator.cpp
+  MDParser.cpp
 
   DEPENDS
   omp_gen
diff --git a/clang-tools-extra/clang-doc/MDParser.cpp b/clang-tools-extra/clang-doc/MDParser.cpp
new file mode 100644
index 0000000..80e0386
--- /dev/null
+++ b/clang-tools-extra/clang-doc/MDParser.cpp
@@ -0,0 +1,229 @@
+#include "MDParser.h"
+#include "clang/Basic/CharInfo.h"
+#include "llvm/ADT/AllocatorList.h"
+
+namespace clang {
+namespace doc {
+namespace {
+/// Returns true if \p Token is a character that can form an emphasis
+/// delimiter run. Only '*' is recognized for now.
+bool isEmphasisDelimiter(char Token) {
+  // TODO: support '_'
+  return Token == '*';
+}
+} // namespace
+
+/// Classifies the delimiter run at \p Origin using CommonMark flanking rules.
+/// Returns {context, end index}, or {std::nullopt, 0} if it is not a run.
+std::pair<std::optional<DelimiterContext>, size_t>
+MarkdownParser::processDelimiters(SmallString<64> &Line, const size_t &Origin) {
+  if (Origin >= Line.size())
+    return {std::nullopt, 0};
+  size_t Idx = Origin;
+  while (Idx < Line.size() && Line[Idx] == Line[Origin])
+    ++Idx;
+  // The start and the end of the line count as whitespace.
+  char Preceding = (Origin == 0) ? ' ' : Line[Origin - 1];
+  char Following = (Idx >= Line.size()) ? ' ' : Line[Idx];
+  bool LeftFlanking = !isWhitespace(Following) &&
+                      (!isPunctuation(Following) || isWhitespace(Preceding) ||
+                       isPunctuation(Preceding));
+  bool RightFlanking = !isWhitespace(Preceding) &&
+                       (!isPunctuation(Preceding) || isWhitespace(Following) ||
+                        isPunctuation(Following));
+  if (!LeftFlanking && !RightFlanking)
+    return {std::nullopt, 0};
+  // Initializers follow DelimiterContext's member order (RightFlanking first).
+  return {DelimiterContext{RightFlanking, LeftFlanking, LeftFlanking,
+                           RightFlanking, Line[Origin], Idx - Origin},
+          Idx};
+}
+
+/// Builds a Text node holding the concatenation of \p Text, in list order.
+Node *MarkdownParser::createTextNode(const std::list<LineNode *> &Text) {
+  Node *Result = new (Arena) Node();
+  Result->Type = MDType::Text;
+  for (const LineNode *Piece : Text)
+    Result->Content.append(Piece->Content);
+  return Result;
+}
+
+/// Tries to close the delimiter run at \p It against the nearest opener before
+/// it. On a match, the consumed nodes are erased from \p Stack (which may move
+/// \p It) and the new node is returned; returns nullptr if nothing can open.
+Node *MarkdownParser::reverseIterateLine(std::list<LineNode *> &Stack,
+                                         std::list<LineNode *>::iterator &It) {
+  std::list<LineNode *> Text;
+  for (auto ReverseIt = std::make_reverse_iterator(It);
+       ReverseIt != Stack.rend(); ++ReverseIt) {
+    LineNode *Candidate = *ReverseIt;
+    if (!Candidate->DelimiterContext) {
+      // Walking backwards, so prepend to keep the text in source order.
+      if (!Candidate->Content.empty())
+        Text.push_front(Candidate);
+      continue;
+    }
+    if (!Candidate->DelimiterContext->CanOpen)
+      continue;
+    auto &Closer = (*It)->DelimiterContext;
+    auto &Opener = Candidate->DelimiterContext;
+    // A reverse iterator refers to the element before its base(), so the
+    // opener's own position in the list is std::next(ReverseIt).base().
+    auto OpenerPos = std::next(ReverseIt).base();
+
+    if (Text.empty()) {
+      // If there is no text between the runs, there is no emphasis, so what is
+      // left of both runs is literal text. Both nodes are dropped; otherwise
+      // the caller, which rescans from the front, would match them forever.
+      auto *DelimiterTextNode = new (Arena) Node();
+      DelimiterTextNode->Content =
+          Saver.save(Candidate->Content.take_front(Opener->Length) +
+                     (*It)->Content.take_front(Closer->Length));
+      DelimiterTextNode->Type = MDType::Text;
+      Stack.erase(OpenerPos);
+      It = Stack.erase(It);
+      return DelimiterTextNode;
+    }
+
+    // CommonMark: strong emphasis if both runs still have two delimiters,
+    // regular emphasis otherwise. Runs on the stack are never empty.
+    const bool IsStrong = Closer->Length >= 2 && Opener->Length >= 2;
+    const size_t Used = IsStrong ? 2 : 1;
+    Closer->Length -= Used;
+    Opener->Length -= Used;
+    Node *Emphasis = new (Arena) Node();
+    Emphasis->Type = IsStrong ? MDType::Strong : MDType::Emphasis;
+    Node *Child = createTextNode(Text);
+    Child->Parent = Emphasis;
+    Emphasis->Children.push_back(Child);
+
+    // The enclosed text now lives in Emphasis; exhausted runs are dropped.
+    for (LineNode *Consumed : Text)
+      Stack.remove(Consumed);
+    if (Opener->Length == 0)
+      Stack.erase(OpenerPos);
+    if (Closer->Length == 0)
+      It = Stack.erase(It);
+    return Emphasis;
+  }
+  return nullptr;
+}
+
+/// Resolves closing delimiter runs in \p Stack into nodes, rescanning from the
+/// front after each match. LineNodes left on \p Stack are not converted here.
+std::list<Node *>
+MarkdownParser::processEmphasis(std::list<LineNode *> &Stack) {
+  std::list<Node *> Result;
+  for (auto It = Stack.begin(); It != Stack.end();) {
+    const auto &Context = (*It)->DelimiterContext;
+    Node *Matched = nullptr;
+    if (Context && Context->CanClose)
+      Matched = reverseIterateLine(Stack, It);
+    if (!Matched) {
+      ++It;
+      continue;
+    }
+    Result.push_back(Matched);
+    It = Stack.begin();
+  }
+  return Result;
+}
+
+/// Splits \p Line into text and delimiter-run LineNodes, resolves emphasis
+/// over them, and appends the resulting nodes to \p Current's children.
+void MarkdownParser::parseLine(SmallString<64> &Line, Node *Current) {
+  std::list<LineNode *> Stack;
+  BumpPtrAllocator LineArena;
+  // Number of plain characters seen since the last node was pushed.
+  size_t PendingText = 0;
+  // Pushes the pending text, which ends just before index End, as one node.
+  auto FlushText = [&](size_t End) {
+    if (PendingText == 0)
+      return;
+    auto *TextNode = new (LineArena) LineNode();
+    TextNode->Content = Line.substr(End - PendingText, PendingText);
+    Stack.push_back(TextNode);
+    PendingText = 0;
+  };
+  for (size_t Pos = 0; Pos < Line.size();) {
+    std::pair<std::optional<DelimiterContext>, size_t> Run{std::nullopt, 0};
+    if (isEmphasisDelimiter(Line[Pos]))
+      Run = processDelimiters(Line, Pos);
+    if (!Run.first) {
+      // Not a delimiter run, so the character joins the pending text.
+      ++PendingText;
+      ++Pos;
+      continue;
+    }
+    FlushText(Pos);
+    auto *RunNode = new (LineArena) LineNode();
+    RunNode->Content = Line.substr(Pos, Run.second - Pos);
+    RunNode->DelimiterContext = std::move(Run.first);
+    RunNode->DelimiterContext->Length = RunNode->Content.size();
+    Stack.push_back(RunNode);
+    Pos = Run.second;
+  }
+  FlushText(Line.size());
+  for (Node *Resolved : processEmphasis(Stack)) {
+    Resolved->Parent = Current;
+    Current->Children.push_back(Resolved);
+  }
+}
+
+/// Builds the document tree; each empty line opens a new sibling Paragraph.
+Node *MarkdownParser::parse(std::vector<SmallString<64>> &Lines) {
+  auto *Root = new (Arena) Node();
+  Node *Current = Root;
+  for (auto &Line : Lines) {
+    if (!Line.empty()) {
+      parseLine(Line, Current);
+      continue;
+    }
+    Current = new (Arena) Node();
+    Current->Type = MDType::Paragraph;
+    Current->Parent = Root; // Paragraphs never nest inside each other.
+    Root->Children.push_back(Current);
+  }
+  return Root;
+}
+
+/// Renders \p Current as HTML. Strong, Emphasis and Paragraph nodes wrap their
+/// rendered children in a tag; Text yields its content and Softbreak a
+/// newline.
+std::string MarkdownParser::traverse(Node *Current) {
+  // Renders every child in order between the given opening and closing tag.
+  auto Wrap = [&](const char *Open, const char *Close) {
+    std::string Html(Open);
+    for (Node *Child : Current->Children)
+      Html += traverse(Child);
+    Html += Close;
+    return Html;
+  };
+
+  switch (Current->Type) {
+  case MDType::Strong:
+    return Wrap("<strong>", "</strong>");
+  case MDType::Emphasis:
+    return Wrap("<em>", "</em>");
+  case MDType::Paragraph:
+    return Wrap("<p>", "</p>");
+  case MDType::Text:
+    // Emitted verbatim, without any escaping.
+    return Current->Content;
+  case MDType::Softbreak:
+    return "\n";
+  }
+
+  // Only reached for a Type outside the enumerators above.
+  return std::string();
+}
+
+/// Parses \p Lines and returns the HTML of the root's children, concatenated.
+std::string MarkdownParser::render(std::vector<SmallString<64>> &Lines) {
+  std::string Html;
+  for (Node *TopLevel : parse(Lines)->Children)
+    Html += traverse(TopLevel);
+  return Html;
+}
+} // namespace doc
+} // namespace clang
diff --git a/clang-tools-extra/clang-doc/MDParser.h b/clang-tools-extra/clang-doc/MDParser.h
new file mode 100644
index 0000000..32599ea
--- /dev/null
+++ b/clang-tools-extra/clang-doc/MDParser.h
@@ -0,0 +1,99 @@
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MD_PARSER_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MD_PARSER_H
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/StringSaver.h"
+#include <list>
+
+using namespace llvm;
+
+namespace clang {
+namespace doc {
+using llvm::SmallString;
+enum class MDState { Emphasis, Strong, None };
+/// The kind of a node in the parsed Markdown tree.
+enum class MDType {
+  Paragraph,
+  Emphasis,
+  Strong,
+  Text,
+  Softbreak,
+};
+// NOTE(review): MDState and MDTokenType look unused in this patch; confirm.
+enum class MDTokenType { LeftDelimiterRun, RightDelimiterRun, Text };
+
+struct Node {
+  SmallVector<Node *> Children;    // Child nodes, in document order.
+  MDType Type = MDType::Paragraph; // Same value that value-init produced.
+  Node *Parent = nullptr;          // Stays null for the root.
+  std::string Content;             // Literal text; set for Text nodes.
+};
+
+struct DelimiterContext {
+  bool RightFlanking; // Declared first: positional initializers start with it.
+  bool LeftFlanking;
+  bool CanOpen;  // The run may open emphasis.
+  bool CanClose; // The run may close emphasis.
+  char DelimChar;
+  // LineNode::Content is a StringRef, so the number of delimiters still
+  // unconsumed is tracked here and decremented without touching the string.
+  size_t Length;
+};
+
+/// A preprocessing structure for one piece of a line: plain text, or a
+/// delimiter run that is meant to become text if nothing matches it (***foo).
+struct LineNode {
+  StringRef Content;
+  // Set only for delimiter runs. The type is qualified because GCC rejects a
+  // member that reuses the unqualified name of its own type in class scope.
+  std::optional<clang::doc::DelimiterContext> DelimiterContext;
+};
+
+class MarkdownParser {
+  // Nodes are placement-new'ed into Arena; Saver is built over the same Arena.
+  BumpPtrAllocator Arena;
+  StringSaver Saver;
+
+  /// If a delimiter is found, determine if it is a delimiter run, what type of
+  /// run it is, and whether it can be an opener or closer.
+  ///
+  /// The CommonMark specification defines delimiter runs as:
+  /// A delimiter run is either a sequence of one or more * characters that is
+  /// not preceded or followed by a non-backslash-escaped * character, or a
+  /// sequence of one or more _ characters with the same property for _.
+  ///
+  /// A left-flanking delimiter run is a delimiter run that is (1) not followed
+  /// by Unicode whitespace, and either (2a) not followed by a Unicode
+  /// punctuation character, or (2b) followed by a Unicode punctuation character
+  /// and preceded by Unicode whitespace or a Unicode punctuation character.
+  ///
+  /// A right-flanking delimiter run is a delimiter run that is (1) not preceded
+  /// by Unicode whitespace, and either (2a) not preceded by a Unicode
+  /// punctuation character, or (2b) preceded by a Unicode punctuation character
+  /// and followed by Unicode whitespace or a Unicode punctuation character.
+  ///
+  /// @param Origin the index of * or _ that might start a delimiter run.
+  /// @return the run's context, if it is a run, and the index just past it.
+  std::pair<std::optional<DelimiterContext>, size_t>
+  processDelimiters(SmallString<64> &Line, const size_t &Origin = 0);
+
+  void parseLine(SmallString<64> &Line, Node *Current);
+  std::list<Node *> processEmphasis(std::list<LineNode *> &Stack);
+  // NOTE(review): no definition appears in MDParser.cpp; confirm it is needed.
+  void convertToNode(LineNode LN, Node *Parent);
+
+  Node *reverseIterateLine(std::list<LineNode *> &Stack,
+                           std::list<LineNode *>::iterator &It);
+  Node *createTextNode(const std::list<LineNode *> &Text);
+  std::string traverse(Node *Current);
+
+  /// @param Lines An entire Document that resides in a comment.
+  /// @return the root of a Markdown document.
+  Node* parse(std::vector<SmallString<64>> &Lines);
+public:
+  MarkdownParser() : Arena(BumpPtrAllocator()), Saver(Arena) {}
+  std::string render(std::vector<SmallString<64>> &Lines);
+};
+} // namespace doc
+} // namespace clang
+#endif
diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h
index 2a75f89..71acb69 100644
--- a/clang-tools-extra/clang-doc/Representation.h
+++ b/clang-tools-extra/clang-doc/Representation.h
@@ -99,6 +99,7 @@ struct CommentInfo {
   bool SelfClosing = false;  // Indicates if tag is self-closing (for HTML).
   bool Explicit = false; // Indicates if the direction of a param is explicit
                          // (for (T)ParamCommand).
+  bool Markdown = false; // Comment contains Markdown tokens.
   llvm::SmallVector<SmallString<16>, 4>
       AttrKeys; // List of attribute keys (for HTML).
   llvm::SmallVector<SmallString<16>, 4>
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 18166ac..b62fd25 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -32,6 +32,7 @@ add_extra_unittest(ClangDocTests
   SerializeTest.cpp
   YAMLGeneratorTest.cpp
   JSONGeneratorTest.cpp
+  MDParserTest.cpp
   )
 
 clang_target_link_libraries(ClangDocTests
diff --git a/clang-tools-extra/unittests/clang-doc/MDParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MDParserTest.cpp
new file mode 100644
index 0000000..0ecbce0
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-doc/MDParserTest.cpp
@@ -0,0 +1,38 @@
+#include "MDParser.h"
+#include "ClangDocTest.h"
+
+namespace clang {
+namespace doc {
+// A `**` pair around text renders as a <strong> element.
+TEST(MDParserTest, Strong) {
+  std::vector<SmallString<64>> Lines = {{"**Strong**"}};
+  MarkdownParser Parser;
+  const std::string Expected = "<strong>Strong</strong>";
+  EXPECT_EQ(Expected, Parser.render(Lines));
+}
+
+// TEST(MDParserTest, DoubleStrong) {
+//   MarkdownParser Parser;
+//   std::vector<SmallString<64>> Line = {{"****Strong****"}};
+//   auto Result = Parser.render(Line);
+//   std::string Expected = R"raw(<strong><strong>Strong</strong></strong>)raw";
+//   EXPECT_EQ(Expected, Result);
+// }
+
+// A single `*` pair around text renders as an <em> element.
+TEST(MDParserTest, Emphasis) {
+  std::vector<SmallString<64>> Lines = {{"*Emphasis*"}};
+  MarkdownParser Parser;
+  const std::string Expected = "<em>Emphasis</em>";
+  EXPECT_EQ(Expected, Parser.render(Lines));
+}
+
+// TEST(MDParserTest, Text) {
+//   MarkdownParser Parser;
+//   std::vector<SmallString<64>> Line = {{"Text"}};
+//   auto Result = Parser.render(Line);
+//   std::string Expected = R"raw(Text)raw";
+//   EXPECT_EQ(Expected, Result);
+// }
+} // namespace doc
+} // namespace clang
author	Erick Velez <erickvelez7@gmail.com>	2025-08-11 10:21:35 -0700
committer	Erick Velez <erickvelez7@gmail.com>	2025-08-28 10:08:06 -0700
commit	5a66d8962440f61848dd5d7b4054b6c7f4a97a6b (patch)
tree	49af4fdbc2aaddde28ac08b7c2edb91697fdb100
parent	790bee99ded0a7142f435028d8a3bf2b098a8553 (diff)
download	llvm-users/evelez7/clang-doc-markdown-parser.zip llvm-users/evelez7/clang-doc-markdown-parser.tar.gz llvm-users/evelez7/clang-doc-markdown-parser.tar.bz2