//===- FormatGen.h - Utilities for custom assembly formats ------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains common classes for building custom assembly format parsers // and generators. // //===----------------------------------------------------------------------===// #ifndef MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_ #define MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_ #include "mlir/Support/LLVM.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/SMLoc.h" #include namespace llvm { class SourceMgr; } // namespace llvm namespace mlir { namespace tblgen { //===----------------------------------------------------------------------===// // FormatToken //===----------------------------------------------------------------------===// /// This class represents a specific token in the input format. class FormatToken { public: /// Basic token kinds. enum Kind { // Markers. eof, error, // Tokens with no info. l_paren, r_paren, caret, colon, comma, equal, less, greater, question, star, pipe, // Keywords. keyword_start, kw_attr_dict, kw_attr_dict_w_keyword, kw_prop_dict, kw_custom, kw_functional_type, kw_oilist, kw_operands, kw_params, kw_qualified, kw_ref, kw_regions, kw_results, kw_struct, kw_successors, kw_type, keyword_end, // String valued tokens. identifier, literal, variable, string, }; FormatToken(Kind kind, StringRef spelling) : kind(kind), spelling(spelling) {} /// Return the bytes that make up this token. StringRef getSpelling() const { return spelling; } /// Return the kind of this token. Kind getKind() const { return kind; } /// Return a location for this token. SMLoc getLoc() const; /// Returns true if the token is of the given kind. bool is(Kind kind) { return getKind() == kind; } /// Return if this token is a keyword. bool isKeyword() const { return getKind() > Kind::keyword_start && getKind() < Kind::keyword_end; } private: /// Discriminator that indicates the kind of token this is. Kind kind; /// A reference to the entire token contents; this is always a pointer into /// a memory buffer owned by the source manager. StringRef spelling; }; //===----------------------------------------------------------------------===// // FormatLexer //===----------------------------------------------------------------------===// /// This class implements a simple lexer for operation assembly format strings. class FormatLexer { public: FormatLexer(llvm::SourceMgr &mgr, SMLoc loc); /// Lex the next token and return it. FormatToken lexToken(); /// Emit an error to the lexer with the given location and message. FormatToken emitError(SMLoc loc, const Twine &msg); FormatToken emitError(const char *loc, const Twine &msg); FormatToken emitErrorAndNote(SMLoc loc, const Twine &msg, const Twine ¬e); private: /// Return the next character in the stream. int getNextChar(); /// Lex an identifier, literal, variable, or string. FormatToken lexIdentifier(const char *tokStart); FormatToken lexLiteral(const char *tokStart); FormatToken lexVariable(const char *tokStart); FormatToken lexString(const char *tokStart); /// Create a token with the current pointer and a start pointer. FormatToken formToken(FormatToken::Kind kind, const char *tokStart) { return FormatToken(kind, StringRef(tokStart, curPtr - tokStart)); } /// The source manager containing the format string. llvm::SourceMgr &mgr; /// Location of the format string. SMLoc loc; /// Buffer containing the format string. StringRef curBuffer; /// Current pointer in the buffer. const char *curPtr; }; //===----------------------------------------------------------------------===// // FormatElement //===----------------------------------------------------------------------===// /// This class represents a single format element. /// /// If you squint and take a close look, you can see the outline of a `Format` /// dialect. class FormatElement { public: virtual ~FormatElement(); // The top-level kinds of format elements. enum Kind { Literal, String, Variable, Whitespace, Directive, Optional }; /// Support LLVM-style RTTI. static bool classof(const FormatElement *el) { return true; } /// Get the element kind. Kind getKind() const { return kind; } protected: /// Create a format element with the given kind. FormatElement(Kind kind) : kind(kind) {} private: /// The kind of the element. Kind kind; }; /// The base class for all format elements. This class implements common methods /// for LLVM-style RTTI. template class FormatElementBase : public FormatElement { public: /// Support LLVM-style RTTI. static bool classof(const FormatElement *el) { return ElementKind == el->getKind(); } protected: /// Create a format element with the given kind. FormatElementBase() : FormatElement(ElementKind) {} }; /// This class represents a literal element. A literal is either one of the /// supported punctuation characters (e.g. `(` or `,`) or a string literal (e.g. /// `literal`). class LiteralElement : public FormatElementBase { public: /// Create a literal element with the given spelling. explicit LiteralElement(StringRef spelling) : spelling(spelling) {} /// Get the spelling of the literal. StringRef getSpelling() const { return spelling; } private: /// The spelling of the variable, i.e. the string contained within the /// backticks. StringRef spelling; }; /// This class represents a raw string that can contain arbitrary C++ code. class StringElement : public FormatElementBase { public: /// Create a string element with the given contents. explicit StringElement(std::string value) : value(std::move(value)) {} /// Get the value of the string element. StringRef getValue() const { return value; } private: /// The contents of the string. std::string value; }; /// This class represents a variable element. A variable refers to some part of /// the object being parsed, e.g. an attribute or operand on an operation or a /// parameter on an attribute. class VariableElement : public FormatElementBase { public: /// These are the kinds of variables. enum Kind { Attribute, Operand, Region, Result, Successor, Parameter, Property }; /// Get the kind of variable. Kind getKind() const { return kind; } protected: /// Create a variable with a kind. VariableElement(Kind kind) : kind(kind) {} private: /// The kind of variable. Kind kind; }; /// Base class for variable elements. This class implements common methods for /// LLVM-style RTTI. template class VariableElementBase : public VariableElement { public: /// An element is of this class if it is a variable and has the same variable /// type. static bool classof(const FormatElement *el) { if (auto *varEl = dyn_cast(el)) return VariableKind == varEl->getKind(); return false; } protected: /// Create a variable element with the given variable kind. VariableElementBase() : VariableElement(VariableKind) {} }; /// This class represents a whitespace element, e.g. a newline or space. It is a /// literal that is printed but never parsed. When the value is empty, i.e. ``, /// a space is elided where one would have been printed automatically. class WhitespaceElement : public FormatElementBase { public: /// Create a whitespace element. explicit WhitespaceElement(StringRef value) : value(value) {} /// Get the whitespace value. StringRef getValue() const { return value; } private: /// The value of the whitespace element. Can be empty. StringRef value; }; class DirectiveElement : public FormatElementBase { public: /// These are the kinds of directives. enum Kind { AttrDict, PropDict, Custom, FunctionalType, OIList, Operands, Ref, Regions, Results, Successors, Type, Params, Struct }; /// Get the directive kind. Kind getKind() const { return kind; } protected: /// Create a directive element with a kind. DirectiveElement(Kind kind) : kind(kind) {} private: /// The directive kind. Kind kind; }; /// Base class for directive elements. This class implements common methods for /// LLVM-style RTTI. template class DirectiveElementBase : public DirectiveElement { public: /// Create a directive element with the specified kind. DirectiveElementBase() : DirectiveElement(DirectiveKind) {} /// A format element is of this class if it is a directive element and has the /// same kind. static bool classof(const FormatElement *el) { if (auto *directiveEl = dyn_cast(el)) return DirectiveKind == directiveEl->getKind(); return false; } }; /// This class represents a custom format directive that is implemented by the /// user in C++. The directive accepts a list of arguments that is passed to the /// C++ function. class CustomDirective : public DirectiveElementBase { public: /// Create a custom directive with a name and list of arguments. CustomDirective(StringRef name, std::vector &&arguments) : name(name), arguments(std::move(arguments)) {} /// Get the custom directive name. StringRef getName() const { return name; } /// Get the arguments to the custom directive. ArrayRef getArguments() const { return arguments; } private: /// The name of the custom directive. The name is used to call two C++ /// methods: `parse{name}` and `print{name}` with the given arguments. StringRef name; /// The arguments with which to call the custom functions. These are either /// variables (for which the functions are responsible for populating) or /// references to variables. std::vector arguments; }; /// This class represents a reference directive. This directive can be used to /// reference but not bind a previously bound variable or format object. Its /// current only use is to pass variables as arguments to the custom directive. class RefDirective : public DirectiveElementBase { public: /// Create a reference directive with the single referenced child. RefDirective(FormatElement *arg) : arg(arg) {} /// Get the reference argument. FormatElement *getArg() const { return arg; } private: /// The referenced argument. FormatElement *arg; }; /// This class represents a group of elements that are optionally emitted based /// on an optional variable "anchor" and a group of elements that are emitted /// when the anchor element is not present. class OptionalElement : public FormatElementBase { public: /// Create an optional group with the given child elements. OptionalElement(std::vector &&thenElements, std::vector &&elseElements, unsigned thenParseStart, unsigned elseParseStart, FormatElement *anchor, bool inverted) : thenElements(std::move(thenElements)), elseElements(std::move(elseElements)), thenParseStart(thenParseStart), elseParseStart(elseParseStart), anchor(anchor), inverted(inverted) {} /// Return the `then` elements of the optional group. Drops the first /// `thenParseStart` whitespace elements if `parseable` is true. ArrayRef getThenElements(bool parseable = false) const { return llvm::ArrayRef(thenElements) .drop_front(parseable ? thenParseStart : 0); } /// Return the `else` elements of the optional group. Drops the first /// `elseParseStart` whitespace elements if `parseable` is true. ArrayRef getElseElements(bool parseable = false) const { return llvm::ArrayRef(elseElements) .drop_front(parseable ? elseParseStart : 0); } /// Return the anchor of the optional group. FormatElement *getAnchor() const { return anchor; } /// Return true if the optional group is inverted. bool isInverted() const { return inverted; } private: /// The child elements emitted when the anchor is present. std::vector thenElements; /// The child elements emitted when the anchor is not present. std::vector elseElements; /// The index of the first element that is parsed in `thenElements`. That is, /// the first non-whitespace element. unsigned thenParseStart; /// The index of the first element that is parsed in `elseElements`. That is, /// the first non-whitespace element. unsigned elseParseStart; /// The anchor element of the optional group. FormatElement *anchor; /// Whether the optional group condition is inverted and the anchor element is /// in the else group. bool inverted; }; //===----------------------------------------------------------------------===// // FormatParserBase //===----------------------------------------------------------------------===// /// Base class for a parser that implements an assembly format. This class /// defines a common assembly format syntax and the creation of format elements. /// Subclasses will need to implement parsing for the format elements they /// support. class FormatParser { public: /// Vtable anchor. virtual ~FormatParser(); /// Parse the assembly format. FailureOr> parse(); protected: /// The current context of the parser when parsing an element. enum Context { /// The element is being parsed in a "top-level" context, i.e. at the top of /// the format or in an optional group. TopLevelContext, /// The element is being parsed as a custom directive child. CustomDirectiveContext, /// The element is being parsed as a type directive child. TypeDirectiveContext, /// The element is being parsed as a reference directive child. RefDirectiveContext, /// The element is being parsed as a struct directive child. StructDirectiveContext }; /// Create a format parser with the given source manager and a location. explicit FormatParser(llvm::SourceMgr &mgr, llvm::SMLoc loc) : lexer(mgr, loc), curToken(lexer.lexToken()) {} /// Allocate and construct a format element. template FormatElementT *create(Args &&...args) { // FormatElementT *ptr = allocator.Allocate(); // ::new (ptr) FormatElementT(std::forward(args)...); // return ptr; auto mem = std::make_unique(std::forward(args)...); FormatElementT *ptr = mem.get(); allocator.push_back(std::move(mem)); return ptr; } //===--------------------------------------------------------------------===// // Element Parsing /// Parse a single element of any kind. FailureOr parseElement(Context ctx); /// Parse a literal. FailureOr parseLiteral(Context ctx); /// Parse a string. FailureOr parseString(Context ctx); /// Parse a variable. FailureOr parseVariable(Context ctx); /// Parse a directive. FailureOr parseDirective(Context ctx); /// Parse an optional group. FailureOr parseOptionalGroup(Context ctx); /// Parse a custom directive. FailureOr parseCustomDirective(llvm::SMLoc loc, Context ctx); /// Parse a ref directive. FailureOr parseRefDirective(SMLoc loc, Context context); /// Parse a qualified directive. FailureOr parseQualifiedDirective(SMLoc loc, Context ctx); /// Parse a format-specific variable kind. virtual FailureOr parseVariableImpl(llvm::SMLoc loc, StringRef name, Context ctx) = 0; /// Parse a format-specific directive kind. virtual FailureOr parseDirectiveImpl(llvm::SMLoc loc, FormatToken::Kind kind, Context ctx) = 0; //===--------------------------------------------------------------------===// // Format Verification /// Verify that the format is well-formed. virtual LogicalResult verify(llvm::SMLoc loc, ArrayRef elements) = 0; /// Verify the arguments to a custom directive. virtual LogicalResult verifyCustomDirectiveArguments(llvm::SMLoc loc, ArrayRef arguments) = 0; /// Verify the elements of an optional group. virtual LogicalResult verifyOptionalGroupElements(llvm::SMLoc loc, ArrayRef elements, FormatElement *anchor) = 0; /// Mark 'element' as qualified. If 'element' cannot be qualified an error /// should be emitted and failure returned. virtual LogicalResult markQualified(llvm::SMLoc loc, FormatElement *element) = 0; //===--------------------------------------------------------------------===// // Lexer Utilities /// Emit an error at the given location. LogicalResult emitError(llvm::SMLoc loc, const Twine &msg) { lexer.emitError(loc, msg); return failure(); } /// Emit an error and a note at the given notation. LogicalResult emitErrorAndNote(llvm::SMLoc loc, const Twine &msg, const Twine ¬e) { lexer.emitErrorAndNote(loc, msg, note); return failure(); } /// Parse a single token of the expected kind. FailureOr parseToken(FormatToken::Kind kind, const Twine &msg) { if (!curToken.is(kind)) return emitError(curToken.getLoc(), msg); FormatToken tok = curToken; consumeToken(); return tok; } /// Advance the lexer to the next token. void consumeToken() { assert(!curToken.is(FormatToken::eof) && !curToken.is(FormatToken::error) && "shouldn't advance past EOF or errors"); curToken = lexer.lexToken(); } /// Get the current token. FormatToken peekToken() { return curToken; } private: /// The format parser retains ownership of the format elements in a bump /// pointer allocator. // FIXME: FormatElement with `std::vector` need to be converted to use // trailing objects. // llvm::BumpPtrAllocator allocator; std::vector> allocator; /// The format lexer to use. FormatLexer lexer; /// The current token in the lexer. FormatToken curToken; }; //===----------------------------------------------------------------------===// // Utility Functions //===----------------------------------------------------------------------===// /// Whether a space needs to be emitted before a literal. E.g., two keywords /// back-to-back require a space separator, but a keyword followed by '<' does /// not require a space. bool shouldEmitSpaceBefore(StringRef value, bool lastWasPunctuation); /// Returns true if the given string can be formatted as a keyword. bool canFormatStringAsKeyword(StringRef value, function_ref emitError = nullptr); /// Returns true if the given string is valid format literal element. /// If `emitError` is provided, it is invoked with the reason for the failure. bool isValidLiteral(StringRef value, function_ref emitError = nullptr); /// Whether a failure in parsing the assembly format should be a fatal error. extern llvm::cl::opt formatErrorIsFatal; } // namespace tblgen } // namespace mlir #endif // MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_