//===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Various code that examines C++ source code without using heavy AST machinery // (and often not even the lexer). To be used sparingly! // //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H #include "Protocol.h" #include "support/Context.h" #include "support/ThreadsafeFS.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Tooling/Core/Replacement.h" #include "clang/Tooling/Syntax/Tokens.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/Error.h" #include #include namespace clang { class SourceManager; namespace clangd { // We tend to generate digests for source codes in a lot of different places. // This represents the type for those digests to prevent us hard coding details // of hashing function at every place that needs to store this information. using FileDigest = std::array; FileDigest digest(StringRef Content); std::optional digestFile(const SourceManager &SM, FileID FID); // This context variable controls the behavior of functions in this file // that convert between LSP offsets and native clang byte offsets. // If not set, defaults to UTF-16 for backwards-compatibility. extern Key kCurrentOffsetEncoding; // Counts the number of UTF-16 code units needed to represent a string (LSP // specifies string lengths in UTF-16 code units). // Use of UTF-16 may be overridden by kCurrentOffsetEncoding. size_t lspLength(StringRef Code); /// Turn a [line, column] pair into an offset in Code. /// /// If P.character exceeds the line length, returns the offset at end-of-line. /// (If !AllowColumnsBeyondLineLength, then returns an error instead). /// If the line number is out of range, returns an error. /// /// The returned value is in the range [0, Code.size()]. llvm::Expected positionToOffset(llvm::StringRef Code, Position P, bool AllowColumnsBeyondLineLength = true); /// Turn an offset in Code into a [line, column] pair. /// The offset must be in range [0, Code.size()]. Position offsetToPosition(llvm::StringRef Code, size_t Offset); /// Turn a SourceLocation into a [line, column] pair. /// FIXME: This should return an error if the location is invalid. Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc); /// Return the file location, corresponding to \p P. Note that one should take /// care to avoid comparing the result with expansion locations. llvm::Expected sourceLocationInMainFile(const SourceManager &SM, Position P); /// Returns true iff \p Loc is inside the main file. This function handles /// file & macro locations. For macro locations, returns iff the macro is being /// expanded inside the main file. /// /// The function is usually used to check whether a declaration is inside the /// the main file. bool isInsideMainFile(SourceLocation Loc, const SourceManager &SM); /// Returns the #include location through which IncludedFIle was loaded. /// Where SM.getIncludeLoc() returns the location of the *filename*, which may /// be in a macro, includeHashLoc() returns the location of the #. SourceLocation includeHashLoc(FileID IncludedFile, const SourceManager &SM); /// Returns true if the token at Loc is spelled in the source code. /// This is not the case for: /// * symbols formed via macro concatenation, the spelling location will /// be "" /// * symbols controlled and defined by a compile command-line option /// `-DName=foo`, the spelling location will be "". bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM); /// Turns a token range into a half-open range and checks its correctness. /// The resulting range will have only valid source location on both sides, both /// of which are file locations. /// /// File locations always point to a particular offset in a file, i.e. they /// never refer to a location inside a macro expansion. Turning locations from /// macro expansions into file locations is ambiguous - one can use /// SourceManager::{getExpansion|getFile|getSpelling}Loc. This function /// calls SourceManager::getFileLoc on both ends of \p R to do the conversion. /// /// User input (e.g. cursor position) is expressed as a file location, so this /// function can be viewed as a way to normalize the ranges used in the clang /// AST so that they are comparable with ranges coming from the user input. std::optional toHalfOpenFileRange(const SourceManager &Mgr, const LangOptions &LangOpts, SourceRange R); /// Returns true iff all of the following conditions hold: /// - start and end locations are valid, /// - start and end locations are file locations from the same file /// (i.e. expansion locations are not taken into account). /// - start offset <= end offset. /// FIXME: introduce a type for source range with this invariant. bool isValidFileRange(const SourceManager &Mgr, SourceRange R); /// Returns the source code covered by the source range. /// EXPECTS: isValidFileRange(R) == true. llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R); // Converts a half-open clang source range to an LSP range. // Note that clang also uses closed source ranges, which this can't handle! Range halfOpenToRange(const SourceManager &SM, CharSourceRange R); // Expand range `A` to also contain `B`. void unionRanges(Range &A, Range B); // Converts an offset to a clang line/column (1-based, columns are bytes). // The offset must be in range [0, Code.size()]. // Prefer to use SourceManager if one is available. std::pair offsetToClangLineColumn(llvm::StringRef Code, size_t Offset); /// From "a::b::c", return {"a::b::", "c"}. Scope is empty if there's no /// qualifier. std::pair splitQualifiedName(llvm::StringRef QName); TextEdit replacementToEdit(StringRef Code, const tooling::Replacement &R); std::vector replacementsToEdits(StringRef Code, const tooling::Replacements &Repls); TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, const LangOptions &L); /// Get the canonical path of \p F. This means: /// /// - Absolute path /// - Symlinks resolved /// - No "." or ".." component /// - No duplicate or trailing directory separator /// /// This function should be used when paths needs to be used outside the /// component that generate it, so that paths are normalized as much as /// possible. std::optional getCanonicalPath(const FileEntryRef F, FileManager &FileMgr); /// Choose the clang-format style we should apply to a certain file. /// This will usually use FS to look for .clang-format directories. /// FIXME: should we be caching the .clang-format file search? /// This uses format::DefaultFormatStyle and format::DefaultFallbackStyle, /// though the latter may have been overridden in main()! format::FormatStyle getFormatStyleForFile(llvm::StringRef File, llvm::StringRef Content, const ThreadsafeFS &TFS); /// Cleanup and format the given replacements. llvm::Expected cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces, const format::FormatStyle &Style); /// A set of edits generated for a single file. Can verify whether it is safe to /// apply these edits to a code block. struct Edit { tooling::Replacements Replacements; std::string InitialCode; Edit() = default; Edit(llvm::StringRef Code, tooling::Replacements Reps) : Replacements(std::move(Reps)), InitialCode(Code) {} /// Returns the file contents after changes are applied. llvm::Expected apply() const; /// Represents Replacements as TextEdits that are available for use in LSP. std::vector asTextEdits() const; /// Checks whether the Replacements are applicable to given Code. bool canApplyTo(llvm::StringRef Code) const; }; /// A mapping from absolute file path (the one used for accessing the underlying /// VFS) to edits. using FileEdits = llvm::StringMap; /// Formats the edits and code around it according to Style. Changes /// Replacements to formatted ones if succeeds. llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style); /// Apply an incremental update to a text document. llvm::Error applyChange(std::string &Contents, const TextDocumentContentChangeEvent &Change); /// Collects identifiers with counts in the source code. llvm::StringMap collectIdentifiers(llvm::StringRef Content, const format::FormatStyle &Style); /// Collects all ranges of the given identifier in the source code. std::vector collectIdentifierRanges(llvm::StringRef Identifier, llvm::StringRef Content, const LangOptions &LangOpts); /// Collects words from the source code. /// Unlike collectIdentifiers: /// - also finds text in comments: /// - splits text into words /// - drops stopwords like "get" and "for" llvm::StringSet<> collectWords(llvm::StringRef Content); // Something that looks like a word in the source code. // Could be a "real" token that's "live" in the AST, a spelled token consumed by // the preprocessor, or part of a spelled token (e.g. word in a comment). struct SpelledWord { // (Spelling) location of the start of the word. SourceLocation Location; // The range of the word itself, excluding any quotes. // This is a subrange of the file buffer. llvm::StringRef Text; // Whether this word is likely to refer to an identifier. True if: // - the word is a spelled identifier token // - Text is identifier-like (e.g. "foo_bar") // - Text is surrounded by backticks (e.g. Foo in "// returns `Foo`") bool LikelyIdentifier = false; // Set if the word is contained in a token spelled in the file. // (This should always be true, but comments aren't retained by TokenBuffer). const syntax::Token *PartOfSpelledToken = nullptr; // Set if the word is exactly a token spelled in the file. const syntax::Token *SpelledToken = nullptr; // Set if the word is a token spelled in the file, and that token survives // preprocessing to emit an expanded token spelled the same way. const syntax::Token *ExpandedToken = nullptr; // Find the unique word that contains SpelledLoc or starts/ends there. static std::optional touching(SourceLocation SpelledLoc, const syntax::TokenBuffer &TB, const LangOptions &LangOpts); }; /// Return true if the \p TokenName is in the list of reversed keywords of the /// language. bool isKeyword(llvm::StringRef TokenName, const LangOptions &LangOpts); /// Heuristically determine namespaces visible at a point, without parsing Code. /// This considers using-directives and enclosing namespace-declarations that /// are visible (and not obfuscated) in the file itself (not headers). /// Code should be truncated at the point of interest. /// /// The returned vector is always non-empty. /// - The first element is the namespace that encloses the point: a declaration /// near the point would be within this namespace. /// - The elements are the namespaces in scope at the point: an unqualified /// lookup would search within these namespaces. /// /// Using directives are resolved against all enclosing scopes, but no other /// namespace directives. /// /// example: /// using namespace a; /// namespace foo { /// using namespace b; /// /// visibleNamespaces are {"foo::", "", "a::", "b::", "foo::b::"}, not "a::b::". std::vector visibleNamespaces(llvm::StringRef Code, const LangOptions &LangOpts); /// Represents locations that can accept a definition. struct EligibleRegion { /// Namespace that owns all of the EligiblePoints, e.g. /// namespace a{ namespace b {^ void foo();^} } /// It will be “a::b” for both carrot locations. std::string EnclosingNamespace; /// Offsets into the code marking eligible points to insert a function /// definition. std::vector EligiblePoints; }; /// Returns most eligible region to insert a definition for \p /// FullyQualifiedName in the \p Code. /// Pseudo parses \pCode under the hood to determine namespace decls and /// possible insertion points. Choses the region that matches the longest prefix /// of \p FullyQualifiedName. Returns EOF if there are no shared namespaces. /// \p FullyQualifiedName should not contain anonymous namespaces. EligibleRegion getEligiblePoints(llvm::StringRef Code, llvm::StringRef FullyQualifiedName, const LangOptions &LangOpts); struct DefinedMacro { llvm::StringRef Name; const MacroInfo *Info; /// Location of the identifier that names the macro. /// Unlike Info->Location, this translates preamble-patch locations to /// main-file locations. SourceLocation NameLoc; }; /// Gets the macro referenced by \p SpelledTok. It must be a spelled token /// aligned to the beginning of an identifier. std::optional locateMacroAt(const syntax::Token &SpelledTok, Preprocessor &PP); /// Infers whether this is a header from the FileName and LangOpts (if /// presents). bool isHeaderFile(llvm::StringRef FileName, std::optional LangOpts = std::nullopt); /// Returns true if the given location is in a generated protobuf file. bool isProtoFile(SourceLocation Loc, const SourceManager &SourceMgr); /// Returns true if Name is reserved, like _Foo or __Vector_base. inline bool isReservedName(llvm::StringRef Name) { // This doesn't catch all cases, but the most common. return Name.size() >= 2 && Name[0] == '_' && (isUppercase(Name[1]) || Name[1] == '_'); } /// Translates locations inside preamble patch to their main-file equivalent /// using presumed locations. Returns \p Loc if it isn't inside preamble patch. SourceLocation translatePreamblePatchLocation(SourceLocation Loc, const SourceManager &SM); /// Returns the range starting at offset and spanning the whole line. Escaped /// newlines are not handled. clangd::Range rangeTillEOL(llvm::StringRef Code, unsigned HashOffset); } // namespace clangd } // namespace clang #endif