author | Steven Wu <stevenwu@apple.com> | 2024-10-29 10:37:31 -0700 |
---|---|---|
committer | Steven Wu <stevenwu@apple.com> | 2024-10-29 10:37:31 -0700 |
commit | 424bd23894010780c0477da4324fd4e97194339f | |
tree | adb2c8f49f7625f8aaf1a2996b163f4ec0d94086 | |
parent | b510cdb895b9188e5819c4c85a6dab22a4d14385 | |
[spr] changes to main this commit is based on users/cachemeifyoucan/spr/main.cas-add-ondiskgraphdb-and-ondiskkeyvaluedb
Created using spr 1.3.5
[skip ci]
36 files changed, 4946 insertions, 10 deletions
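For orientation, the following is a minimal, hypothetical sketch of how the ObjectStore and ActionCache APIs introduced in this patch might be used together. It is not part of the commit; it relies only on the public declarations added below (`createInMemoryCAS()`, `createInMemoryActionCache()`, `storeFromString()`, `getID()`, `CacheKey`, `put()`, `get()`), assumes the two in-memory implementations share the builtin hash schema, and keeps error handling deliberately simple.

```
// Illustrative sketch only (not part of this commit): exercises the public
// APIs declared in llvm/CAS/ObjectStore.h and llvm/CAS/ActionCache.h below.
#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/ObjectStore.h"
#include "llvm/Support/Error.h"
#include <cassert>
#include <memory>
#include <optional>

using namespace llvm;
using namespace llvm::cas;

static Expected<CASID> storeAndCache() {
  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
  std::unique_ptr<ActionCache> Cache = createInMemoryActionCache();

  // Store a leaf object with no references, then a root that refers to it.
  Expected<ObjectRef> Leaf = CAS->storeFromString({}, "some data");
  if (!Leaf)
    return Leaf.takeError();
  Expected<ObjectRef> Root = CAS->storeFromString({*Leaf}, "root data");
  if (!Root)
    return Root.takeError();

  // Associate a cache key (derived from the root object) with a result CASID.
  // Assumes the in-memory CAS and action cache use the same builtin hash
  // schema, as required by ActionCache::put().
  CASID RootID = CAS->getID(*Root);
  if (Error E = Cache->put(CacheKey(*CAS, *Root), RootID))
    return std::move(E);

  // A later lookup with the same key returns the cached CASID.
  Expected<std::optional<CASID>> Cached = Cache->get(CacheKey(*CAS, *Root));
  if (!Cached)
    return Cached.takeError();
  assert(*Cached && **Cached == RootID && "expected a cache hit");
  return RootID;
}
```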
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index cde4a99..3ac4dde 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -837,6 +837,13 @@ option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF) option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON) option (LLVM_ENABLE_BINDINGS "Build bindings." ON) +if(UNIX AND CMAKE_SIZEOF_VOID_P GREATER_EQUAL 8) + set(LLVM_ENABLE_ONDISK_CAS_default ON) +else() + set(LLVM_ENABLE_ONDISK_CAS_default OFF) +endif() +option(LLVM_ENABLE_ONDISK_CAS "Build OnDiskCAS." ${LLVM_ENABLE_ONDISK_CAS_default}) + set(LLVM_INSTALL_DOXYGEN_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/doxygen-html" CACHE STRING "Doxygen-generated HTML documentation install directory") set(LLVM_INSTALL_OCAMLDOC_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/ocaml-html" diff --git a/llvm/docs/ContentAddressableStorage.md b/llvm/docs/ContentAddressableStorage.md new file mode 100644 index 0000000..4f2d9a6 --- /dev/null +++ b/llvm/docs/ContentAddressableStorage.md @@ -0,0 +1,120 @@ +# Content Addressable Storage + +## Introduction to CAS + +Content Addressable Storage, or `CAS`, is a storage system that assigns +unique addresses to the data it stores. It is very useful for data +deduplication and for creating unique identifiers. + +Unlike other kinds of storage systems, such as file systems, CAS is immutable. +This makes it more reliable to model a computation by representing the inputs +and outputs of the computation as objects stored in CAS. + +The basic unit of the CAS library is a CASObject, which contains: + +* Data: arbitrary data +* References: references to other CASObjects + +It can be conceptually modeled as something like: + +``` +struct CASObject { + ArrayRef<char> Data; + ArrayRef<CASObject*> Refs; +} +``` + +This abstraction allows simple composition of CASObjects into a DAG to +represent complicated data structures while still allowing data deduplication. +Note that you can compare two DAGs by just comparing the CASObject hashes of +their root nodes. + + + +## LLVM CAS Library User Guide + +The CAS-like storage provided in LLVM is `llvm::cas::ObjectStore`. +To reference a CASObject, there are a few different abstractions provided, +with different trade-offs: + +### ObjectRef + +`ObjectRef` is a lightweight reference to a CASObject stored in the CAS. +This is the most commonly used abstraction and it is cheap to copy/pass +along. It has the following properties: + +* `ObjectRef` is only meaningful within the `ObjectStore` that created the ref. +`ObjectRef`s created by different `ObjectStore`s cannot be cross-referenced or +compared. +* `ObjectRef` doesn't guarantee the existence of the CASObject it points to. An +explicit load is required before accessing the data stored in the CASObject. +This load can also fail, for reasons including but not limited to: the object +does not exist, the CAS storage is corrupted, or the operation timed out. +* If two `ObjectRef`s are equal, it is guaranteed that the objects they point to +(if they exist) are identical. If they are not equal, the underlying objects are +guaranteed to be different. + +### ObjectProxy + +`ObjectProxy` represents a loaded CASObject. With an `ObjectProxy`, the +underlying stored data and references can be accessed without the need +for error handling. The class APIs also provide convenient methods to +access the underlying data. The lifetime of the underlying data is equal to +the lifetime of the `ObjectStore` instance unless explicitly copied. + +### CASID + +`CASID` is the hash identifier for CASObjects.
It owns the underlying storage for the hash value, so it can be expensive to +copy and compare, depending on the hash algorithm. `CASID` is generally only +useful in rare situations like printing the raw hash value or exchanging hash +values between different CAS instances that use the same hashing schema. + +### ObjectStore + +`ObjectStore` is the CAS-like object storage. It provides APIs to save +and load CASObjects, for example: + +``` +ObjectRef A, B, C; +Expected<ObjectRef> Stored = ObjectStore.storeFromString({A, B}, "data"); +Expected<ObjectProxy> Loaded = ObjectStore.getProxy(C); +``` + +It also provides APIs to convert between `ObjectRef`, `ObjectProxy` and +`CASID`. + + + +## CAS Library Implementation Guide + +The LLVM ObjectStore APIs are designed so that it is easy to add a +customized CAS implementation that is interchangeable with the builtin +CAS implementations. + +To add your own implementation, you just need to subclass +`llvm::cas::ObjectStore` and implement all its pure virtual methods. +To be interchangeable with the LLVM ObjectStore, the new CAS implementation +needs to conform to the following contracts: + +* Different CASObjects stored in the ObjectStore need to have different hashes +and result in different `ObjectRef`s. Conversely, the same CASObject should +have the same hash and the same `ObjectRef`. Note that two CASObjects with +identical data but different references are considered different objects. +* `ObjectRef`s are comparable within the same `ObjectStore` instance, and can +be used to determine the equality of the underlying CASObjects. +* Objects loaded from the ObjectStore need to have a lifetime at least as long +as the ObjectStore itself. + +If a behavior is not specified here, it is implementation defined. For example, +an `ObjectRef` may point to an already loaded CASObject so that the +`ObjectStore` never fails to load. It is also legal to use a stricter model +than required. For example, an `ObjectRef` that can be used to compare +objects between different `ObjectStore` instances is legal, but users +of the ObjectStore should not depend on this behavior. + +For CAS library implementers, there is also an `ObjectHandle` class, which +is an internal representation of a loaded CASObject reference. +`ObjectProxy` is just a pair of an `ObjectHandle` and an `ObjectStore` because, +just like `ObjectRef`, an `ObjectHandle` is only useful when paired with +the ObjectStore that knows about the loaded CASObject. diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst index df61628..ae03a3a 100644 --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -15,6 +15,7 @@ LLVM and API reference documentation. BranchWeightMetadata Bugpoint CommandGuide/index + ContentAddressableStorage ConvergenceAndUniformity ConvergentOperations Coroutines @@ -232,3 +233,6 @@ Additional Topics :doc:`ConvergenceAndUniformity` A description of uniformity analysis in the presence of irreducible control flow, and its implementation. + +:doc:`ContentAddressableStorage` + A reference guide for using LLVM's CAS library. diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h new file mode 100644 index 0000000..e6dfdd4 --- /dev/null +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -0,0 +1,94 @@ +//===- llvm/CAS/ActionCache.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_CASACTIONCACHE_H +#define LLVM_CAS_CASACTIONCACHE_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/CASID.h" +#include "llvm/CAS/CASReference.h" +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ObjectStore; +class CASID; +class ObjectProxy; + +/// A key for caching an operation. +/// It is implemented as a bag of bytes and provides a convenient constructor +/// for CAS types. +class CacheKey { +public: + StringRef getKey() const { return Key; } + + // TODO: Support CacheKey other than a CASID but rather any array of bytes. + // To do that, ActionCache need to be able to rehash the key into the index, + // which then `getOrCompute` method can be used to avoid multiple calls to + // has function. + CacheKey(const CASID &ID); + CacheKey(const ObjectProxy &Proxy); + CacheKey(const ObjectStore &CAS, const ObjectRef &Ref); + +private: + std::string Key; +}; + +/// A cache from a key describing an action to the result of doing it. +/// +/// Actions are expected to be pure (collision is an error). +class ActionCache { + virtual void anchor(); + +public: + /// Get a previously computed result for \p ActionKey. + /// + /// \param Globally if true it is a hint to the underlying implementation that + /// the lookup is profitable to be done on a distributed caching level, not + /// just locally. The implementation is free to ignore this flag. + Expected<std::optional<CASID>> get(const CacheKey &ActionKey, + bool Globally = false) const { + return getImpl(arrayRefFromStringRef(ActionKey.getKey()), Globally); + } + + /// Cache \p Result for the \p ActionKey computation. + /// + /// \param Globally if true it is a hint to the underlying implementation that + /// the association is profitable to be done on a distributed caching level, + /// not just locally. The implementation is free to ignore this flag. + Error put(const CacheKey &ActionKey, const CASID &Result, + bool Globally = false) { + assert(Result.getContext().getHashSchemaIdentifier() == + getContext().getHashSchemaIdentifier() && + "Hash schema mismatch"); + return putImpl(arrayRefFromStringRef(ActionKey.getKey()), Result, Globally); + } + + virtual ~ActionCache() = default; + +protected: + virtual Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ResolvedKey, + bool Globally) const = 0; + + virtual Error putImpl(ArrayRef<uint8_t> ResolvedKey, const CASID &Result, + bool Globally) = 0; + + ActionCache(const CASContext &Context) : Context(Context) {} + + const CASContext &getContext() const { return Context; } + +private: + const CASContext &Context; +}; + +/// Create an action cache in memory. +std::unique_ptr<ActionCache> createInMemoryActionCache(); + +} // end namespace llvm::cas + +#endif // LLVM_CAS_CASACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinCASContext.h b/llvm/include/llvm/CAS/BuiltinCASContext.h new file mode 100644 index 0000000..ebc4ca8 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinCASContext.h @@ -0,0 +1,88 @@ +//===- BuiltinCASContext.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINCASCONTEXT_H +#define LLVM_CAS_BUILTINCASCONTEXT_H + +#include "llvm/CAS/CASID.h" +#include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Error.h" + +namespace llvm::cas::builtin { + +/// Current hash type for the builtin CAS. +/// +/// FIXME: This should be configurable via an enum to allow configuring the hash +/// function. The enum should be sent into \a createInMemoryCAS() and \a +/// createOnDiskCAS(). +/// +/// This is important (at least) for future-proofing, when we want to make new +/// CAS instances use BLAKE7, but still know how to read/write BLAKE3. +/// +/// Even just for BLAKE3, it would be useful to have these values: +/// +/// BLAKE3 => 32B hash from BLAKE3 +/// BLAKE3_16B => 16B hash from BLAKE3 (truncated) +/// +/// ... where BLAKE3_16 uses \a TruncatedBLAKE3<16>. +/// +/// Motivation for a truncated hash is that it's cheaper to store. It's not +/// clear if we always (or ever) need the full 32B, and for an ephemeral +/// in-memory CAS, we almost certainly don't need it. +/// +/// Note that the cost is linear in the number of objects for the builtin CAS, +/// since we're using internal offsets and/or pointers as an optimization. +/// +/// However, it's possible we'll want to hook up a local builtin CAS to, e.g., +/// a distributed generic hash map to use as an ActionCache. In that scenario, +/// the transitive closure of the structured objects that are the results of +/// the cached actions would need to be serialized into the map, something +/// like: +/// +/// "action:<schema>:<key>" -> "0123" +/// "object:<schema>:0123" -> "3,4567,89AB,CDEF,9,some data" +/// "object:<schema>:4567" -> ... +/// "object:<schema>:89AB" -> ... +/// "object:<schema>:CDEF" -> ... +/// +/// These references would be full cost. +using HasherT = BLAKE3; +using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>())); + +class BuiltinCASContext : public CASContext { + void printIDImpl(raw_ostream &OS, const CASID &ID) const final; + void anchor() override; + +public: + /// Get the name of the hash for any table identifiers. + /// + /// FIXME: This should be configurable via an enum, with at the following + /// values: + /// + /// "BLAKE3" => 32B hash from BLAKE3 + /// "BLAKE3.16" => 16B hash from BLAKE3 (truncated) + /// + /// Enum can be sent into \a createInMemoryCAS() and \a createOnDiskCAS(). + static StringRef getHashName() { return "BLAKE3"; } + StringRef getHashSchemaIdentifier() const final { + static const std::string ID = + ("llvm.cas.builtin.v2[" + getHashName() + "]").str(); + return ID; + } + + static const BuiltinCASContext &getDefaultContext(); + + BuiltinCASContext() = default; + + static Expected<HashType> parseID(StringRef PrintedDigest); + static void printID(ArrayRef<uint8_t> Digest, raw_ostream &OS); +}; + +} // namespace llvm::cas::builtin + +#endif // LLVM_CAS_BUILTINCASCONTEXT_H diff --git a/llvm/include/llvm/CAS/BuiltinObjectHasher.h b/llvm/include/llvm/CAS/BuiltinObjectHasher.h new file mode 100644 index 0000000..22e556c --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinObjectHasher.h @@ -0,0 +1,81 @@ +//===- BuiltinObjectHasher.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINOBJECTHASHER_H +#define LLVM_CAS_BUILTINOBJECTHASHER_H + +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/Endian.h" + +namespace llvm::cas { + +template <class HasherT> class BuiltinObjectHasher { +public: + using HashT = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>())); + + static HashT hashObject(const ObjectStore &CAS, ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + BuiltinObjectHasher H; + H.updateSize(Refs.size()); + for (const ObjectRef &Ref : Refs) + H.updateRef(CAS, Ref); + H.updateArray(Data); + return H.finish(); + } + + static HashT hashObject(ArrayRef<ArrayRef<uint8_t>> Refs, + ArrayRef<char> Data) { + BuiltinObjectHasher H; + H.updateSize(Refs.size()); + for (const ArrayRef<uint8_t> &Ref : Refs) + H.updateID(Ref); + H.updateArray(Data); + return H.finish(); + } + +private: + HashT finish() { return Hasher.final(); } + + void updateRef(const ObjectStore &CAS, ObjectRef Ref) { + updateID(CAS.getID(Ref)); + } + + void updateID(const CASID &ID) { updateID(ID.getHash()); } + + void updateID(ArrayRef<uint8_t> Hash) { + // NOTE: Does not hash the size of the hash. That's a CAS implementation + // detail that shouldn't leak into the UUID for an object. + assert(Hash.size() == sizeof(HashT) && + "Expected object ref to match the hash size"); + Hasher.update(Hash); + } + + void updateArray(ArrayRef<uint8_t> Bytes) { + updateSize(Bytes.size()); + Hasher.update(Bytes); + } + + void updateArray(ArrayRef<char> Bytes) { + updateArray(ArrayRef(reinterpret_cast<const uint8_t *>(Bytes.data()), + Bytes.size())); + } + + void updateSize(uint64_t Size) { + Size = support::endian::byte_swap(Size, endianness::little); + Hasher.update( + ArrayRef(reinterpret_cast<const uint8_t *>(&Size), sizeof(Size))); + } + + BuiltinObjectHasher() = default; + ~BuiltinObjectHasher() = default; + HasherT Hasher; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINOBJECTHASHER_H diff --git a/llvm/include/llvm/CAS/CASID.h b/llvm/include/llvm/CAS/CASID.h new file mode 100644 index 0000000..5f9110a --- /dev/null +++ b/llvm/include/llvm/CAS/CASID.h @@ -0,0 +1,156 @@ +//===- llvm/CAS/CASID.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_CASID_H +#define LLVM_CAS_CASID_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +namespace llvm { + +class raw_ostream; + +namespace cas { + +class CASID; + +/// Context for CAS identifiers. +class CASContext { + virtual void anchor(); + +public: + virtual ~CASContext() = default; + + /// Get an identifer for the schema used by this CAS context. Two CAS + /// instances should return \c true for this identifier if and only if their + /// CASIDs are safe to compare by hash. This is used by \a + /// CASID::equalsImpl(). + virtual StringRef getHashSchemaIdentifier() const = 0; + +protected: + /// Print \p ID to \p OS. 
+ virtual void printIDImpl(raw_ostream &OS, const CASID &ID) const = 0; + + friend class CASID; +}; + +/// Unique identifier for a CAS object. +/// +/// Locally, stores an internal CAS identifier that's specific to a single CAS +/// instance. It's guaranteed not to change across the view of that CAS, but +/// might change between runs. +/// +/// It also has \a CASIDContext pointer to allow comparison of these +/// identifiers. If two CASIDs are from the same CASIDContext, they can be +/// compared directly. If they are, then \a +/// CASIDContext::getHashSchemaIdentifier() is compared to see if they can be +/// compared by hash, in which case the result of \a getHash() is compared. +class CASID { +public: + void dump() const; + void print(raw_ostream &OS) const { + return getContext().printIDImpl(OS, *this); + } + friend raw_ostream &operator<<(raw_ostream &OS, const CASID &ID) { + ID.print(OS); + return OS; + } + std::string toString() const; + + ArrayRef<uint8_t> getHash() const { + return arrayRefFromStringRef<uint8_t>(Hash); + } + + friend bool operator==(const CASID &LHS, const CASID &RHS) { + if (LHS.Context == RHS.Context) + return LHS.Hash == RHS.Hash; + + // EmptyKey or TombstoneKey. + if (!LHS.Context || !RHS.Context) + return false; + + // CASIDs are equal when they have the same hash schema and same hash value. + return LHS.Context->getHashSchemaIdentifier() == + RHS.Context->getHashSchemaIdentifier() && + LHS.Hash == RHS.Hash; + } + + friend bool operator!=(const CASID &LHS, const CASID &RHS) { + return !(LHS == RHS); + } + + friend hash_code hash_value(const CASID &ID) { + ArrayRef<uint8_t> Hash = ID.getHash(); + return hash_combine_range(Hash.begin(), Hash.end()); + } + + const CASContext &getContext() const { + assert(Context && "Tombstone or empty key for DenseMap?"); + return *Context; + } + + static CASID getDenseMapEmptyKey() { + return CASID(nullptr, DenseMapInfo<StringRef>::getEmptyKey()); + } + static CASID getDenseMapTombstoneKey() { + return CASID(nullptr, DenseMapInfo<StringRef>::getTombstoneKey()); + } + + CASID() = delete; + + static CASID create(const CASContext *Context, StringRef Hash) { + return CASID(Context, Hash); + } + +private: + CASID(const CASContext *Context, StringRef Hash) + : Context(Context), Hash(Hash) {} + + const CASContext *Context; + SmallString<32> Hash; +}; + +/// This is used to workaround the issue of MSVC needing default-constructible +/// types for \c std::promise/future. 
+template <typename T> struct AsyncValue { + Expected<std::optional<T>> take() { return std::move(Value); } + + AsyncValue() : Value(std::nullopt) {} + AsyncValue(Error &&E) : Value(std::move(E)) {} + AsyncValue(T &&V) : Value(std::move(V)) {} + AsyncValue(std::nullopt_t) : Value(std::nullopt) {} + AsyncValue(Expected<std::optional<T>> &&Obj) : Value(std::move(Obj)) {} + +private: + Expected<std::optional<T>> Value; +}; + +} // namespace cas + +template <> struct DenseMapInfo<cas::CASID> { + static cas::CASID getEmptyKey() { return cas::CASID::getDenseMapEmptyKey(); } + + static cas::CASID getTombstoneKey() { + return cas::CASID::getDenseMapTombstoneKey(); + } + + static unsigned getHashValue(cas::CASID ID) { + return (unsigned)hash_value(ID); + } + + static bool isEqual(cas::CASID LHS, cas::CASID RHS) { return LHS == RHS; } +}; + +} // namespace llvm + +#endif // LLVM_CAS_CASID_H diff --git a/llvm/include/llvm/CAS/CASReference.h b/llvm/include/llvm/CAS/CASReference.h new file mode 100644 index 0000000..1f435cf --- /dev/null +++ b/llvm/include/llvm/CAS/CASReference.h @@ -0,0 +1,207 @@ +//===- llvm/CAS/CASReference.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_CASREFERENCE_H +#define LLVM_CAS_CASREFERENCE_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +class raw_ostream; + +namespace cas { + +class ObjectStore; + +class ObjectHandle; +class ObjectRef; + +/// Base class for references to things in \a ObjectStore. +class ReferenceBase { +protected: + struct DenseMapEmptyTag {}; + struct DenseMapTombstoneTag {}; + static constexpr uint64_t getDenseMapEmptyRef() { return -1ULL; } + static constexpr uint64_t getDenseMapTombstoneRef() { return -2ULL; } + +public: + /// Get an internal reference. 
+ uint64_t getInternalRef(const ObjectStore &ExpectedCAS) const { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + assert(CAS == &ExpectedCAS && "Extracting reference for the wrong CAS"); +#endif + return InternalRef; + } + + unsigned getDenseMapHash() const { + return (unsigned)llvm::hash_value(InternalRef); + } + bool isDenseMapEmpty() const { return InternalRef == getDenseMapEmptyRef(); } + bool isDenseMapTombstone() const { + return InternalRef == getDenseMapTombstoneRef(); + } + bool isDenseMapSentinel() const { + return isDenseMapEmpty() || isDenseMapTombstone(); + } + +protected: + void print(raw_ostream &OS, const ObjectHandle &This) const; + void print(raw_ostream &OS, const ObjectRef &This) const; + + bool hasSameInternalRef(const ReferenceBase &RHS) const { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + assert( + (isDenseMapSentinel() || RHS.isDenseMapSentinel() || CAS == RHS.CAS) && + "Cannot compare across CAS instances"); +#endif + return InternalRef == RHS.InternalRef; + } + +protected: + friend class ObjectStore; + ReferenceBase(const ObjectStore *CAS, uint64_t InternalRef, bool IsHandle) + : InternalRef(InternalRef) { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + this->CAS = CAS; +#endif + assert(InternalRef != getDenseMapEmptyRef() && "Reserved for DenseMapInfo"); + assert(InternalRef != getDenseMapTombstoneRef() && + "Reserved for DenseMapInfo"); + } + explicit ReferenceBase(DenseMapEmptyTag) + : InternalRef(getDenseMapEmptyRef()) {} + explicit ReferenceBase(DenseMapTombstoneTag) + : InternalRef(getDenseMapTombstoneRef()) {} + +private: + uint64_t InternalRef; + +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + const ObjectStore *CAS = nullptr; +#endif +}; + +/// Reference to an object in a \a ObjectStore instance. +/// +/// If you have an ObjectRef, you know the object exists, and you can point at +/// it from new nodes with \a ObjectStore::store(), but you don't know anything +/// about it. "Loading" the object is a separate step that may not have +/// happened yet, and which can fail (due to filesystem corruption) or +/// introduce latency (if downloading from a remote store). +/// +/// \a ObjectStore::store() takes a list of these, and these are returned by \a +/// ObjectStore::forEachRef() and \a ObjectStore::readRef(), which are accessors +/// for nodes, and \a ObjectStore::getReference(). +/// +/// \a ObjectStore::load() will load the referenced object, and returns \a +/// ObjectHandle, a variant that knows what kind of entity it is. \a +/// ObjectStore::getReferenceKind() can expect the type of reference without +/// asking for unloaded objects to be loaded. +/// +/// This is a wrapper around a \c uint64_t (and a \a ObjectStore instance when +/// assertions are on). If necessary, it can be deconstructed and reconstructed +/// using \a Reference::getInternalRef() and \a +/// Reference::getFromInternalRef(), but clients aren't expected to need to do +/// this. These both require the right \a ObjectStore instance. +class ObjectRef : public ReferenceBase { + struct DenseMapTag {}; + +public: + friend bool operator==(const ObjectRef &LHS, const ObjectRef &RHS) { + return LHS.hasSameInternalRef(RHS); + } + friend bool operator!=(const ObjectRef &LHS, const ObjectRef &RHS) { + return !(LHS == RHS); + } + + /// Allow a reference to be recreated after it's deconstructed. 
+ static ObjectRef getFromInternalRef(const ObjectStore &CAS, + uint64_t InternalRef) { + return ObjectRef(CAS, InternalRef); + } + + static ObjectRef getDenseMapEmptyKey() { + return ObjectRef(DenseMapEmptyTag{}); + } + static ObjectRef getDenseMapTombstoneKey() { + return ObjectRef(DenseMapTombstoneTag{}); + } + + /// Print internal ref and/or CASID. Only suitable for debugging. + void print(raw_ostream &OS) const { return ReferenceBase::print(OS, *this); } + + LLVM_DUMP_METHOD void dump() const; + +private: + friend class ObjectStore; + friend class ReferenceBase; + using ReferenceBase::ReferenceBase; + ObjectRef(const ObjectStore &CAS, uint64_t InternalRef) + : ReferenceBase(&CAS, InternalRef, /*IsHandle=*/false) { + assert(InternalRef != -1ULL && "Reserved for DenseMapInfo"); + assert(InternalRef != -2ULL && "Reserved for DenseMapInfo"); + } + explicit ObjectRef(DenseMapEmptyTag T) : ReferenceBase(T) {} + explicit ObjectRef(DenseMapTombstoneTag T) : ReferenceBase(T) {} + explicit ObjectRef(ReferenceBase) = delete; +}; + +/// Handle to a loaded object in a \a ObjectStore instance. +/// +/// ObjectHandle encapulates a *loaded* object in the CAS. You need one +/// of these to inspect the content of an object: to look at its stored +/// data and references. +class ObjectHandle : public ReferenceBase { +public: + friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) { + return LHS.hasSameInternalRef(RHS); + } + friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) { + return !(LHS == RHS); + } + + /// Print internal ref and/or CASID. Only suitable for debugging. + void print(raw_ostream &OS) const { return ReferenceBase::print(OS, *this); } + + LLVM_DUMP_METHOD void dump() const; + +private: + friend class ObjectStore; + friend class ReferenceBase; + using ReferenceBase::ReferenceBase; + explicit ObjectHandle(ReferenceBase) = delete; + ObjectHandle(const ObjectStore &CAS, uint64_t InternalRef) + : ReferenceBase(&CAS, InternalRef, /*IsHandle=*/true) {} +}; + +} // namespace cas + +template <> struct DenseMapInfo<cas::ObjectRef> { + static cas::ObjectRef getEmptyKey() { + return cas::ObjectRef::getDenseMapEmptyKey(); + } + + static cas::ObjectRef getTombstoneKey() { + return cas::ObjectRef::getDenseMapTombstoneKey(); + } + + static unsigned getHashValue(cas::ObjectRef Ref) { + return Ref.getDenseMapHash(); + } + + static bool isEqual(cas::ObjectRef LHS, cas::ObjectRef RHS) { + return LHS == RHS; + } +}; + +} // namespace llvm + +#endif // LLVM_CAS_CASREFERENCE_H diff --git a/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h new file mode 100644 index 0000000..0df8163 --- /dev/null +++ b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h @@ -0,0 +1,110 @@ +//===- MappedFileRegionBumpPtr.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H +#define LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H + +#include "llvm/Support/Alignment.h" +#include "llvm/Support/FileSystem.h" +#include <atomic> + +namespace llvm::cas { + +/// Allocator for an owned mapped file region that supports thread-safe and +/// process-safe bump pointer allocation. 
+/// +/// This allocator is designed to create a sparse file when supported by the +/// filesystem's \c ftruncate so that it can be used with a large maximum size. +/// It will also attempt to shrink the underlying file down to its current +/// allocation size when the last concurrent mapping is closed. +/// +/// Process-safe. Uses file locks when resizing the file during initialization +/// and destruction. +/// +/// Thread-safe, assuming all threads use the same instance to talk to a given +/// file/mapping. Unsafe to have multiple instances talking to the same file +/// in the same process since file locks will misbehave. Clients should +/// coordinate (somehow). +/// +/// \note Currently we allocate the whole file without sparseness on Windows. +/// +/// Provides 8-byte alignment for all allocations. +class MappedFileRegionBumpPtr { +public: + using RegionT = sys::fs::mapped_file_region; + + /// Create a \c MappedFileRegionBumpPtr. + /// + /// \param Path the path to open the mapped region. + /// \param Capacity the maximum size for the mapped file region. + /// \param BumpPtrOffset the offset at which to store the bump pointer. + /// \param NewFileConstructor is for constructing new files. It has exclusive + /// access to the file. Must call \c initializeBumpPtr. + static Expected<MappedFileRegionBumpPtr> + create(const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset, + function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor); + + /// Finish initializing the bump pointer. Must be called by + /// \c NewFileConstructor. + void initializeBumpPtr(int64_t BumpPtrOffset); + + /// Minimum alignment for allocations, currently hardcoded to 8B. + static constexpr Align getAlign() { + // Trick Align into giving us '8' as a constexpr. + struct alignas(8) T {}; + static_assert(alignof(T) == 8, "Tautology failed?"); + return Align::Of<T>(); + } + + /// Allocate at least \p AllocSize. Rounds up to \a getAlign(). + char *allocate(uint64_t AllocSize) { + return data() + allocateOffset(AllocSize); + } + /// Allocate, returning the offset from \a data() instead of a pointer. 
+ int64_t allocateOffset(uint64_t AllocSize); + + char *data() const { return Region.data(); } + uint64_t size() const { return *BumpPtr; } + uint64_t capacity() const { return Region.size(); } + + RegionT &getRegion() { return Region; } + + ~MappedFileRegionBumpPtr() { destroyImpl(); } + + MappedFileRegionBumpPtr() = default; + MappedFileRegionBumpPtr(MappedFileRegionBumpPtr &&RHS) { moveImpl(RHS); } + MappedFileRegionBumpPtr &operator=(MappedFileRegionBumpPtr &&RHS) { + destroyImpl(); + moveImpl(RHS); + return *this; + } + + MappedFileRegionBumpPtr(const MappedFileRegionBumpPtr &) = delete; + MappedFileRegionBumpPtr &operator=(const MappedFileRegionBumpPtr &) = delete; + +private: + void destroyImpl(); + void moveImpl(MappedFileRegionBumpPtr &RHS) { + std::swap(Region, RHS.Region); + std::swap(BumpPtr, RHS.BumpPtr); + std::swap(Path, RHS.Path); + std::swap(FD, RHS.FD); + std::swap(SharedLockFD, RHS.SharedLockFD); + } + +private: + RegionT Region; + std::atomic<int64_t> *BumpPtr = nullptr; + std::string Path; + std::optional<int> FD; + std::optional<int> SharedLockFD; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h new file mode 100644 index 0000000..b4720c7 --- /dev/null +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -0,0 +1,302 @@ +//===- llvm/CAS/ObjectStore.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_OBJECTSTORE_H +#define LLVM_CAS_OBJECTSTORE_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/CASID.h" +#include "llvm/CAS/CASReference.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include <cstddef> + +namespace llvm { + +class MemoryBuffer; +template <typename T> class unique_function; + +namespace cas { + +class ObjectStore; +class ObjectProxy; + +/// Content-addressable storage for objects. +/// +/// Conceptually, objects are stored in a "unique set". +/// +/// - Objects are immutable ("value objects") that are defined by their +/// content. They are implicitly deduplicated by content. +/// - Each object has a unique identifier (UID) that's derived from its content, +/// called a \a CASID. +/// - This UID is a fixed-size (strong) hash of the transitive content of a +/// CAS object. +/// - It's comparable between any two CAS instances that have the same \a +/// CASIDContext::getHashSchemaIdentifier(). +/// - The UID can be printed (e.g., \a CASID::toString()) and it can parsed +/// by the same or a different CAS instance with \a +/// ObjectStore::parseID(). +/// - An object can be looked up by content or by UID. +/// - \a store() is "get-or-create" methods, writing an object if it +/// doesn't exist yet, and return a ref to it in any case. +/// - \a loadObject(const CASID&) looks up an object by its UID. +/// - Objects can reference other objects, forming an arbitrary DAG. +/// +/// The \a ObjectStore interface has a few ways of referencing objects: +/// +/// - \a ObjectRef encapsulates a reference to something in the CAS. It is an +/// opaque type that references an object inside a specific CAS. 
It is +/// implementation defined if the underlying object exists or not for an +/// ObjectRef, and it can used to speed up CAS lookup as an implementation +/// detail. However, you don't know anything about the underlying objects. +/// "Loading" the object is a separate step that may not have happened +/// yet, and which can fail (e.g. due to filesystem corruption) or introduce +/// latency (if downloading from a remote store). +/// - \a ObjectHandle encapulates a *loaded* object in the CAS. You need one of +/// these to inspect the content of an object: to look at its stored +/// data and references. This is internal to CAS implementation and not +/// availble from CAS public APIs. +/// - \a CASID: the UID for an object in the CAS, obtained through \a +/// ObjectStore::getID() or \a ObjectStore::parseID(). This is a valid CAS +/// identifier, but may reference an object that is unknown to this CAS +/// instance. +/// - \a ObjectProxy pairs an ObjectHandle (subclass) with a ObjectStore, and +/// wraps access APIs to avoid having to pass extra parameters. It is the +/// object used for accessing underlying data and refs by CAS users. +/// +/// Both ObjectRef and ObjectHandle are lightweight, wrapping a `uint64_t` and +/// are only valid with the associated ObjectStore instance. +/// +/// There are a few options for accessing content of objects, with different +/// lifetime tradeoffs: +/// +/// - \a getData() accesses data without exposing lifetime at all. +/// - \a getMemoryBuffer() returns a \a MemoryBuffer whose lifetime +/// is independent of the CAS (it can live longer). +/// - \a getDataString() return StringRef with lifetime is guaranteed to last as +/// long as \a ObjectStore. +/// - \a readRef() and \a forEachRef() iterate through the references in an +/// object. There is no lifetime assumption. +class ObjectStore { + friend class ObjectProxy; + void anchor(); + +public: + /// Get a \p CASID from a \p ID, which should have been generated by \a + /// CASID::print(). This succeeds as long as \a validateID() would pass. The + /// object may be unknown to this CAS instance. + /// + /// TODO: Remove, and update callers to use \a validateID() or \a + /// extractHashFromID(). + virtual Expected<CASID> parseID(StringRef ID) = 0; + + /// Store object into ObjectStore. + virtual Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) = 0; + /// Get an ID for \p Ref. + virtual CASID getID(ObjectRef Ref) const = 0; + + /// Get an existing reference to the object called \p ID. + /// + /// Returns \c None if the object is not stored in this CAS. + virtual std::optional<ObjectRef> getReference(const CASID &ID) const = 0; + + /// \returns true if the object is directly available from the local CAS, for + /// implementations that have this kind of distinction. + virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; + + /// Validate the underlying object referred by CASID. + virtual Error validate(const CASID &ID) = 0; + +protected: + /// Load the object referenced by \p Ref. + /// + /// Errors if the object cannot be loaded. + /// \returns \c std::nullopt if the object is missing from the CAS. + virtual Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) = 0; + + /// Like \c loadIfExists but returns an error if the object is missing. + Expected<ObjectHandle> load(ObjectRef Ref); + + /// Get the size of some data. + virtual uint64_t getDataSize(ObjectHandle Node) const = 0; + + /// Methods for handling objects. 
+ virtual Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const = 0; + virtual ObjectRef readRef(ObjectHandle Node, size_t I) const = 0; + virtual size_t getNumRefs(ObjectHandle Node) const = 0; + virtual ArrayRef<char> getData(ObjectHandle Node, + bool RequiresNullTerminator = false) const = 0; + + /// Get ObjectRef from open file. + virtual Expected<ObjectRef> + storeFromOpenFileImpl(sys::fs::file_t FD, + std::optional<sys::fs::file_status> Status); + + /// Get a lifetime-extended StringRef pointing at \p Data. + /// + /// Depending on the CAS implementation, this may involve in-memory storage + /// overhead. + StringRef getDataString(ObjectHandle Node) { + return toStringRef(getData(Node)); + } + + /// Get a lifetime-extended MemoryBuffer pointing at \p Data. + /// + /// Depending on the CAS implementation, this may involve in-memory storage + /// overhead. + std::unique_ptr<MemoryBuffer> + getMemoryBuffer(ObjectHandle Node, StringRef Name = "", + bool RequiresNullTerminator = true); + + /// Read all the refs from object in a SmallVector. + virtual void readRefs(ObjectHandle Node, + SmallVectorImpl<ObjectRef> &Refs) const; + + /// Allow ObjectStore implementations to create internal handles. +#define MAKE_CAS_HANDLE_CONSTRUCTOR(HandleKind) \ + HandleKind make##HandleKind(uint64_t InternalRef) const { \ + return HandleKind(*this, InternalRef); \ + } + MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectHandle) + MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectRef) +#undef MAKE_CAS_HANDLE_CONSTRUCTOR + +public: + /// Helper functions to store object and returns a ObjectProxy. + Expected<ObjectProxy> createProxy(ArrayRef<ObjectRef> Refs, StringRef Data); + + /// Store object from StringRef. + Expected<ObjectRef> storeFromString(ArrayRef<ObjectRef> Refs, + StringRef String) { + return store(Refs, arrayRefFromStringRef<char>(String)); + } + + /// Default implementation reads \p FD and calls \a storeNode(). Does not + /// take ownership of \p FD; the caller is responsible for closing it. + /// + /// If \p Status is sent in it is to be treated as a hint. Implementations + /// must protect against the file size potentially growing after the status + /// was taken (i.e., they cannot assume that an mmap will be null-terminated + /// where \p Status implies). + /// + /// Returns the \a CASID and the size of the file. + Expected<ObjectRef> + storeFromOpenFile(sys::fs::file_t FD, + std::optional<sys::fs::file_status> Status = std::nullopt) { + return storeFromOpenFileImpl(FD, Status); + } + + static Error createUnknownObjectError(const CASID &ID); + + /// Create ObjectProxy from CASID. If the object doesn't exist, get an error. + Expected<ObjectProxy> getProxy(const CASID &ID); + /// Create ObjectProxy from ObjectRef. If the object can't be loaded, get an + /// error. + Expected<ObjectProxy> getProxy(ObjectRef Ref); + + /// \returns \c std::nullopt if the object is missing from the CAS. + Expected<std::optional<ObjectProxy>> getProxyIfExists(ObjectRef Ref); + + /// Read the data from \p Data into \p OS. + uint64_t readData(ObjectHandle Node, raw_ostream &OS, uint64_t Offset = 0, + uint64_t MaxBytes = -1ULL) const { + ArrayRef<char> Data = getData(Node); + assert(Offset < Data.size() && "Expected valid offset"); + Data = Data.drop_front(Offset).take_front(MaxBytes); + OS << toStringRef(Data); + return Data.size(); + } + + /// Validate the whole node tree. + Error validateTree(ObjectRef Ref); + + /// Print the ObjectStore internals for debugging purpose. 
+ virtual void print(raw_ostream &) const {} + void dump() const; + + /// Get CASContext + const CASContext &getContext() const { return Context; } + + virtual ~ObjectStore() = default; + +protected: + ObjectStore(const CASContext &Context) : Context(Context) {} + +private: + const CASContext &Context; +}; + +/// Reference to an abstract hierarchical node, with data and references. +/// Reference is passed by value and is expected to be valid as long as the \a +/// ObjectStore is. +class ObjectProxy { +public: + const ObjectStore &getCAS() const { return *CAS; } + ObjectStore &getCAS() { return *CAS; } + CASID getID() const { return CAS->getID(Ref); } + ObjectRef getRef() const { return Ref; } + size_t getNumReferences() const { return CAS->getNumRefs(H); } + ObjectRef getReference(size_t I) const { return CAS->readRef(H, I); } + + operator CASID() const { return getID(); } + CASID getReferenceID(size_t I) const { + std::optional<CASID> ID = getCAS().getID(getReference(I)); + assert(ID && "Expected reference to be first-class object"); + return *ID; + } + + /// Visit each reference in order, returning an error from \p Callback to + /// stop early. + Error forEachReference(function_ref<Error(ObjectRef)> Callback) const { + return CAS->forEachRef(H, Callback); + } + + std::unique_ptr<MemoryBuffer> + getMemoryBuffer(StringRef Name = "", + bool RequiresNullTerminator = true) const; + + /// Get the content of the node. Valid as long as the CAS is valid. + StringRef getData() const { return CAS->getDataString(H); } + + friend bool operator==(const ObjectProxy &Proxy, ObjectRef Ref) { + return Proxy.getRef() == Ref; + } + friend bool operator==(ObjectRef Ref, const ObjectProxy &Proxy) { + return Proxy.getRef() == Ref; + } + friend bool operator!=(const ObjectProxy &Proxy, ObjectRef Ref) { + return !(Proxy.getRef() == Ref); + } + friend bool operator!=(ObjectRef Ref, const ObjectProxy &Proxy) { + return !(Proxy.getRef() == Ref); + } + +public: + ObjectProxy() = delete; + + static ObjectProxy load(ObjectStore &CAS, ObjectRef Ref, ObjectHandle Node) { + return ObjectProxy(CAS, Ref, Node); + } + +private: + ObjectProxy(ObjectStore &CAS, ObjectRef Ref, ObjectHandle H) + : CAS(&CAS), Ref(Ref), H(H) {} + + ObjectStore *CAS; + ObjectRef Ref; + ObjectHandle H; +}; + +std::unique_ptr<ObjectStore> createInMemoryCAS(); + +} // namespace cas +} // namespace llvm + +#endif // LLVM_CAS_OBJECTSTORE_H diff --git a/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h b/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h new file mode 100644 index 0000000..9fe911f --- /dev/null +++ b/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h @@ -0,0 +1,375 @@ +//===- OnDiskHashMappedTrie.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_ONDISKHASHMAPPEDTRIE_H +#define LLVM_CAS_ONDISKHASHMAPPEDTRIE_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include <optional> + +namespace llvm { + +class raw_ostream; + +namespace cas { + +class FileOffset { +public: + int64_t get() const { return Offset; } + + explicit operator bool() const { return Offset; } + + FileOffset() = default; + explicit FileOffset(int64_t Offset) : Offset(Offset) { assert(Offset >= 0); } + +private: + int64_t Offset = 0; +}; + +/// On-disk hash-mapped trie. Thread-safe / lock-free. +/// +/// This is an on-disk, thread-safe key-value store that is (mostly) lock-free. +/// The keys are fixed length, and are expected to be binary hashes with a +/// normal distribution. +/// +/// - Thread-safety is achieved through the use of atomics within a shared +/// memory mapping. Atomic access does not work on networked filesystems. +/// - Filesystem locks are used, but only sparingly: +/// - during initialization, for creating / opening an existing store; +/// - for the lifetime of the instance, a shared/reader lock is held +/// - during destruction, if there are no concurrent readers, to shrink the +/// files to their minimum size. +/// - Path is used as a directory: +/// - "index" stores the root trie and subtries. +/// - "data" stores (most of) the entries, like a bump-ptr-allocator. +/// - Large entries are stored externally in a file named by the key. +/// - Code is system-dependent (Windows not yet implemented), and binary format +/// itself is not portable. These are not artifacts that can/should be moved +/// between different systems; they are only appropriate for local storage. +class OnDiskHashMappedTrie { +public: + LLVM_DUMP_METHOD void dump() const; + void + print(raw_ostream &OS, + function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const; + +public: + struct ConstValueProxy { + ConstValueProxy() = default; + ConstValueProxy(ArrayRef<uint8_t> Hash, ArrayRef<char> Data) + : Hash(Hash), Data(Data) {} + ConstValueProxy(ArrayRef<uint8_t> Hash, StringRef Data) + : Hash(Hash), Data(Data.begin(), Data.size()) {} + + ArrayRef<uint8_t> Hash; + ArrayRef<char> Data; + }; + + struct ValueProxy { + operator ConstValueProxy() const { return ConstValueProxy(Hash, Data); } + + ValueProxy() = default; + ValueProxy(ArrayRef<uint8_t> Hash, MutableArrayRef<char> Data) + : Hash(Hash), Data(Data) {} + + ArrayRef<uint8_t> Hash; + MutableArrayRef<char> Data; + }; + +private: + struct HintT { + explicit operator ValueProxy() const { + ValueProxy Value; + Value.Data = MutableArrayRef<char>( + const_cast<char *>(reinterpret_cast<const char *>(P)), I); + Value.Hash = ArrayRef<uint8_t>(nullptr, B); + return Value; + } + + explicit HintT(ConstValueProxy Value) + : P(Value.Data.data()), I(Value.Data.size()), B(Value.Hash.size()) { + // Spot-check that this really was a hint. 
+ assert(Value.Data.size() <= UINT16_MAX); + assert(Value.Hash.size() <= UINT16_MAX); + assert(Value.Hash.data() == nullptr); + } + + HintT(const void *P, uint16_t I, uint16_t B) : P(P), I(I), B(B) {} + + const void *P = nullptr; + uint16_t I = 0; + uint16_t B = 0; + }; + +public: + template <class ProxyT> class PointerImpl { + public: + FileOffset getOffset() const { + return FileOffset(OffsetLow32 | (uint64_t)OffsetHigh16 << 32); + } + + explicit operator bool() const { return IsValue; } + + const ProxyT &operator*() const { + assert(IsValue); + return ValueOrHint; + } + const ProxyT *operator->() const { + assert(IsValue); + return &ValueOrHint; + } + + PointerImpl() = default; + + protected: + PointerImpl(FileOffset Offset, ProxyT Value) + : PointerImpl(Value, Offset, /*IsHint=*/false, /*IsValue=*/true) {} + + explicit PointerImpl(FileOffset Offset, HintT H) + : PointerImpl(ValueProxy(H), Offset, /*IsHint=*/true, + /*IsValue=*/false) {} + + PointerImpl(ProxyT ValueOrHint, FileOffset Offset, bool IsHint, + bool IsValue) + : ValueOrHint(ValueOrHint), OffsetLow32((uint64_t)Offset.get()), + OffsetHigh16((uint64_t)Offset.get() >> 32), IsHint(IsHint), + IsValue(IsValue) { + checkOffset(Offset); + } + + static void checkOffset(FileOffset Offset) { + assert(Offset.get() > 0); + assert((uint64_t)Offset.get() < (1LL << 48)); + } + + std::optional<HintT> getHint(const OnDiskHashMappedTrie &This) const { + if (!IsHint) + return std::nullopt; + HintT H(ValueOrHint); + assert(H.P == &This && "Expected hint to be for This"); + if (H.P != &This) + return std::nullopt; + return H; + } + + ProxyT ValueOrHint; + uint32_t OffsetLow32 = 0; + uint16_t OffsetHigh16 = 0; + bool IsHint = false; + bool IsValue = false; + }; + + class pointer; + class const_pointer : public PointerImpl<ConstValueProxy> { + public: + const_pointer() = default; + + private: + friend class pointer; + friend class OnDiskHashMappedTrie; + using const_pointer::PointerImpl::PointerImpl; + }; + + class pointer : public PointerImpl<ValueProxy> { + public: + operator const_pointer() const { + return const_pointer(ValueOrHint, getOffset(), IsHint, IsValue); + } + + pointer() = default; + + private: + friend class OnDiskHashMappedTrie; + using pointer::PointerImpl::PointerImpl; + }; + + pointer getMutablePointer(const_pointer CP) { + if (std::optional<HintT> H = CP.getHint(*this)) + return pointer(CP.getOffset(), *H); + if (!CP) + return pointer(); + ValueProxy V{CP->Hash, MutableArrayRef(const_cast<char *>(CP->Data.data()), + CP->Data.size())}; + return pointer(CP.getOffset(), V); + } + + const_pointer find(ArrayRef<uint8_t> Hash) const; + pointer find(ArrayRef<uint8_t> Hash) { + return getMutablePointer( + const_cast<const OnDiskHashMappedTrie *>(this)->find(Hash)); + } + + const_pointer recoverFromHashPointer(const uint8_t *HashBegin) const; + pointer recoverFromHashPointer(const uint8_t *HashBegin) { + return getMutablePointer( + const_cast<const OnDiskHashMappedTrie *>(this)->recoverFromHashPointer( + HashBegin)); + } + + const_pointer recoverFromFileOffset(FileOffset Offset) const; + pointer recoverFromFileOffset(FileOffset Offset) { + return getMutablePointer( + const_cast<const OnDiskHashMappedTrie *>(this)->recoverFromFileOffset( + Offset)); + } + + using LazyInsertOnConstructCB = + function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>; + using LazyInsertOnLeakCB = + function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue, + FileOffset FinalOffset, ValueProxy FinalValue)>; + + /// Insert lazily. 
+ /// + /// \p OnConstruct is called when ready to insert a value, after allocating + /// space for the data. It is called at most once. + /// + /// \p OnLeak is called only if \p OnConstruct has been called and a race + /// occurred before insertion, causing the tentative offset and data to be + /// abandoned. This allows clients to clean up other results or update any + /// references. + /// + /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success. + /// The in-memory \a HashMappedTrie uses LazyAtomicPointer to synchronize + /// simultaneous writes, but that seems dangerous to use in a memory-mapped + /// file in case a process crashes in the busy state. + pointer insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash, + LazyInsertOnConstructCB OnConstruct = nullptr, + LazyInsertOnLeakCB OnLeak = nullptr); + pointer insertLazy(ArrayRef<uint8_t> Hash, + LazyInsertOnConstructCB OnConstruct = nullptr, + LazyInsertOnLeakCB OnLeak = nullptr) { + return insertLazy(const_pointer(), Hash, OnConstruct, OnLeak); + } + + pointer insert(const_pointer Hint, const ConstValueProxy &Value) { + return insertLazy(Hint, Value.Hash, [&](FileOffset, ValueProxy Allocated) { + assert(Allocated.Hash == Value.Hash); + assert(Allocated.Data.size() == Value.Data.size()); + llvm::copy(Value.Data, Allocated.Data.begin()); + }); + } + pointer insert(const ConstValueProxy &Value) { + return insert(const_pointer(), Value); + } + + size_t size() const; + + /// Gets or creates a file at \p Path with a hash-mapped trie named \p + /// TrieName. The hash size is \p NumHashBits (in bits) and the records store + /// data of size \p DataSize (in bytes). + /// + /// \p MaxFileSize controls the maximum file size to support, limiting the + /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting + /// size if a new file is created. + /// + /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to + /// configure the trie, if it doesn't already exist. + /// + /// \pre NumHashBits is a multiple of 8 (byte-aligned). + static Expected<OnDiskHashMappedTrie> + create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, + uint64_t DataSize, uint64_t MaxFileSize, + std::optional<uint64_t> NewFileInitialSize, + std::optional<size_t> NewTableNumRootBits = std::nullopt, + std::optional<size_t> NewTableNumSubtrieBits = std::nullopt); + + OnDiskHashMappedTrie(OnDiskHashMappedTrie &&RHS); + OnDiskHashMappedTrie &operator=(OnDiskHashMappedTrie &&RHS); + ~OnDiskHashMappedTrie(); + +private: + struct ImplType; + explicit OnDiskHashMappedTrie(std::unique_ptr<ImplType> Impl); + std::unique_ptr<ImplType> Impl; +}; + +/// Sink for data. Stores variable length data with 8-byte alignment. Does not +/// track size of data, which is assumed to known from context, or embedded. +/// Uses 0-padding but does not guarantee 0-termination. +class OnDiskDataAllocator { +public: + using ValueProxy = MutableArrayRef<char>; + + /// An iterator-like return value for data insertion. Maybe it should be + /// called \c iterator, but it has no increment. 
+ class pointer { + public: + FileOffset getOffset() const { return Offset; } + explicit operator bool() const { return bool(getOffset()); } + const ValueProxy &operator*() const { + assert(Offset && "Null dereference"); + return Value; + } + const ValueProxy *operator->() const { + assert(Offset && "Null dereference"); + return &Value; + } + + pointer() = default; + + private: + friend class OnDiskDataAllocator; + pointer(FileOffset Offset, ValueProxy Value) + : Offset(Offset), Value(Value) {} + FileOffset Offset; + ValueProxy Value; + }; + + // Look up the data stored at the given offset. + const char *beginData(FileOffset Offset) const; + char *beginData(FileOffset Offset) { + return const_cast<char *>( + const_cast<const OnDiskDataAllocator *>(this)->beginData(Offset)); + } + + pointer allocate(size_t Size); + pointer save(ArrayRef<char> Data) { + pointer P = allocate(Data.size()); + llvm::copy(Data, P->begin()); + return P; + } + pointer save(StringRef Data) { + return save(ArrayRef<char>(Data.begin(), Data.size())); + } + + /// \returns the buffer that was allocated at \p create time, with size + /// \p UserHeaderSize. + MutableArrayRef<uint8_t> getUserHeader(); + + size_t size() const; + + static Expected<OnDiskDataAllocator> + create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize, + std::optional<uint64_t> NewFileInitialSize, + uint32_t UserHeaderSize = 0, + function_ref<void(void *)> UserHeaderInit = nullptr); + + OnDiskDataAllocator(OnDiskDataAllocator &&RHS); + OnDiskDataAllocator &operator=(OnDiskDataAllocator &&RHS); + + // No copy. Just call \a create() again. + OnDiskDataAllocator(const OnDiskDataAllocator &) = delete; + OnDiskDataAllocator &operator=(const OnDiskDataAllocator &) = delete; + + ~OnDiskDataAllocator(); + +private: + struct ImplType; + explicit OnDiskDataAllocator(std::unique_ptr<ImplType> Impl); + std::unique_ptr<ImplType> Impl; +}; + +} // namespace cas +} // namespace llvm + +#endif // LLVM_CAS_ONDISKHASHMAPPEDTRIE_H diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index 9cf5336..38ad0e7 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -1184,12 +1184,16 @@ openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None, /// descriptor. std::error_code tryLockFile(int FD, - std::chrono::milliseconds Timeout = std::chrono::milliseconds(0)); + std::chrono::milliseconds Timeout = std::chrono::milliseconds(0), + bool Exclusive = true); /// Lock the file. /// /// This function acts as @ref tryLockFile but it waits infinitely. -std::error_code lockFile(int FD); +/// \param FD file descriptor to use for locking. +/// \param Exclusive if \p true use exclusive/writer lock, otherwise use +/// shared/reader lock. +std::error_code lockFile(int FD, bool Exclusive = true); /// Unlock the file. 
/// diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap index b00da6d..d44d395 100644 --- a/llvm/include/module.modulemap +++ b/llvm/include/module.modulemap @@ -105,6 +105,12 @@ module LLVM_BinaryFormat { textual header "llvm/BinaryFormat/MsgPack.def" } +module LLVM_CAS { + requires cplusplus + umbrella "llvm/CAS" + module * { export * } +} + module LLVM_Config { requires cplusplus umbrella "llvm/Config" diff --git a/llvm/lib/CAS/ActionCache.cpp b/llvm/lib/CAS/ActionCache.cpp new file mode 100644 index 0000000..851fee5 --- /dev/null +++ b/llvm/lib/CAS/ActionCache.cpp @@ -0,0 +1,22 @@ +//===- ActionCache.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/CASID.h" +#include "llvm/CAS/ObjectStore.h" + +using namespace llvm; +using namespace llvm::cas; + +void ActionCache::anchor() {} + +CacheKey::CacheKey(const CASID &ID) : Key(toStringRef(ID.getHash()).str()) {} +CacheKey::CacheKey(const ObjectProxy &Proxy) + : CacheKey(Proxy.getCAS(), Proxy.getRef()) {} +CacheKey::CacheKey(const ObjectStore &CAS, const ObjectRef &Ref) + : Key(toStringRef(CAS.getID(Ref).getHash())) {} diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp new file mode 100644 index 0000000..4fd5499 --- /dev/null +++ b/llvm/lib/CAS/ActionCaches.cpp @@ -0,0 +1,99 @@ +//===- ActionCaches.cpp -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/TrieRawHashMap.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/Support/BLAKE3.h" + +#define DEBUG_TYPE "action-caches" + +using namespace llvm; +using namespace llvm::cas; + +namespace { + +using HasherT = BLAKE3; +using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>())); + +template <size_t Size> class CacheEntry { +public: + CacheEntry() = default; + CacheEntry(ArrayRef<uint8_t> Hash) { llvm::copy(Hash, Value.data()); } + CacheEntry(const CacheEntry &Entry) { llvm::copy(Entry.Value, Value.data()); } + ArrayRef<uint8_t> getValue() const { return Value; } + +private: + std::array<uint8_t, Size> Value; +}; + +class InMemoryActionCache final : public ActionCache { +public: + InMemoryActionCache() + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()) {} + + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool Globally) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool Globally) const final; + +private: + using DataT = CacheEntry<sizeof(HashType)>; + using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>; + + InMemoryCacheT Cache; +}; +} // end namespace + +static std::string hashToString(ArrayRef<uint8_t> Hash) { + SmallString<64> Str; + toHex(Hash, /*LowerCase=*/true, Str); + return Str.str().str(); +} + +static Error createResultCachePoisonedError(StringRef Key, + const CASContext &Context, + CASID Output, + ArrayRef<uint8_t> ExistingOutput) { + std::string Existing = + CASID::create(&Context, toStringRef(ExistingOutput)).toString(); + return createStringError(std::make_error_code(std::errc::invalid_argument), + "cache poisoned for '" + Key + "' (new='" + + Output.toString() + "' vs. existing '" + + Existing + "')"); +} + +Expected<std::optional<CASID>> +InMemoryActionCache::getImpl(ArrayRef<uint8_t> Key, bool /*Globally*/) const { + auto Result = Cache.find(Key); + if (!Result) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(Result->Data.getValue())); +} + +Error InMemoryActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*Globally*/) { + DataT Expected(Result.getHash()); + const InMemoryCacheT::value_type &Cached = *Cache.insertLazy( + Key, [&](auto ValueConstructor) { ValueConstructor.emplace(Expected); }); + + const DataT &Observed = Cached.Data; + if (Expected.getValue() == Observed.getValue()) + return Error::success(); + + return createResultCachePoisonedError(hashToString(Key), getContext(), Result, + Observed.getValue()); +} + +namespace llvm::cas { + +std::unique_ptr<ActionCache> createInMemoryActionCache() { + return std::make_unique<InMemoryActionCache>(); +} + +} // namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp new file mode 100644 index 0000000..73646ad --- /dev/null +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -0,0 +1,94 @@ +//===- BuiltinCAS.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/Support/Process.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +static StringRef getCASIDPrefix() { return "llvmcas://"; } +void BuiltinCASContext::anchor() {} + +Expected<HashType> BuiltinCASContext::parseID(StringRef Reference) { + if (!Reference.consume_front(getCASIDPrefix())) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "invalid cas-id '" + Reference + "'"); + + // FIXME: Allow shortened references? + if (Reference.size() != 2 * sizeof(HashType)) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "wrong size for cas-id hash '" + Reference + "'"); + + std::string Binary; + if (!tryGetFromHex(Reference, Binary)) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "invalid hash in cas-id '" + Reference + "'"); + + assert(Binary.size() == sizeof(HashType)); + HashType Digest; + llvm::copy(Binary, Digest.data()); + return Digest; +} + +Expected<CASID> BuiltinCAS::parseID(StringRef Reference) { + Expected<HashType> Digest = BuiltinCASContext::parseID(Reference); + if (!Digest) + return Digest.takeError(); + + return CASID::create(&getContext(), toStringRef(*Digest)); +} + +void BuiltinCASContext::printID(ArrayRef<uint8_t> Digest, raw_ostream &OS) { + SmallString<64> Hash; + toHex(Digest, /*LowerCase=*/true, Hash); + OS << getCASIDPrefix() << Hash; +} + +void BuiltinCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const { + BuiltinCASContext::printID(ID.getHash(), OS); +} + +const BuiltinCASContext &BuiltinCASContext::getDefaultContext() { + static BuiltinCASContext DefaultContext; + return DefaultContext; +} + +Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + return storeImpl(BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data), + Refs, Data); +} + +Error BuiltinCAS::validate(const CASID &ID) { + auto Ref = getReference(ID); + if (!Ref) + return createUnknownObjectError(ID); + + auto Handle = load(*Ref); + if (!Handle) + return Handle.takeError(); + + auto Proxy = ObjectProxy::load(*this, *Ref, *Handle); + SmallVector<ObjectRef> Refs; + if (auto E = Proxy.forEachReference([&](ObjectRef Ref) -> Error { + Refs.push_back(Ref); + return Error::success(); + })) + return E; + + ArrayRef<char> Data(Proxy.getData().data(), Proxy.getData().size()); + auto Hash = BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data); + if (!ID.getHash().equals(Hash)) + return createCorruptObjectError(ID); + + return Error::success(); +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h new file mode 100644 index 0000000..1a4f640 --- /dev/null +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -0,0 +1,74 @@ +//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CAS_BUILTINCAS_H +#define LLVM_LIB_CAS_BUILTINCAS_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/ObjectStore.h" + +namespace llvm::cas { +class ActionCache; +namespace builtin { + +class BuiltinCAS : public ObjectStore { +public: + BuiltinCAS() : ObjectStore(BuiltinCASContext::getDefaultContext()) {} + + Expected<CASID> parseID(StringRef Reference) final; + + Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + virtual Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) = 0; + + virtual Expected<ObjectRef> + storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) { + return storeImpl(ComputedHash, std::nullopt, + ArrayRef(Map.data(), Map.size())); + } + + /// Both builtin CAS implementations provide lifetime for free, so this can + /// be const, and readData() and getDataSize() can be implemented on top of + /// it. + virtual ArrayRef<char> getDataConst(ObjectHandle Node) const = 0; + + ArrayRef<char> getData(ObjectHandle Node, + bool RequiresNullTerminator) const final { + // BuiltinCAS Objects are always null terminated. + return getDataConst(Node); + } + uint64_t getDataSize(ObjectHandle Node) const final { + return getDataConst(Node).size(); + } + + Error createUnknownObjectError(const CASID &ID) const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "unknown object '" + ID.toString() + "'"); + } + + Error createCorruptObjectError(const CASID &ID) const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "corrupt object '" + ID.toString() + "'"); + } + + Error createCorruptStorageError() const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "corrupt storage"); + } + + Error validate(const CASID &ID) final; +}; + +} // end namespace builtin +} // end namespace llvm::cas + +#endif // LLVM_LIB_CAS_BUILTINCAS_H diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt new file mode 100644 index 0000000..7766a9b --- /dev/null +++ b/llvm/lib/CAS/CMakeLists.txt @@ -0,0 +1,17 @@ +if (LLVM_ENABLE_ONDISK_CAS) + add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1) +endif() + +add_llvm_component_library(LLVMCAS + ActionCache.cpp + ActionCaches.cpp + BuiltinCAS.cpp + InMemoryCAS.cpp + MappedFileRegionBumpPtr.cpp + ObjectStore.cpp + OnDiskCommon.cpp + OnDiskHashMappedTrie.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS +) diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp new file mode 100644 index 0000000..abdd7ed --- /dev/null +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -0,0 +1,320 @@ +//===- InMemoryCAS.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/LazyAtomicPointer.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/TrieRawHashMap.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ThreadSafeAllocator.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class InMemoryObject; + +/// Index of referenced IDs (map: Hash -> InMemoryObject*). Uses +/// LazyAtomicPointer to coordinate creation of objects. +using InMemoryIndexT = + ThreadSafeTrieRawHashMap<LazyAtomicPointer<const InMemoryObject>, + sizeof(HashType)>; + +/// Values in \a InMemoryIndexT. \a InMemoryObject's point at this to access +/// their hash. +using InMemoryIndexValueT = InMemoryIndexT::value_type; + +class InMemoryObject { +public: + enum class Kind { + /// Node with refs and data. + RefNode, + + /// Node with refs and data co-allocated. + InlineNode, + + Max = InlineNode, + }; + + Kind getKind() const { return IndexAndKind.getInt(); } + const InMemoryIndexValueT &getIndex() const { + assert(IndexAndKind.getPointer()); + return *IndexAndKind.getPointer(); + } + + ArrayRef<uint8_t> getHash() const { return getIndex().Hash; } + + InMemoryObject() = delete; + InMemoryObject(InMemoryObject &&) = delete; + InMemoryObject(const InMemoryObject &) = delete; + +protected: + InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {} + +private: + enum Counts : int { + NumKindBits = 2, + }; + PointerIntPair<const InMemoryIndexValueT *, NumKindBits, Kind> IndexAndKind; + static_assert((1U << NumKindBits) <= alignof(InMemoryIndexValueT), + "Kind will clobber pointer"); + static_assert(((int)Kind::Max >> NumKindBits) == 0, "Kind will be truncated"); + +public: + inline ArrayRef<char> getData() const; + + inline ArrayRef<const InMemoryObject *> getRefs() const; +}; + +class InMemoryRefObject : public InMemoryObject { +public: + static constexpr Kind KindValue = Kind::RefNode; + static bool classof(const InMemoryObject *O) { + return O->getKind() == KindValue; + } + + ArrayRef<const InMemoryObject *> getRefsImpl() const { return Refs; } + ArrayRef<const InMemoryObject *> getRefs() const { return Refs; } + ArrayRef<char> getDataImpl() const { return Data; } + ArrayRef<char> getData() const { return Data; } + + static InMemoryRefObject &create(function_ref<void *(size_t Size)> Allocate, + const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) { + void *Mem = Allocate(sizeof(InMemoryRefObject)); + return *new (Mem) InMemoryRefObject(I, Refs, Data); + } + +private: + InMemoryRefObject(const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, ArrayRef<char> Data) + : InMemoryObject(KindValue, I), Refs(Refs), Data(Data) { + assert(isAddrAligned(Align(8), this) && "Expected 8-byte alignment"); + assert(isAddrAligned(Align(8), Data.data()) && "Expected 8-byte alignment"); + assert(*Data.end() == 0 && "Expected null-termination"); + } + + ArrayRef<const InMemoryObject *> Refs; + ArrayRef<char> Data; +}; + +class InMemoryInlineObject : public InMemoryObject { +public: + static constexpr Kind KindValue = Kind::InlineNode; + static bool classof(const InMemoryObject *O) { + return O->getKind() == KindValue; + } + + ArrayRef<const InMemoryObject *> getRefs() const { return getRefsImpl(); } + ArrayRef<const InMemoryObject *> 
getRefsImpl() const { + return ArrayRef(reinterpret_cast<const InMemoryObject *const *>(this + 1), + NumRefs); + } + + ArrayRef<char> getData() const { return getDataImpl(); } + ArrayRef<char> getDataImpl() const { + ArrayRef<const InMemoryObject *> Refs = getRefs(); + return ArrayRef(reinterpret_cast<const char *>(Refs.data() + Refs.size()), + DataSize); + } + + static InMemoryInlineObject & + create(function_ref<void *(size_t Size)> Allocate, + const InMemoryIndexValueT &I, ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) { + void *Mem = Allocate(sizeof(InMemoryInlineObject) + + sizeof(uintptr_t) * Refs.size() + Data.size() + 1); + return *new (Mem) InMemoryInlineObject(I, Refs, Data); + } + +private: + InMemoryInlineObject(const InMemoryIndexValueT &I, + ArrayRef<const InMemoryObject *> Refs, + ArrayRef<char> Data) + : InMemoryObject(KindValue, I), NumRefs(Refs.size()), + DataSize(Data.size()) { + auto *BeginRefs = reinterpret_cast<const InMemoryObject **>(this + 1); + llvm::copy(Refs, BeginRefs); + auto *BeginData = reinterpret_cast<char *>(BeginRefs + NumRefs); + llvm::copy(Data, BeginData); + BeginData[Data.size()] = 0; + } + uint32_t NumRefs; + uint32_t DataSize; +}; + +/// In-memory CAS database and action cache (the latter should be separated). +class InMemoryCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<ObjectRef> + storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) override; + + CASID getID(const InMemoryIndexValueT &I) const { + StringRef Hash = toStringRef(I.Hash); + return CASID::create(&getContext(), Hash); + } + CASID getID(const InMemoryObject &O) const { return getID(O.getIndex()); } + + ObjectHandle getObjectHandle(const InMemoryObject &Node) const { + assert(!(reinterpret_cast<uintptr_t>(&Node) & 0x1ULL)); + return makeObjectHandle(reinterpret_cast<uintptr_t>(&Node)); + } + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) override { + return getObjectHandle(asInMemoryObject(Ref)); + } + + InMemoryIndexValueT &indexHash(ArrayRef<uint8_t> Hash) { + return *Index.insertLazy( + Hash, [](auto ValueConstructor) { ValueConstructor.emplace(nullptr); }); + } + + /// TODO: Consider callers to actually do an insert and to return a handle to + /// the slot in the trie. 
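
The indexHash() helper above implements a claim-then-publish pattern: first reserve a slot for the hash, then publish the object through LazyAtomicPointer. Below is a self-contained sketch of the same pattern, not part of the patch; the "Widget" payload and the 32-byte hash width are hypothetical.

```
// Sketch of the claim-then-publish pattern used by InMemoryCAS above.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/LazyAtomicPointer.h"
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/TrieRawHashMap.h"

using namespace llvm;

struct Widget {
  int Value;
};

// Hash -> lazily created Widget, mirroring InMemoryCAS::indexHash().
using WidgetIndexT =
    ThreadSafeTrieRawHashMap<LazyAtomicPointer<const Widget>,
                             /*NumHashBytes=*/32>;

static const Widget &getOrCreate(WidgetIndexT &Index, ArrayRef<uint8_t> Hash,
                                 function_ref<const Widget *()> Create) {
  // Phase 1: claim the slot for this hash. Racing callers all end up with the
  // same slot, initialized to a null Widget pointer.
  auto &Entry = *Index.insertLazy(
      Hash, [](auto ValueConstructor) { ValueConstructor.emplace(nullptr); });

  // Phase 2: publish the object. LazyAtomicPointer coordinates racing writers
  // so everyone observes a single winning Widget. Create() must return a
  // pointer that outlives the index (the CAS above uses a bump allocator).
  return Entry.Data.loadOrGenerate(Create);
}
```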
+ const InMemoryObject *getInMemoryObject(CASID ID) const { + assert(ID.getContext().getHashSchemaIdentifier() == + getContext().getHashSchemaIdentifier() && + "Expected ID from same hash schema"); + if (InMemoryIndexT::const_pointer P = Index.find(ID.getHash())) + return P->Data; + return nullptr; + } + + const InMemoryObject &getInMemoryObject(ObjectHandle OH) const { + return *reinterpret_cast<const InMemoryObject *>( + (uintptr_t)OH.getInternalRef(*this)); + } + + const InMemoryObject &asInMemoryObject(ReferenceBase Ref) const { + uintptr_t P = Ref.getInternalRef(*this); + return *reinterpret_cast<const InMemoryObject *>(P); + } + ObjectRef toReference(const InMemoryObject &O) const { + return makeObjectRef(reinterpret_cast<uintptr_t>(&O)); + } + + CASID getID(ObjectRef Ref) const final { return getIDImpl(Ref); } + CASID getIDImpl(ReferenceBase Ref) const { + return getID(asInMemoryObject(Ref)); + } + + std::optional<ObjectRef> getReference(const CASID &ID) const final { + if (const InMemoryObject *Object = getInMemoryObject(ID)) + return toReference(*Object); + return std::nullopt; + } + + Expected<bool> isMaterialized(ObjectRef Ref) const final { return true; } + + ArrayRef<char> getDataConst(ObjectHandle Node) const final { + return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); + } + + InMemoryCAS() = default; + +private: + size_t getNumRefs(ObjectHandle Node) const final { + return getInMemoryObject(Node).getRefs().size(); + } + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + return toReference(*getInMemoryObject(Node).getRefs()[I]); + } + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + /// Index of referenced IDs (map: Hash -> InMemoryObject*). Mapped to nullptr + /// as a convenient way to store hashes. + /// + /// - Insert nullptr on lookups. + /// - InMemoryObject points back to here. + InMemoryIndexT Index; + + ThreadSafeAllocator<BumpPtrAllocator> Objects; + ThreadSafeAllocator<SpecificBumpPtrAllocator<sys::fs::mapped_file_region>> + MemoryMaps; +}; + +} // end anonymous namespace + +ArrayRef<char> InMemoryObject::getData() const { + if (auto *Derived = dyn_cast<InMemoryRefObject>(this)) + return Derived->getDataImpl(); + return cast<InMemoryInlineObject>(this)->getDataImpl(); +} + +ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { + if (auto *Derived = dyn_cast<InMemoryRefObject>(this)) + return Derived->getRefsImpl(); + return cast<InMemoryInlineObject>(this)->getRefsImpl(); +} + +Expected<ObjectRef> +InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, + sys::fs::mapped_file_region Map) { + // Look up the hash in the index, initializing to nullptr if it's new. + ArrayRef<char> Data(Map.data(), Map.size()); + auto &I = indexHash(ComputedHash); + + // Load or generate. + auto Allocator = [&](size_t Size) -> void * { + return Objects.Allocate(Size, alignof(InMemoryObject)); + }; + auto Generator = [&]() -> const InMemoryObject * { + return &InMemoryRefObject::create(Allocator, I, std::nullopt, Data); + }; + const InMemoryObject &Node = + cast<InMemoryObject>(I.Data.loadOrGenerate(Generator)); + + // Save Map if the winning node uses it. 
+ if (auto *RefNode = dyn_cast<InMemoryRefObject>(&Node)) + if (RefNode->getData().data() == Map.data()) + new (MemoryMaps.Allocate(1)) sys::fs::mapped_file_region(std::move(Map)); + + return toReference(Node); +} + +Expected<ObjectRef> InMemoryCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + // Look up the hash in the index, initializing to nullptr if it's new. + auto &I = indexHash(ComputedHash); + + // Create the node. + SmallVector<const InMemoryObject *> InternalRefs; + for (ObjectRef Ref : Refs) + InternalRefs.push_back(&asInMemoryObject(Ref)); + auto Allocator = [&](size_t Size) -> void * { + return Objects.Allocate(Size, alignof(InMemoryObject)); + }; + auto Generator = [&]() -> const InMemoryObject * { + return &InMemoryInlineObject::create(Allocator, I, InternalRefs, Data); + }; + return toReference(cast<InMemoryObject>(I.Data.loadOrGenerate(Generator))); +} + +Error InMemoryCAS::forEachRef(ObjectHandle Handle, + function_ref<Error(ObjectRef)> Callback) const { + auto &Node = getInMemoryObject(Handle); + for (const InMemoryObject *Ref : Node.getRefs()) + if (Error E = Callback(toReference(*Ref))) + return E; + return Error::success(); +} + +std::unique_ptr<ObjectStore> cas::createInMemoryCAS() { + return std::make_unique<InMemoryCAS>(); +} diff --git a/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp new file mode 100644 index 0000000..2222644 --- /dev/null +++ b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp @@ -0,0 +1,233 @@ +//===- MappedFileRegionBumpPtr.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A bump pointer allocator, backed by a memory-mapped file. +/// +/// The effect we want is: +/// +/// 1. If it doesn't exist, create the file with an initial size. +/// 2. Reserve virtual memory large enough for the max file size. +/// 3. Map the file into memory in the reserved region. +/// 4. Increase the file size and update the mapping when necessary. +/// +/// However, updating the mapping is challenging when it needs to work portably, +/// and across multiple processes without locking for every read. Our current +/// implementation strategy is: +/// +/// 1. Use \c ftruncate (\c sys::fs::resize_file) to grow the file to its max +/// size (typically several GB). Many modern filesystems will create a sparse +/// file, so that the trailing unused pages do not take space on disk. +/// 2. Call \c mmap (\c sys::fs::mapped_file_region) +/// 3. [Automatic as part of 2.] +/// 4. [Automatic as part of 2.] +/// +/// Additionally, we attempt to resize the file to its actual data size when +/// closing the mapping, if this is the only concurrent instance. This is done +/// using file locks. Shrinking the file mitigates problems with having large +/// files: on filesystems without sparse files it avoids unnecessary space use; +/// it also avoids allocating the full size if another process copies the file, +/// which typically loses sparseness. These mitigations only work while the file +/// is not in use. +/// +/// FIXME: we assume that all concurrent users of the file will use the same +/// value for Capacity. 
Otherwise a process with a larger capacity can write +/// data that is "out of bounds" for processes with smaller capacity. Currently +/// this is true in the CAS. +/// +/// To support resizing, we use two separate file locks: +/// 1. We use a shared reader lock on a ".shared" file until destruction. +/// 2. We use a lock on the main file during initialization - shared to check +/// the status, upgraded to exclusive to resize/initialize the file. +/// +/// Then during destruction we attempt to get exclusive access on (1), which +/// requires no concurrent readers. If so, we shrink the file. Using two +/// separate locks simplifies the implementation and enables it to work on +/// platforms (e.g. Windows) where a shared/reader lock prevents writing. +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/MappedFileRegionBumpPtr.h" +#include "OnDiskCommon.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +namespace { +struct FileLockRAII { + std::string Path; + int FD; + enum LockKind { Shared, Exclusive }; + std::optional<LockKind> Locked; + + FileLockRAII(std::string Path, int FD) : Path(std::move(Path)), FD(FD) {} + ~FileLockRAII() { consumeError(unlock()); } + + Error lock(LockKind LK) { + if (std::error_code EC = lockFileThreadSafe(FD, LK == Exclusive)) + return createFileError(Path, EC); + Locked = LK; + return Error::success(); + } + + Error unlock() { + if (Locked) { + Locked = std::nullopt; + if (std::error_code EC = unlockFileThreadSafe(FD)) + return createFileError(Path, EC); + } + return Error::success(); + } +}; +} // end anonymous namespace + +Expected<MappedFileRegionBumpPtr> MappedFileRegionBumpPtr::create( + const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset, + function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor) { + MappedFileRegionBumpPtr Result; + Result.Path = Path.str(); + // Open the main file. + int FD; + if (std::error_code EC = sys::fs::openFileForReadWrite( + Result.Path, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(Path, EC); + Result.FD = FD; + + // Open the shared lock file. See file comment for details of locking scheme. + SmallString<128> SharedLockPath(Result.Path); + SharedLockPath.append(".shared"); + int SharedLockFD; + if (std::error_code EC = sys::fs::openFileForReadWrite( + SharedLockPath, SharedLockFD, sys::fs::CD_OpenAlways, + sys::fs::OF_None)) + return createFileError(SharedLockPath, EC); + Result.SharedLockFD = SharedLockFD; + + // Take shared/reader lock that will be held until we close the file; unlocked + // by destroyImpl. + if (std::error_code EC = + lockFileThreadSafe(SharedLockFD, /*Exclusive=*/false)) + return createFileError(Path, EC); + + // Take shared/reader lock for initialization. + FileLockRAII InitLock(Result.Path, FD); + if (Error E = InitLock.lock(FileLockRAII::Shared)) + return std::move(E); + + sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD); + sys::fs::file_status Status; + if (std::error_code EC = sys::fs::status(File, Status)) + return createFileError(Result.Path, EC); + + if (Status.getSize() < Capacity) { + // Lock the file exclusively so only one process will do the initialization. + if (Error E = InitLock.unlock()) + return std::move(E); + if (Error E = InitLock.lock(FileLockRAII::Exclusive)) + return std::move(E); + // Retrieve the current size now that we have exclusive access. 
+ if (std::error_code EC = sys::fs::status(File, Status)) + return createFileError(Result.Path, EC); + } + + // At this point either the file is still under-sized, or we have the size for + // the completely initialized file. + + if (Status.getSize() < Capacity) { + // We are initializing the file; it may be empty, or may have been shrunk + // during a previous close. + // FIXME: Detect a case where someone opened it with a smaller capacity. + // FIXME: On Windows we should use FSCTL_SET_SPARSE and FSCTL_SET_ZERO_DATA + // to make this a sparse region, if supported. + if (std::error_code EC = sys::fs::resize_file(FD, Capacity)) + return createFileError(Result.Path, EC); + } else { + // Someone else initialized it. + Capacity = Status.getSize(); + } + + // Create the mapped region. + { + std::error_code EC; + sys::fs::mapped_file_region Map( + File, sys::fs::mapped_file_region::readwrite, Capacity, 0, EC); + if (EC) + return createFileError(Result.Path, EC); + Result.Region = std::move(Map); + } + + if (Status.getSize() == 0) { + // We are creating a new file; run the constructor. + if (Error E = NewFileConstructor(Result)) + return std::move(E); + } else { + Result.initializeBumpPtr(BumpPtrOffset); + } + + return Result; +} + +void MappedFileRegionBumpPtr::destroyImpl() { + if (!FD) + return; + + // Drop the shared lock indicating we are no longer accessing the file. + if (SharedLockFD) + (void)unlockFileThreadSafe(*SharedLockFD); + + // Attempt to truncate the file if we can get exclusive access. Ignore any + // errors. + if (BumpPtr) { + assert(SharedLockFD && "Must have shared lock file open"); + if (tryLockFileThreadSafe(*SharedLockFD) == std::error_code()) { + assert(size() <= capacity()); + (void)sys::fs::resize_file(*FD, size()); + (void)unlockFileThreadSafe(*SharedLockFD); + } + } + + auto Close = [](std::optional<int> &FD) { + if (FD) { + sys::fs::file_t File = sys::fs::convertFDToNativeFile(*FD); + sys::fs::closeFile(File); + FD = std::nullopt; + } + }; + + // Close the file and shared lock. + Close(FD); + Close(SharedLockFD); +} + +void MappedFileRegionBumpPtr::initializeBumpPtr(int64_t BumpPtrOffset) { + assert(capacity() < (uint64_t)INT64_MAX && "capacity must fit in int64_t"); + int64_t BumpPtrEndOffset = BumpPtrOffset + sizeof(decltype(*BumpPtr)); + assert(BumpPtrEndOffset <= (int64_t)capacity() && + "Expected end offset to be pre-allocated"); + assert(isAligned(Align::Of<decltype(*BumpPtr)>(), BumpPtrOffset) && + "Expected end offset to be aligned"); + BumpPtr = reinterpret_cast<decltype(BumpPtr)>(data() + BumpPtrOffset); + + int64_t ExistingValue = 0; + if (!BumpPtr->compare_exchange_strong(ExistingValue, BumpPtrEndOffset)) + assert(ExistingValue >= BumpPtrEndOffset && + "Expected 0, or past the end of the BumpPtr itself"); +} + +int64_t MappedFileRegionBumpPtr::allocateOffset(uint64_t AllocSize) { + AllocSize = alignTo(AllocSize, getAlign()); + int64_t OldEnd = BumpPtr->fetch_add(AllocSize); + int64_t NewEnd = OldEnd + AllocSize; + if (LLVM_UNLIKELY(NewEnd > (int64_t)capacity())) { + // Try to return the allocation. 
+ (void)BumpPtr->compare_exchange_strong(OldEnd, NewEnd); + report_fatal_error( + errorCodeToError(std::make_error_code(std::errc::not_enough_memory))); + } + return OldEnd; +} diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp new file mode 100644 index 0000000..a938c4e --- /dev/null +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -0,0 +1,168 @@ +//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ObjectStore.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; +using namespace llvm::cas; + +void CASContext::anchor() {} +void ObjectStore::anchor() {} + +LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } + +std::string CASID::toString() const { + std::string S; + raw_string_ostream(S) << *this; + return S; +} + +static void printReferenceBase(raw_ostream &OS, StringRef Kind, + uint64_t InternalRef, std::optional<CASID> ID) { + OS << Kind << "=" << InternalRef; + if (ID) + OS << "[" << *ID << "]"; +} + +void ReferenceBase::print(raw_ostream &OS, const ObjectHandle &This) const { + assert(this == &This); + printReferenceBase(OS, "object-handle", InternalRef, std::nullopt); +} + +void ReferenceBase::print(raw_ostream &OS, const ObjectRef &This) const { + assert(this == &This); + + std::optional<CASID> ID; +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + if (CAS) + ID = CAS->getID(This); +#endif + printReferenceBase(OS, "object-ref", InternalRef, ID); +} + +Expected<ObjectHandle> ObjectStore::load(ObjectRef Ref) { + std::optional<ObjectHandle> Handle; + if (Error E = loadIfExists(Ref).moveInto(Handle)) + return std::move(E); + if (!Handle) + return createStringError(errc::invalid_argument, + "missing object '" + getID(Ref).toString() + "'"); + return *Handle; +} + +std::unique_ptr<MemoryBuffer> +ObjectStore::getMemoryBuffer(ObjectHandle Node, StringRef Name, + bool RequiresNullTerminator) { + return MemoryBuffer::getMemBuffer( + toStringRef(getData(Node, RequiresNullTerminator)), Name, + RequiresNullTerminator); +} + +void ObjectStore::readRefs(ObjectHandle Node, + SmallVectorImpl<ObjectRef> &Refs) const { + consumeError(forEachRef(Node, [&Refs](ObjectRef Ref) -> Error { + Refs.push_back(Ref); + return Error::success(); + })); +} + +Expected<ObjectProxy> ObjectStore::getProxy(const CASID &ID) { + std::optional<ObjectRef> Ref = getReference(ID); + if (!Ref) + return createUnknownObjectError(ID); + + return getProxy(*Ref); +} + +Expected<ObjectProxy> ObjectStore::getProxy(ObjectRef Ref) { + std::optional<ObjectHandle> H; + if (Error E = load(Ref).moveInto(H)) + return std::move(E); + + return ObjectProxy::load(*this, Ref, *H); +} + +Expected<std::optional<ObjectProxy>> +ObjectStore::getProxyIfExists(ObjectRef Ref) { + std::optional<ObjectHandle> H; + if (Error E = loadIfExists(Ref).moveInto(H)) + return std::move(E); + if (!H) + return std::nullopt; + return ObjectProxy::load(*this, Ref, *H); +} + 
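
As a usage sketch of the proxy API implemented in this file (illustrative only, not part of the patch; it assumes the in-memory CAS from createInMemoryCAS() above and arbitrary sample data):

```
// Sketch: build a tiny two-node DAG and walk it back through proxies.
#include "llvm/CAS/ObjectStore.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::cas;

static Error buildAndWalk() {
  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();

  // A leaf node with data only, then a root node that references the leaf.
  Expected<ObjectProxy> Leaf = CAS->createProxy(std::nullopt, "leaf data");
  if (!Leaf)
    return Leaf.takeError();
  Expected<ObjectProxy> Root = CAS->createProxy({Leaf->getRef()}, "root data");
  if (!Root)
    return Root.takeError();

  // Walk the root's references; each reference can be loaded into a proxy.
  return Root->forEachReference([&](ObjectRef Ref) -> Error {
    Expected<ObjectProxy> Child = CAS->getProxy(Ref);
    if (!Child)
      return Child.takeError();
    outs() << "child has " << Child->getData().size() << " bytes\n";
    return Error::success();
  });
}
```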
+Error ObjectStore::createUnknownObjectError(const CASID &ID) { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "unknown object '" + ID.toString() + "'"); +} + +Expected<ObjectProxy> ObjectStore::createProxy(ArrayRef<ObjectRef> Refs, + StringRef Data) { + Expected<ObjectRef> Ref = store(Refs, arrayRefFromStringRef<char>(Data)); + if (!Ref) + return Ref.takeError(); + return getProxy(*Ref); +} + +Expected<ObjectRef> +ObjectStore::storeFromOpenFileImpl(sys::fs::file_t FD, + std::optional<sys::fs::file_status> Status) { + // Copy the file into an immutable memory buffer and call \c store on that. + // Using \c mmap would be unsafe because there's a race window between when we + // get the digest hash for the \c mmap contents and when we store the data; if + // the file changes in-between we will create an invalid object. + + // FIXME: For the on-disk CAS implementation use cloning to store it as a + // standalone file if the file-system supports it and the file is large. + + constexpr size_t ChunkSize = 4 * 4096; + SmallString<0> Data; + Data.reserve(ChunkSize * 2); + if (Error E = sys::fs::readNativeFileToEOF(FD, Data, ChunkSize)) + return std::move(E); + return store(std::nullopt, ArrayRef(Data.data(), Data.size())); +} + +Error ObjectStore::validateTree(ObjectRef Root) { + SmallDenseSet<ObjectRef> ValidatedRefs; + SmallVector<ObjectRef, 16> RefsToValidate; + RefsToValidate.push_back(Root); + + while (!RefsToValidate.empty()) { + ObjectRef Ref = RefsToValidate.pop_back_val(); + auto [I, Inserted] = ValidatedRefs.insert(Ref); + if (!Inserted) + continue; // already validated. + if (Error E = validate(getID(Ref))) + return E; + Expected<ObjectHandle> Obj = load(Ref); + if (!Obj) + return Obj.takeError(); + if (Error E = forEachRef(*Obj, [&RefsToValidate](ObjectRef R) -> Error { + RefsToValidate.push_back(R); + return Error::success(); + })) + return E; + } + return Error::success(); +} + +std::unique_ptr<MemoryBuffer> +ObjectProxy::getMemoryBuffer(StringRef Name, + bool RequiresNullTerminator) const { + return CAS->getMemoryBuffer(H, Name, RequiresNullTerminator); +} diff --git a/llvm/lib/CAS/OnDiskCommon.cpp b/llvm/lib/CAS/OnDiskCommon.cpp new file mode 100644 index 0000000..12df239e --- /dev/null +++ b/llvm/lib/CAS/OnDiskCommon.cpp @@ -0,0 +1,73 @@ +//===- OnDiskCommon.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OnDiskCommon.h" +#include <thread> + +#if __has_include(<sys/file.h>) +#include <sys/file.h> +#ifdef LOCK_SH +#define HAVE_FLOCK 1 +#else +#define HAVE_FLOCK 0 +#endif +#endif + +using namespace llvm; + +std::error_code cas::ondisk::lockFileThreadSafe(int FD, bool Exclusive) { +#if HAVE_FLOCK + if (flock(FD, Exclusive ? LOCK_EX : LOCK_SH) == 0) + return std::error_code(); + return std::error_code(errno, std::generic_category()); +#elif defined(_WIN32) + // On Windows this implementation is thread-safe. 
+ return sys::fs::lockFile(FD, Exclusive); +#else + return make_error_code(std::errc::no_lock_available); +#endif +} + +std::error_code cas::ondisk::unlockFileThreadSafe(int FD) { +#if HAVE_FLOCK + if (flock(FD, LOCK_UN) == 0) + return std::error_code(); + return std::error_code(errno, std::generic_category()); +#elif defined(_WIN32) + // On Windows this implementation is thread-safe. + return sys::fs::unlockFile(FD); +#else + return make_error_code(std::errc::no_lock_available); +#endif +} + +std::error_code +cas::ondisk::tryLockFileThreadSafe(int FD, std::chrono::milliseconds Timeout, + bool Exclusive) { +#if HAVE_FLOCK + auto Start = std::chrono::steady_clock::now(); + auto End = Start + Timeout; + do { + if (flock(FD, (Exclusive ? LOCK_EX : LOCK_SH) | LOCK_NB) == 0) + return std::error_code(); + int Error = errno; + if (Error == EWOULDBLOCK) { + // Match sys::fs::tryLockFile, which sleeps for 1 ms per attempt. + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + continue; + } + return std::error_code(Error, std::generic_category()); + } while (std::chrono::steady_clock::now() < End); + return make_error_code(std::errc::no_lock_available); +#elif defined(_WIN32) + // On Windows this implementation is thread-safe. + return sys::fs::tryLockFile(FD, Timeout, Exclusive); +#else + return make_error_code(std::errc::no_lock_available); +#endif +} diff --git a/llvm/lib/CAS/OnDiskCommon.h b/llvm/lib/CAS/OnDiskCommon.h new file mode 100644 index 0000000..08da355 --- /dev/null +++ b/llvm/lib/CAS/OnDiskCommon.h @@ -0,0 +1,35 @@ +//===- OnDiskCommon.h -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CAS_ONDISKCOMMON_H +#define LLVM_LIB_CAS_ONDISKCOMMON_H + +#include <chrono> + +namespace llvm::cas::ondisk { + +/// Thread-safe alternative to \c sys::fs::lockFile. This does not support all +/// the platforms that \c sys::fs::lockFile does, so keep it in the CAS library +/// for now. +std::error_code lockFileThreadSafe(int FD, bool Exclusive = true); + +/// Thread-safe alternative to \c sys::fs::unlockFile. This does not support all +/// the platforms that \c sys::fs::lockFile does, so keep it in the CAS library +/// for now. +std::error_code unlockFileThreadSafe(int FD); + +/// Thread-safe alternative to \c sys::fs::tryLockFile. This does not support +/// all the platforms that \c sys::fs::lockFile does, so keep it in the CAS +/// library for now. +std::error_code tryLockFileThreadSafe( + int FD, std::chrono::milliseconds Timeout = std::chrono::milliseconds(0), + bool Exclusive = true); + +} // namespace llvm::cas::ondisk + +#endif // LLVM_LIB_CAS_ONDISKCOMMON_H diff --git a/llvm/lib/CAS/OnDiskHashMappedTrie.cpp b/llvm/lib/CAS/OnDiskHashMappedTrie.cpp new file mode 100644 index 0000000..1fd5dfe --- /dev/null +++ b/llvm/lib/CAS/OnDiskHashMappedTrie.cpp @@ -0,0 +1,1352 @@ +//===- OnDiskHashMappedTrie.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskHashMappedTrie.h" +#include "llvm/ADT/TrieHashIndexGenerator.h" +#include "llvm/CAS/MappedFileRegionBumpPtr.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::cas; + +#if LLVM_ENABLE_ONDISK_CAS + +static_assert(sizeof(size_t) == sizeof(uint64_t), "64-bit only"); +static_assert(sizeof(std::atomic<int64_t>) == sizeof(uint64_t), + "Requires lock-free 64-bit atomics"); + +//===----------------------------------------------------------------------===// +// Generic database data structures. +//===----------------------------------------------------------------------===// +namespace { +using MappedFileRegion = MappedFileRegionBumpPtr::RegionT; + +/// Generic handle for a table. +/// +/// Probably we want some table kinds for pointing at multiple tables. +/// - Probably a tree or trie type makes sense. +/// - Or a deque. Linear search is okay as long as there aren't many tables in +/// a file. +/// +/// Generic table header layout: +/// - 2-bytes: TableKind +/// - 2-bytes: TableNameSize +/// - 4-bytes: TableNameRelOffset (relative to header) +class TableHandle { +public: + enum class TableKind : uint16_t { + HashMappedTrie = 1, + DataAllocator = 2, + }; + struct Header { + TableKind Kind; + uint16_t NameSize; + int32_t NameRelOffset; // Relative to Header. + }; + + explicit operator bool() const { return H; } + const Header &getHeader() const { return *H; } + MappedFileRegion &getRegion() const { return *Region; } + + template <class T> static void check() { + static_assert( + std::is_same<decltype(T::Header::GenericHeader), Header>::value, + "T::GenericHeader should be of type TableHandle::Header"); + static_assert(offsetof(typename T::Header, GenericHeader) == 0, + "T::GenericHeader must be the head of T::Header"); + } + template <class T> bool is() const { return T::Kind == H->Kind; } + template <class T> T dyn_cast() const { + check<T>(); + if (is<T>()) + return T(*Region, *reinterpret_cast<typename T::Header *>(H)); + return T(); + } + template <class T> T cast() const { + assert(is<T>()); + return dyn_cast<T>(); + } + + StringRef getName() const { + auto *Begin = reinterpret_cast<const char *>(H) + H->NameRelOffset; + return StringRef(Begin, H->NameSize); + } + + TableHandle() = default; + TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {} + TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset) + : TableHandle(Region, + *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; +}; + +/// Encapsulate a database file, which: +/// - Sets/checks magic. +/// - Sets/checks version. +/// - Points at an arbitrary root table (can be changed later using a lock-free +/// algorithm). +/// - Sets up a BumpPtr for allocation. 
+/// +/// Top-level layout: +/// - 8-bytes: Magic +/// - 8-bytes: Version +/// - 8-bytes: RootTable (16-bits: Kind; 48-bits: Offset) +/// - 8-bytes: BumpPtr +class DatabaseFile { +public: + static constexpr uint64_t getMagic() { return 0x00FFDA7ABA53FF00ULL; } + static constexpr uint64_t getVersion() { return 1ULL; } + struct Header { + uint64_t Magic; + uint64_t Version; + std::atomic<int64_t> RootTableOffset; + std::atomic<int64_t> BumpPtr; + }; + + const Header &getHeader() { return *H; } + MappedFileRegionBumpPtr &getAlloc() { return Alloc; } + MappedFileRegion &getRegion() { return Alloc.getRegion(); } + + /// Add a table. + /// + /// TODO: Allow lazy construction via getOrCreate()-style API. + void addTable(TableHandle Table); + + /// Find a table. May return null. + std::optional<TableHandle> findTable(StringRef Name); + + static Expected<DatabaseFile> + create(const Twine &Path, uint64_t Capacity, + function_ref<Error(DatabaseFile &)> NewDBConstructor); + + size_t size() const { return Alloc.size(); } + +private: + static Expected<DatabaseFile> + get(std::unique_ptr<MappedFileRegionBumpPtr> Alloc) { + if (Error E = validate(Alloc->getRegion())) + return std::move(E); + return DatabaseFile(std::move(Alloc)); + } + + static Error validate(MappedFileRegion &Region); + + DatabaseFile(MappedFileRegionBumpPtr &Alloc) + : H(reinterpret_cast<Header *>(Alloc.data())), Alloc(Alloc) {} + DatabaseFile(std::unique_ptr<MappedFileRegionBumpPtr> Alloc) + : DatabaseFile(*Alloc) { + OwnedAlloc = std::move(Alloc); + } + + Header *H = nullptr; + MappedFileRegionBumpPtr &Alloc; + std::unique_ptr<MappedFileRegionBumpPtr> OwnedAlloc; +}; + +} // end anonymous namespace + +Expected<DatabaseFile> +DatabaseFile::create(const Twine &Path, uint64_t Capacity, + function_ref<Error(DatabaseFile &)> NewDBConstructor) { + // Constructor for if the file doesn't exist. + auto NewFileConstructor = [&](MappedFileRegionBumpPtr &Alloc) -> Error { + assert(Alloc.capacity() >= sizeof(Header)); + (void)new (Alloc.data()) Header{getMagic(), getVersion(), {0}, {0}}; + Alloc.initializeBumpPtr(offsetof(Header, BumpPtr)); + DatabaseFile DB(Alloc); + return NewDBConstructor(DB); + }; + + // Get or create the file. + MappedFileRegionBumpPtr Alloc; + if (Error E = MappedFileRegionBumpPtr::create(Path, Capacity, + offsetof(Header, BumpPtr), + NewFileConstructor) + .moveInto(Alloc)) + return std::move(E); + + return DatabaseFile::get( + std::make_unique<MappedFileRegionBumpPtr>(std::move(Alloc))); +} + +void DatabaseFile::addTable(TableHandle Table) { + assert(Table); + assert(&Table.getRegion() == &getRegion()); + int64_t ExistingRootOffset = 0; + const int64_t NewOffset = + reinterpret_cast<const char *>(&Table.getHeader()) - getRegion().data(); + if (H->RootTableOffset.compare_exchange_strong(ExistingRootOffset, NewOffset)) + return; + + // Silently ignore attempts to set the root to itself. + if (ExistingRootOffset == NewOffset) + return; + + // FIXME: Fix the API so that having the same name is not an error. Instead, + // the colliding table should just be used as-is and the client can decide + // what to do with the new one. + // + // TODO: Add support for creating a chain or tree of tables (more than one at + // all!) to avoid this error. 
+ TableHandle Root(getRegion(), ExistingRootOffset); + if (Root.getName() == Table.getName()) + report_fatal_error( + createStringError(make_error_code(std::errc::not_supported), + "table name collision '" + Table.getName() + "'")); + else + report_fatal_error( + createStringError(make_error_code(std::errc::not_supported), + "cannot add new table '" + Table.getName() + + "'" + " to existing root '" + + Root.getName() + "'")); +} + +std::optional<TableHandle> DatabaseFile::findTable(StringRef Name) { + int64_t RootTableOffset = H->RootTableOffset.load(); + if (!RootTableOffset) + return std::nullopt; + + TableHandle Root(getRegion(), RootTableOffset); + if (Root.getName() == Name) + return Root; + + // TODO: Once multiple tables are supported, need to walk to find them. + return std::nullopt; +} + +Error DatabaseFile::validate(MappedFileRegion &Region) { + if (Region.size() < sizeof(Header)) + return createStringError(std::errc::invalid_argument, + "database: missing header"); + + // Check the magic and version. + auto *H = reinterpret_cast<Header *>(Region.data()); + if (H->Magic != getMagic()) + return createStringError(std::errc::invalid_argument, + "database: bad magic"); + if (H->Version != getVersion()) + return createStringError(std::errc::invalid_argument, + "database: wrong version"); + + // Check the bump-ptr, which should point past the header. + if (H->BumpPtr.load() < (int64_t)sizeof(Header)) + return createStringError(std::errc::invalid_argument, + "database: corrupt bump-ptr"); + + return Error::success(); +} + +//===----------------------------------------------------------------------===// +// HashMappedTrie data structures. +//===----------------------------------------------------------------------===// + +namespace { + +class SubtrieHandle; +class SubtrieSlotValue { +public: + explicit operator bool() const { return !isEmpty(); } + bool isEmpty() const { return !Offset; } + bool isData() const { return Offset > 0; } + bool isSubtrie() const { return Offset < 0; } + int64_t asData() const { + assert(isData()); + return Offset; + } + int64_t asSubtrie() const { + assert(isSubtrie()); + return -Offset; + } + + FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); } + + FileOffset asDataFileOffset() const { return FileOffset(asData()); } + + int64_t getRawOffset() const { return Offset; } + + static SubtrieSlotValue getDataOffset(int64_t Offset) { + return SubtrieSlotValue(Offset); + } + + static SubtrieSlotValue getSubtrieOffset(int64_t Offset) { + return SubtrieSlotValue(-Offset); + } + + static SubtrieSlotValue getDataOffset(FileOffset Offset) { + return getDataOffset(Offset.get()); + } + + static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) { + return getDataOffset(Offset.get()); + } + + static SubtrieSlotValue getFromSlot(std::atomic<int64_t> &Slot) { + return SubtrieSlotValue(Slot.load()); + } + + SubtrieSlotValue() = default; + +private: + friend class SubtrieHandle; + explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {} + int64_t Offset = 0; +}; + +class HashMappedTrieHandle; + +/// Subtrie layout: +/// - 2-bytes: StartBit +/// - 1-bytes: NumBits=lg(num-slots) +/// - 1-bytes: NumUnusedBits=lg(num-slots-unused) +/// - 4-bytes: 0-pad +/// - <slots> +class SubtrieHandle { +public: + struct Header { + /// The bit this subtrie starts on. + uint16_t StartBit; + + /// The number of bits this subtrie handles. It has 2^NumBits slots. 
+    uint8_t NumBits;
+
+    /// The number of extra bits this allocation *could* handle, due to
+    /// over-allocation. It has 2^NumUnusedBits unused slots.
+    uint8_t NumUnusedBits;
+
+    /// 0-pad to 8B.
+    uint32_t ZeroPad4B;
+  };
+
+  /// Slot storage:
+  /// - zero: Empty
+  /// - positive: RecordOffset
+  /// - negative: SubtrieOffset
+  using SlotT = std::atomic<int64_t>;
+
+  static int64_t getSlotsSize(uint32_t NumBits) {
+    return sizeof(int64_t) * (1u << NumBits);
+  }
+
+  static int64_t getSize(uint32_t NumBits) {
+    return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits);
+  }
+
+  int64_t getSize() const { return getSize(H->NumBits); }
+
+  SubtrieSlotValue load(size_t I) const {
+    return SubtrieSlotValue(Slots[I].load());
+  }
+  void store(size_t I, SubtrieSlotValue V) {
+    return Slots[I].store(V.getRawOffset());
+  }
+
+  void printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const;
+  void print(raw_ostream &OS, HashMappedTrieHandle Trie,
+             SmallVectorImpl<int64_t> &Records,
+             std::optional<std::string> Prefix = std::nullopt) const;
+
+  /// Compare-exchange the slot at \p I: install \p New if the slot still holds
+  /// \p Expected. Returns true on success; on failure, \p Expected is updated
+  /// to the value currently stored in the slot.
+  bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected,
+                               SubtrieSlotValue New) {
+    return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset);
+  }
+
+  /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with
+  /// \p NumSubtrieBits.
+  ///
+  /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If it
+  /// is already valid on entry, it is used instead of allocating a new
+  /// subtrie. If a new subtrie is created but ends up unused because of a lost
+  /// race, it is returned through this out parameter so it can be passed back
+  /// in on a future call.
+  ///
+  /// Returns the subtrie that now lives at \p I.
+  SubtrieHandle sink(size_t I, SubtrieSlotValue V,
+                     MappedFileRegionBumpPtr &Alloc, size_t NumSubtrieBits,
+                     SubtrieHandle &UnusedSubtrie, size_t NewI);
+
+  /// Only safe if the subtrie is empty.
+  void reinitialize(uint32_t StartBit, uint32_t NumBits);
+
+  SubtrieSlotValue getOffset() const {
+    return SubtrieSlotValue::getSubtrieOffset(
+        reinterpret_cast<const char *>(H) - Region->data());
+  }
+
+  FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); }
+
+  explicit operator bool() const { return H; }
+
+  Header &getHeader() const { return *H; }
+  uint32_t getStartBit() const { return H->StartBit; }
+  uint32_t getNumBits() const { return H->NumBits; }
+  uint32_t getNumUnusedBits() const { return H->NumUnusedBits; }
+
+  static SubtrieHandle create(MappedFileRegionBumpPtr &Alloc, uint32_t StartBit,
+                              uint32_t NumBits, uint32_t NumUnusedBits = 0);
+
+  static SubtrieHandle getFromFileOffset(MappedFileRegion &Region,
+                                         FileOffset Offset) {
+    return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset));
+  }
+
+  SubtrieHandle() = default;
+  SubtrieHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H), Slots(getSlots(H)) {}
+  SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset)
+      : SubtrieHandle(Region, *reinterpret_cast<Header *>(
+                                  Region.data() + Offset.asSubtrie())) {}
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+  MutableArrayRef<SlotT> Slots;
+
+  static MutableArrayRef<SlotT> getSlots(Header &H) {
+    return MutableArrayRef(reinterpret_cast<SlotT *>(&H + 1), 1u << H.NumBits);
+  }
+};
+
+/// Handle for a HashMappedTrie table.
+/// +/// HashMappedTrie table layout: +/// - [8-bytes: Generic table header] +/// - 1-byte: NumSubtrieBits +/// - 1-byte: Flags (not used yet) +/// - 2-bytes: NumHashBits +/// - 4-bytes: RecordDataSize (in bytes) +/// - 8-bytes: RootTrieOffset +/// - 8-bytes: AllocatorOffset (reserved for implementing free lists) +/// - <name> '\0' +/// +/// Record layout: +/// - <data> +/// - <hash> +class HashMappedTrieHandle { +public: + static constexpr TableHandle::TableKind Kind = + TableHandle::TableKind::HashMappedTrie; + + struct Header { + TableHandle::Header GenericHeader; + uint8_t NumSubtrieBits; + uint8_t Flags; // None used yet. + uint16_t NumHashBits; + uint32_t RecordDataSize; + std::atomic<int64_t> RootTrieOffset; + std::atomic<int64_t> AllocatorOffset; + }; + + operator TableHandle() const { + if (!H) + return TableHandle(); + return TableHandle(*Region, H->GenericHeader); + } + + struct RecordData { + OnDiskHashMappedTrie::ValueProxy Proxy; + SubtrieSlotValue Offset; + FileOffset getFileOffset() const { return Offset.asDataFileOffset(); } + }; + + enum Limits : size_t { + /// Seems like 65528 hash bits ought to be enough. + MaxNumHashBytes = UINT16_MAX >> 3, + MaxNumHashBits = MaxNumHashBytes << 3, + + /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit + /// index. This many slots is suspicously large anyway. + MaxNumRootBits = 16, + + /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously + /// large for subtries. + MaxNumSubtrieBits = 10, + }; + + static constexpr size_t getNumHashBytes(size_t NumHashBits) { + assert(NumHashBits % 8 == 0); + return NumHashBits / 8; + } + static constexpr size_t getRecordSize(size_t RecordDataSize, + size_t NumHashBits) { + return RecordDataSize + getNumHashBytes(NumHashBits); + } + + RecordData getRecord(SubtrieSlotValue Offset); + RecordData createRecord(MappedFileRegionBumpPtr &Alloc, + ArrayRef<uint8_t> Hash); + + explicit operator bool() const { return H; } + const Header &getHeader() const { return *H; } + SubtrieHandle getRoot() const; + SubtrieHandle getOrCreateRoot(MappedFileRegionBumpPtr &Alloc); + MappedFileRegion &getRegion() const { return *Region; } + + size_t getFlags() const { return H->Flags; } + uint64_t getNumSubtrieBits() const { return H->NumSubtrieBits; } + uint64_t getNumHashBits() const { return H->NumHashBits; } + size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); } + size_t getRecordDataSize() const { return H->RecordDataSize; } + size_t getRecordSize() const { + return getRecordSize(H->RecordDataSize, H->NumHashBits); + } + + TrieHashIndexGenerator getIndexGen(SubtrieHandle Root, + ArrayRef<uint8_t> Hash) { + assert(Root.getStartBit() == 0); + assert(getNumHashBytes() == Hash.size()); + assert(getNumHashBits() == Hash.size() * 8); + return TrieHashIndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash}; + } + + static HashMappedTrieHandle + create(MappedFileRegionBumpPtr &Alloc, StringRef Name, + std::optional<uint64_t> NumRootBits, uint64_t NumSubtrieBits, + uint64_t NumHashBits, uint64_t RecordDataSize); + + void + print(raw_ostream &OS, + function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const; + + HashMappedTrieHandle() = default; + HashMappedTrieHandle(MappedFileRegion &Region, Header &H) + : Region(&Region), H(&H) {} + HashMappedTrieHandle(MappedFileRegion &Region, intptr_t HeaderOffset) + : HashMappedTrieHandle( + Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + 
+  Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+struct OnDiskHashMappedTrie::ImplType {
+  DatabaseFile File;
+  HashMappedTrieHandle Trie;
+};
+
+SubtrieHandle SubtrieHandle::create(MappedFileRegionBumpPtr &Alloc,
+                                    uint32_t StartBit, uint32_t NumBits,
+                                    uint32_t NumUnusedBits) {
+  assert(StartBit <= HashMappedTrieHandle::MaxNumHashBits);
+  assert(NumBits <= UINT8_MAX);
+  assert(NumUnusedBits <= UINT8_MAX);
+  assert(NumBits + NumUnusedBits <= HashMappedTrieHandle::MaxNumRootBits);
+
+  void *Mem = Alloc.allocate(getSize(NumBits + NumUnusedBits));
+  auto *H =
+      new (Mem) SubtrieHandle::Header{(uint16_t)StartBit, (uint8_t)NumBits,
+                                      (uint8_t)NumUnusedBits, /*ZeroPad4B=*/0};
+  SubtrieHandle S(Alloc.getRegion(), *H);
+  for (auto I = S.Slots.begin(), E = S.Slots.end(); I != E; ++I)
+    new (I) SlotT(0);
+  return S;
+}
+
+SubtrieHandle HashMappedTrieHandle::getRoot() const {
+  if (int64_t Root = H->RootTrieOffset)
+    return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Root));
+  return SubtrieHandle();
+}
+
+SubtrieHandle
+HashMappedTrieHandle::getOrCreateRoot(MappedFileRegionBumpPtr &Alloc) {
+  assert(&Alloc.getRegion() == &getRegion());
+  if (SubtrieHandle Root = getRoot())
+    return Root;
+
+  int64_t Race = 0;
+  SubtrieHandle LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits);
+  if (H->RootTrieOffset.compare_exchange_strong(
+          Race, LazyRoot.getOffset().asSubtrie()))
+    return LazyRoot;
+
+  // There was a race. Return the other root.
+  //
+  // TODO: Avoid leaking the lazy root by storing it in an allocator.
+  return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race));
+}
+
+HashMappedTrieHandle
+HashMappedTrieHandle::create(MappedFileRegionBumpPtr &Alloc, StringRef Name,
+                             std::optional<uint64_t> NumRootBits,
+                             uint64_t NumSubtrieBits, uint64_t NumHashBits,
+                             uint64_t RecordDataSize) {
+  // Allocate.
+  intptr_t Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1);
+
+  // Construct the header and the name.
+  assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+  assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits");
+  assert(NumHashBits <= UINT16_MAX && "Expected valid hash size");
+  assert(RecordDataSize <= UINT32_MAX && "Expected smaller record data size");
+  auto *H = new (Alloc.getRegion().data() + Offset)
+      Header{{TableHandle::TableKind::HashMappedTrie, (uint16_t)Name.size(),
+              (uint32_t)sizeof(Header)},
+             (uint8_t)NumSubtrieBits,
+             /*Flags=*/0,
+             (uint16_t)NumHashBits,
+             (uint32_t)RecordDataSize,
+             /*RootTrieOffset=*/{0},
+             /*AllocatorOffset=*/{0}};
+  char *NameStorage = reinterpret_cast<char *>(H + 1);
+  llvm::copy(Name, NameStorage);
+  NameStorage[Name.size()] = 0;
+
+  // Construct a root trie, if requested.
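+  // (When no root is requested, one is created lazily on the first insert via
+  // getOrCreateRoot().)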
+ HashMappedTrieHandle Trie(Alloc.getRegion(), *H); + if (NumRootBits) + H->RootTrieOffset = + SubtrieHandle::create(Alloc, 0, *NumRootBits).getOffset().asSubtrie(); + return Trie; +} + +HashMappedTrieHandle::RecordData +HashMappedTrieHandle::getRecord(SubtrieSlotValue Offset) { + char *Begin = Region->data() + Offset.asData(); + OnDiskHashMappedTrie::ValueProxy Proxy; + Proxy.Data = MutableArrayRef(Begin, getRecordDataSize()); + Proxy.Hash = ArrayRef(reinterpret_cast<const uint8_t *>(Proxy.Data.end()), + getNumHashBytes()); + return RecordData{Proxy, Offset}; +} + +HashMappedTrieHandle::RecordData +HashMappedTrieHandle::createRecord(MappedFileRegionBumpPtr &Alloc, + ArrayRef<uint8_t> Hash) { + assert(&Alloc.getRegion() == Region); + assert(Hash.size() == getNumHashBytes()); + RecordData Record = getRecord( + SubtrieSlotValue::getDataOffset(Alloc.allocateOffset(getRecordSize()))); + llvm::copy(Hash, const_cast<uint8_t *>(Record.Proxy.Hash.begin())); + return Record; +} + +OnDiskHashMappedTrie::const_pointer +OnDiskHashMappedTrie::recoverFromHashPointer( + const uint8_t *HashBeginPtr) const { + // Record hashes occur immediately after data. Compute the beginning of the + // record and check for overflow. + const uintptr_t HashBegin = reinterpret_cast<uintptr_t>(HashBeginPtr); + const uintptr_t RecordBegin = HashBegin - Impl->Trie.getRecordSize(); + if (HashBegin < RecordBegin) + return const_pointer(); + + // Check that it'll be a positive offset. + const uintptr_t FileBegin = + reinterpret_cast<uintptr_t>(Impl->File.getRegion().data()); + if (RecordBegin < FileBegin) + return const_pointer(); + + // Good enough to form an offset. Continue checking there. + return recoverFromFileOffset(FileOffset(RecordBegin - FileBegin)); +} + +OnDiskHashMappedTrie::const_pointer +OnDiskHashMappedTrie::recoverFromFileOffset(FileOffset Offset) const { + // Check alignment. + if (!isAligned(MappedFileRegionBumpPtr::getAlign(), Offset.get())) + return const_pointer(); + + // Check bounds. + // + // Note: There's no potential overflow when using \c uint64_t because Offset + // is in \c [0,INT64_MAX] and the record size is in \c [0,UINT32_MAX]. + assert(Offset.get() >= 0 && "Expected FileOffset constructor guarantee this"); + if ((uint64_t)Offset.get() + Impl->Trie.getRecordSize() > + Impl->File.getAlloc().size()) + return const_pointer(); + + // Looks okay... + HashMappedTrieHandle::RecordData D = + Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); + return const_pointer(D.getFileOffset(), D.Proxy); +} + +OnDiskHashMappedTrie::const_pointer +OnDiskHashMappedTrie::find(ArrayRef<uint8_t> Hash) const { + HashMappedTrieHandle Trie = Impl->Trie; + assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); + + SubtrieHandle S = Trie.getRoot(); + if (!S) + return const_pointer(); + + TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash); + size_t Index = IndexGen.next(); + for (;;) { + // Try to set the content. + SubtrieSlotValue V = S.load(Index); + if (!V) + return const_pointer(S.getFileOffset(), + HintT(this, Index, *IndexGen.StartBit)); + + // Check for an exact match. + if (V.isData()) { + HashMappedTrieHandle::RecordData D = Trie.getRecord(V); + return D.Proxy.Hash == Hash + ? const_pointer(D.getFileOffset(), D.Proxy) + : const_pointer(S.getFileOffset(), + HintT(this, Index, *IndexGen.StartBit)); + } + + Index = IndexGen.next(); + S = SubtrieHandle(Trie.getRegion(), V); + } +} + +/// Only safe if the subtrie is empty. 
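+/// Used by sink() to recycle a subtrie that was allocated but lost the race
+/// to be installed.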
+void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) { + assert(StartBit > H->StartBit); + assert(NumBits <= H->NumBits); + // Ideally would also assert that all slots are empty, but that's expensive. + + H->StartBit = StartBit; + H->NumBits = NumBits; +} + +OnDiskHashMappedTrie::pointer +OnDiskHashMappedTrie::insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash, + LazyInsertOnConstructCB OnConstruct, + LazyInsertOnLeakCB OnLeak) { + HashMappedTrieHandle Trie = Impl->Trie; + assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); + + MappedFileRegionBumpPtr &Alloc = Impl->File.getAlloc(); + SubtrieHandle S = Trie.getOrCreateRoot(Alloc); + TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash); + + size_t Index; + if (std::optional<HintT> H = Hint.getHint(*this)) { + S = SubtrieHandle::getFromFileOffset(Trie.getRegion(), Hint.getOffset()); + Index = IndexGen.hint(H->I, H->B); + } else { + Index = IndexGen.next(); + } + + // FIXME: Add non-assertion based checks for data corruption that would + // otherwise cause infinite loops in release builds, instead calling + // report_fatal_error(). + // + // Two loops are possible: + // - All bits used up in the IndexGenerator because subtries are somehow + // linked in a cycle. Could confirm that each subtrie's start-bit + // follows from the start-bit and num-bits of its parent. Could also check + // that the generator doesn't run out of bits. + // - Existing data matches tail of Hash but not the head (stored in an + // invalid spot). Probably a cheap way to check this too, but needs + // thought. + std::optional<HashMappedTrieHandle::RecordData> NewRecord; + SubtrieHandle UnusedSubtrie; + for (;;) { + SubtrieSlotValue Existing = S.load(Index); + + // Try to set it, if it's empty. + if (!Existing) { + if (!NewRecord) { + NewRecord = Trie.createRecord(Alloc, Hash); + if (OnConstruct) + OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy); + } + + if (S.compare_exchange_strong(Index, Existing, NewRecord->Offset)) + return pointer(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy); + + // Race means that Existing is no longer empty; fall through... + } + + if (Existing.isSubtrie()) { + S = SubtrieHandle(Trie.getRegion(), Existing); + Index = IndexGen.next(); + continue; + } + + // Check for an exact match. + HashMappedTrieHandle::RecordData ExistingRecord = Trie.getRecord(Existing); + if (ExistingRecord.Proxy.Hash == Hash) { + if (NewRecord && OnLeak) + OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy, + ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy); + return pointer(ExistingRecord.Offset.asDataFileOffset(), + ExistingRecord.Proxy); + } + + // Sink the existing content as long as the indexes match. + for (;;) { + size_t NextIndex = IndexGen.next(); + size_t NewIndexForExistingContent = + IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash); + + S = S.sink(Index, Existing, Alloc, IndexGen.getNumBits(), UnusedSubtrie, + NewIndexForExistingContent); + Index = NextIndex; + + // Found the difference. + if (NextIndex != NewIndexForExistingContent) + break; + } + } +} + +SubtrieHandle SubtrieHandle::sink(size_t I, SubtrieSlotValue V, + MappedFileRegionBumpPtr &Alloc, + size_t NumSubtrieBits, + SubtrieHandle &UnusedSubtrie, size_t NewI) { + SubtrieHandle NewS; + if (UnusedSubtrie) { + // Steal UnusedSubtrie and initialize it. + std::swap(NewS, UnusedSubtrie); + NewS.reinitialize(getStartBit() + getNumBits(), NumSubtrieBits); + } else { + // Allocate a new, empty subtrie. 
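+    // Its range starts at the first bit after this subtrie's bits.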
+ NewS = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(), + NumSubtrieBits); + } + + NewS.store(NewI, V); + if (compare_exchange_strong(I, V, NewS.getOffset())) + return NewS; // Success! + + // Raced. + assert(V.isSubtrie() && "Expected racing sink() to add a subtrie"); + + // Wipe out the new slot so NewS can be reused and set the out parameter. + NewS.store(NewI, SubtrieSlotValue()); + UnusedSubtrie = NewS; + + // Return the subtrie added by the concurrent sink() call. + return SubtrieHandle(Alloc.getRegion(), V); +} + +void OnDiskHashMappedTrie::print( + raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const { + Impl->Trie.print(OS, PrintRecordData); +} + +static void printHexDigit(raw_ostream &OS, uint8_t Digit) { + if (Digit < 10) + OS << char(Digit + '0'); + else + OS << char(Digit - 10 + 'a'); +} + +static void printHexDigits(raw_ostream &OS, ArrayRef<uint8_t> Bytes, + size_t StartBit, size_t NumBits) { + assert(StartBit % 4 == 0); + assert(NumBits % 4 == 0); + for (size_t I = StartBit, E = StartBit + NumBits; I != E; I += 4) { + uint8_t HexPair = Bytes[I / 8]; + uint8_t HexDigit = I % 8 == 0 ? HexPair >> 4 : HexPair & 0xf; + printHexDigit(OS, HexDigit); + } +} + +void HashMappedTrieHandle::print( + raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const { + OS << "hash-num-bits=" << getNumHashBits() + << " hash-size=" << getNumHashBytes() + << " record-data-size=" << getRecordDataSize() << "\n"; + SubtrieHandle Root = getRoot(); + + SmallVector<int64_t> Records; + if (Root) + Root.print(OS, *this, Records); + + if (Records.empty()) + return; + llvm::sort(Records); + OS << "records\n"; + for (int64_t Offset : Records) { + OS << "- addr=" << (void *)Offset << " "; + HashMappedTrieHandle Trie = *this; + HashMappedTrieHandle::RecordData Record = + Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); + if (PrintRecordData) { + PrintRecordData(Record.Proxy.Data); + } else { + OS << "bytes="; + ArrayRef<uint8_t> Data( + reinterpret_cast<const uint8_t *>(Record.Proxy.Data.data()), + Record.Proxy.Data.size()); + printHexDigits(OS, Data, 0, Data.size() * 8); + } + OS << "\n"; + } +} + +static void printBits(raw_ostream &OS, ArrayRef<uint8_t> Bytes, size_t StartBit, + size_t NumBits) { + assert(StartBit + NumBits <= Bytes.size() * 8u); + for (size_t I = StartBit, E = StartBit + NumBits; I != E; ++I) { + uint8_t Byte = Bytes[I / 8]; + size_t ByteOffset = I % 8; + if (size_t ByteShift = 8 - ByteOffset - 1) + Byte >>= ByteShift; + OS << (Byte & 0x1 ? 
'1' : '0'); + } +} + +void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const { + // afb[1c:00*01110*0]def + size_t EndBit = getStartBit() + getNumBits(); + size_t HashEndBit = Bytes.size() * 8u; + + size_t FirstBinaryBit = getStartBit() & ~0x3u; + printHexDigits(OS, Bytes, 0, FirstBinaryBit); + + size_t LastBinaryBit = (EndBit + 3u) & ~0x3u; + OS << "["; + printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit); + OS << "]"; + + printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit); +} + +static void appendIndexBits(std::string &Prefix, size_t Index, + size_t NumSlots) { + std::string Bits; + for (size_t NumBits = 1u; NumBits < NumSlots; NumBits <<= 1) { + Bits.push_back('0' + (Index & 0x1)); + Index >>= 1; + } + for (char Ch : llvm::reverse(Bits)) + Prefix += Ch; +} + +static void printPrefix(raw_ostream &OS, StringRef Prefix) { + while (Prefix.size() >= 4) { + uint8_t Digit; + bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Digit); + assert(!ErrorParsingBinary); + (void)ErrorParsingBinary; + printHexDigit(OS, Digit); + Prefix = Prefix.drop_front(4); + } + if (!Prefix.empty()) + OS << "[" << Prefix << "]"; +} + +void SubtrieHandle::print(raw_ostream &OS, HashMappedTrieHandle Trie, + SmallVectorImpl<int64_t> &Records, + std::optional<std::string> Prefix) const { + if (!Prefix) { + OS << "root"; + Prefix.emplace(); + } else { + OS << "subtrie="; + printPrefix(OS, *Prefix); + } + + OS << " addr=" + << (void *)(reinterpret_cast<const char *>(H) - Region->data()); + + const size_t NumSlots = Slots.size(); + OS << " num-slots=" << NumSlots << "\n"; + SmallVector<SubtrieHandle> Subs; + SmallVector<std::string> Prefixes; + for (size_t I = 0, E = NumSlots; I != E; ++I) { + SubtrieSlotValue Slot = load(I); + if (!Slot) + continue; + OS << "- index="; + for (size_t Pad : {10, 100, 1000}) + if (I < Pad && NumSlots >= Pad) + OS << "0"; + OS << I << " "; + if (Slot.isSubtrie()) { + SubtrieHandle S(*Region, Slot); + std::string SubtriePrefix = *Prefix; + appendIndexBits(SubtriePrefix, I, NumSlots); + OS << "addr=" << (void *)Slot.asSubtrie(); + OS << " subtrie="; + printPrefix(OS, SubtriePrefix); + OS << "\n"; + Subs.push_back(S); + Prefixes.push_back(SubtriePrefix); + continue; + } + Records.push_back(Slot.asData()); + HashMappedTrieHandle::RecordData Record = Trie.getRecord(Slot); + OS << "addr=" << (void *)Record.getFileOffset().get(); + OS << " content="; + printHash(OS, Record.Proxy.Hash); + OS << "\n"; + } + for (size_t I = 0, E = Subs.size(); I != E; ++I) + Subs[I].print(OS, Trie, Records, Prefixes[I]); +} + +LLVM_DUMP_METHOD void OnDiskHashMappedTrie::dump() const { print(dbgs()); } + +static Error createTableConfigError(std::errc ErrC, StringRef Path, + StringRef TableName, const Twine &Msg) { + return createStringError(make_error_code(ErrC), + Path + "[" + TableName + "]: " + Msg); +} + +static Expected<size_t> checkParameter(StringRef Label, size_t Max, + std::optional<size_t> Value, + std::optional<size_t> Default, + StringRef Path, StringRef TableName) { + assert(Value || Default); + assert(!Default || *Default <= Max); + if (!Value) + return *Default; + + if (*Value <= Max) + return *Value; + return createTableConfigError( + std::errc::argument_out_of_domain, Path, TableName, + "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) + ")"); +} + +static Error checkTable(StringRef Label, size_t Expected, size_t Observed, + StringRef Path, StringRef TrieName) { + if (Expected == Observed) + return Error::success(); 
+ return createTableConfigError(std::errc::invalid_argument, Path, TrieName, + "mismatched " + Label + + " (expected: " + Twine(Expected) + + ", observed: " + Twine(Observed) + ")"); +} + +size_t OnDiskHashMappedTrie::size() const { return Impl->File.size(); } + +Expected<OnDiskHashMappedTrie> +OnDiskHashMappedTrie::create(const Twine &PathTwine, const Twine &TrieNameTwine, + size_t NumHashBits, uint64_t DataSize, + uint64_t MaxFileSize, + std::optional<uint64_t> NewFileInitialSize, + std::optional<size_t> NewTableNumRootBits, + std::optional<size_t> NewTableNumSubtrieBits) { + SmallString<128> PathStorage; + StringRef Path = PathTwine.toStringRef(PathStorage); + SmallString<128> TrieNameStorage; + StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage); + + constexpr size_t DefaultNumRootBits = 10; + constexpr size_t DefaultNumSubtrieBits = 6; + + size_t NumRootBits; + if (Error E = checkParameter( + "root bits", HashMappedTrieHandle::MaxNumRootBits, + NewTableNumRootBits, DefaultNumRootBits, Path, TrieName) + .moveInto(NumRootBits)) + return std::move(E); + + size_t NumSubtrieBits; + if (Error E = checkParameter("subtrie bits", + HashMappedTrieHandle::MaxNumSubtrieBits, + NewTableNumSubtrieBits, DefaultNumSubtrieBits, + Path, TrieName) + .moveInto(NumSubtrieBits)) + return std::move(E); + + size_t NumHashBytes = NumHashBits >> 3; + if (Error E = + checkParameter("hash size", HashMappedTrieHandle::MaxNumHashBits, + NumHashBits, std::nullopt, Path, TrieName) + .takeError()) + return std::move(E); + assert(NumHashBits == NumHashBytes << 3 && + "Expected hash size to be byte-aligned"); + if (NumHashBits != NumHashBytes << 3) + return createTableConfigError( + std::errc::argument_out_of_domain, Path, TrieName, + "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)"); + + // Constructor for if the file doesn't exist. + auto NewDBConstructor = [&](DatabaseFile &DB) -> Error { + HashMappedTrieHandle Trie = + HashMappedTrieHandle::create(DB.getAlloc(), TrieName, NumRootBits, + NumSubtrieBits, NumHashBits, DataSize); + DB.addTable(Trie); + return Error::success(); + }; + + // Get or create the file. + Expected<DatabaseFile> File = + DatabaseFile::create(Path, MaxFileSize, NewDBConstructor); + if (!File) + return File.takeError(); + + // Find the trie and validate it. + // + // TODO: Add support for creating/adding a table to an existing file. + std::optional<TableHandle> Table = File->findTable(TrieName); + if (!Table) + return createTableConfigError(std::errc::argument_out_of_domain, Path, + TrieName, "table not found"); + if (Error E = checkTable("table kind", (size_t)HashMappedTrieHandle::Kind, + (size_t)Table->getHeader().Kind, Path, TrieName)) + return std::move(E); + auto Trie = Table->cast<HashMappedTrieHandle>(); + assert(Trie && "Already checked the kind"); + + // Check the hash and data size. + if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(), + Path, TrieName)) + return std::move(E); + if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(), + Path, TrieName)) + return std::move(E); + + // No flags supported right now. Either corrupt, or coming from a future + // writer. + if (size_t Flags = Trie.getFlags()) + return createTableConfigError(std::errc::invalid_argument, Path, TrieName, + "unsupported flags: " + Twine(Flags)); + + // Success. 
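+  // The trie handle points into the file's mapped region, so keep the two
+  // together in the Impl.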
+ OnDiskHashMappedTrie::ImplType Impl{DatabaseFile(std::move(*File)), Trie}; + return OnDiskHashMappedTrie(std::make_unique<ImplType>(std::move(Impl))); +} + +//===----------------------------------------------------------------------===// +// DataAllocator data structures. +//===----------------------------------------------------------------------===// + +namespace { +/// DataAllocator table layout: +/// - [8-bytes: Generic table header] +/// - 8-bytes: AllocatorOffset (reserved for implementing free lists) +/// - 8-bytes: Size for user data header +/// - <user data buffer> +/// +/// Record layout: +/// - <data> +class DataAllocatorHandle { +public: + static constexpr TableHandle::TableKind Kind = + TableHandle::TableKind::DataAllocator; + + struct Header { + TableHandle::Header GenericHeader; + std::atomic<int64_t> AllocatorOffset; + const uint64_t UserHeaderSize; + }; + + operator TableHandle() const { + if (!H) + return TableHandle(); + return TableHandle(*Region, H->GenericHeader); + } + + MutableArrayRef<char> allocate(MappedFileRegionBumpPtr &Alloc, + size_t DataSize) { + assert(&Alloc.getRegion() == Region); + return MutableArrayRef(Alloc.allocate(DataSize), DataSize); + } + + explicit operator bool() const { return H; } + const Header &getHeader() const { return *H; } + MappedFileRegion &getRegion() const { return *Region; } + + MutableArrayRef<uint8_t> getUserHeader() { + return MutableArrayRef(reinterpret_cast<uint8_t *>(H + 1), + H->UserHeaderSize); + } + + static DataAllocatorHandle create(MappedFileRegionBumpPtr &Alloc, + StringRef Name, uint32_t UserHeaderSize); + + DataAllocatorHandle() = default; + DataAllocatorHandle(MappedFileRegion &Region, Header &H) + : Region(&Region), H(&H) {} + DataAllocatorHandle(MappedFileRegion &Region, intptr_t HeaderOffset) + : DataAllocatorHandle( + Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; +}; + +} // end anonymous namespace + +struct OnDiskDataAllocator::ImplType { + DatabaseFile File; + DataAllocatorHandle Store; +}; + +DataAllocatorHandle DataAllocatorHandle::create(MappedFileRegionBumpPtr &Alloc, + StringRef Name, + uint32_t UserHeaderSize) { + // Allocate. + intptr_t Offset = + Alloc.allocateOffset(sizeof(Header) + UserHeaderSize + Name.size() + 1); + + // Construct the header and the name. + assert(Name.size() <= UINT16_MAX && "Expected smaller table name"); + auto *H = new (Alloc.getRegion().data() + Offset) + Header{{TableHandle::TableKind::DataAllocator, (uint16_t)Name.size(), + (int32_t)(sizeof(Header) + UserHeaderSize)}, + /*AllocatorOffset=*/{0}, + /*UserHeaderSize=*/UserHeaderSize}; + memset(H + 1, 0, UserHeaderSize); + char *NameStorage = reinterpret_cast<char *>(H + 1) + UserHeaderSize; + llvm::copy(Name, NameStorage); + NameStorage[Name.size()] = 0; + return DataAllocatorHandle(Alloc.getRegion(), *H); +} + +Expected<OnDiskDataAllocator> OnDiskDataAllocator::create( + const Twine &PathTwine, const Twine &TableNameTwine, uint64_t MaxFileSize, + std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize, + function_ref<void(void *)> UserHeaderInit) { + assert(!UserHeaderSize || UserHeaderInit); + SmallString<128> PathStorage; + StringRef Path = PathTwine.toStringRef(PathStorage); + SmallString<128> TableNameStorage; + StringRef TableName = TableNameTwine.toStringRef(TableNameStorage); + + // Constructor for if the file doesn't exist. 
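+  // As with the trie above: create the table, register it with the database,
+  // and initialize the user header if one was requested.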
+ auto NewDBConstructor = [&](DatabaseFile &DB) -> Error { + DataAllocatorHandle Store = + DataAllocatorHandle::create(DB.getAlloc(), TableName, UserHeaderSize); + DB.addTable(Store); + if (UserHeaderSize) + UserHeaderInit(Store.getUserHeader().data()); + return Error::success(); + }; + + // Get or create the file. + Expected<DatabaseFile> File = + DatabaseFile::create(Path, MaxFileSize, NewDBConstructor); + if (!File) + return File.takeError(); + + // Find the table and validate it. + // + // TODO: Add support for creating/adding a table to an existing file. + std::optional<TableHandle> Table = File->findTable(TableName); + if (!Table) + return createTableConfigError(std::errc::argument_out_of_domain, Path, + TableName, "table not found"); + if (Error E = checkTable("table kind", (size_t)DataAllocatorHandle::Kind, + (size_t)Table->getHeader().Kind, Path, TableName)) + return std::move(E); + auto Store = Table->cast<DataAllocatorHandle>(); + assert(Store && "Already checked the kind"); + + // Success. + OnDiskDataAllocator::ImplType Impl{DatabaseFile(std::move(*File)), Store}; + return OnDiskDataAllocator(std::make_unique<ImplType>(std::move(Impl))); +} + +OnDiskDataAllocator::pointer OnDiskDataAllocator::allocate(size_t Size) { + MutableArrayRef<char> Data = + Impl->Store.allocate(Impl->File.getAlloc(), Size); + return pointer(FileOffset(Data.data() - Impl->Store.getRegion().data()), + Data); +} + +const char *OnDiskDataAllocator::beginData(FileOffset Offset) const { + assert(Offset); + assert(Impl); + assert(Offset.get() < (int64_t)Impl->File.getAlloc().size()); + return Impl->File.getRegion().data() + Offset.get(); +} + +MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() { + return Impl->Store.getUserHeader(); +} + +size_t OnDiskDataAllocator::size() const { return Impl->File.size(); } + +OnDiskDataAllocator::OnDiskDataAllocator(std::unique_ptr<ImplType> Impl) + : Impl(std::move(Impl)) {} + +#else // !LLVM_ENABLE_ONDISK_CAS + +struct OnDiskHashMappedTrie::ImplType {}; + +Expected<OnDiskHashMappedTrie> +OnDiskHashMappedTrie::create(const Twine &PathTwine, const Twine &TrieNameTwine, + size_t NumHashBits, uint64_t DataSize, + uint64_t MaxFileSize, + std::optional<uint64_t> NewFileInitialSize, + std::optional<size_t> NewTableNumRootBits, + std::optional<size_t> NewTableNumSubtrieBits) { + report_fatal_error("not supported"); +} + +OnDiskHashMappedTrie::pointer +OnDiskHashMappedTrie::insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash, + LazyInsertOnConstructCB OnConstruct, + LazyInsertOnLeakCB OnLeak) { + report_fatal_error("not supported"); +} + +OnDiskHashMappedTrie::const_pointer +OnDiskHashMappedTrie::recoverFromFileOffset(FileOffset Offset) const { + report_fatal_error("not supported"); +} + +OnDiskHashMappedTrie::const_pointer +OnDiskHashMappedTrie::find(ArrayRef<uint8_t> Hash) const { + report_fatal_error("not supported"); +} + +void OnDiskHashMappedTrie::print( + raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const { + report_fatal_error("not supported"); +} + +size_t OnDiskHashMappedTrie::size() const { + report_fatal_error("not supported"); +} + +struct OnDiskDataAllocator::ImplType {}; + +Expected<OnDiskDataAllocator> OnDiskDataAllocator::create( + const Twine &Path, const Twine &TableName, uint64_t MaxFileSize, + std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize, + function_ref<void(void *)> UserHeaderInit) { + report_fatal_error("not supported"); +} + +OnDiskDataAllocator::pointer OnDiskDataAllocator::allocate(size_t Size) { 
+ report_fatal_error("not supported"); +} + +const char *OnDiskDataAllocator::beginData(FileOffset Offset) const { + report_fatal_error("not supported"); +} + +MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() { + report_fatal_error("not supported"); +} + +size_t OnDiskDataAllocator::size() const { + report_fatal_error("not supported"); +} + +#endif // LLVM_ENABLE_ONDISK_CAS + +OnDiskHashMappedTrie::OnDiskHashMappedTrie(std::unique_ptr<ImplType> Impl) + : Impl(std::move(Impl)) {} +OnDiskHashMappedTrie::OnDiskHashMappedTrie(OnDiskHashMappedTrie &&RHS) = + default; +OnDiskHashMappedTrie & +OnDiskHashMappedTrie::operator=(OnDiskHashMappedTrie &&RHS) = default; +OnDiskHashMappedTrie::~OnDiskHashMappedTrie() = default; + +OnDiskDataAllocator::OnDiskDataAllocator(OnDiskDataAllocator &&RHS) = default; +OnDiskDataAllocator & +OnDiskDataAllocator::operator=(OnDiskDataAllocator &&RHS) = default; +OnDiskDataAllocator::~OnDiskDataAllocator() = default; diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt index 503c77cb..b06f4ff 100644 --- a/llvm/lib/CMakeLists.txt +++ b/llvm/lib/CMakeLists.txt @@ -9,6 +9,7 @@ add_subdirectory(FileCheck) add_subdirectory(InterfaceStub) add_subdirectory(IRPrinter) add_subdirectory(IRReader) +add_subdirectory(CAS) add_subdirectory(CGData) add_subdirectory(CodeGen) add_subdirectory(CodeGenTypes) diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index 44097ba..9f6f15b 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -1223,13 +1223,14 @@ Expected<size_t> readNativeFileSlice(file_t FD, MutableArrayRef<char> Buf, return NumRead; } -std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { +std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout, + bool Exclusive) { auto Start = std::chrono::steady_clock::now(); auto End = Start + Timeout; do { struct flock Lock; memset(&Lock, 0, sizeof(Lock)); - Lock.l_type = F_WRLCK; + Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK; Lock.l_whence = SEEK_SET; Lock.l_start = 0; Lock.l_len = 0; @@ -1238,15 +1239,17 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { int Error = errno; if (Error != EACCES && Error != EAGAIN) return std::error_code(Error, std::generic_category()); + if (Timeout.count() == 0) + break; usleep(1000); } while (std::chrono::steady_clock::now() < End); return make_error_code(errc::no_lock_available); } -std::error_code lockFile(int FD) { +std::error_code lockFile(int FD, bool Exclusive) { struct flock Lock; memset(&Lock, 0, sizeof(Lock)); - Lock.l_type = F_WRLCK; + Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK; Lock.l_whence = SEEK_SET; Lock.l_start = 0; Lock.l_len = 0; diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index c4bd5e2..07ee3d9 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -1327,8 +1327,10 @@ Expected<size_t> readNativeFileSlice(file_t FileHandle, return readNativeFileImpl(FileHandle, Buf, &Overlapped); } -std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { - DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY; +std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout, + bool Exclusive) { + DWORD Flags = Exclusive ? 
LOCKFILE_EXCLUSIVE_LOCK : 0;
+  Flags |= LOCKFILE_FAIL_IMMEDIATELY;
   OVERLAPPED OV = {};
   file_t File = convertFDToNativeFile(FD);
   auto Start = std::chrono::steady_clock::now();
@@ -1338,6 +1340,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
       return std::error_code();
     DWORD Error = ::GetLastError();
     if (Error == ERROR_LOCK_VIOLATION) {
+      if (Timeout.count() == 0)
+        break;
       ::Sleep(1);
       continue;
     }
@@ -1346,8 +1350,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
   return mapWindowsError(ERROR_LOCK_VIOLATION);
 }
-std::error_code lockFile(int FD) {
-  DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK;
+std::error_code lockFile(int FD, bool Exclusive) {
+  DWORD Flags = Exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0;
   OVERLAPPED OV = {};
   file_t File = convertFDToNativeFile(FD);
   if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV))
diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp
new file mode 100644
index 0000000..a74a3e9
--- /dev/null
+++ b/llvm/unittests/CAS/ActionCacheTest.cpp
@@ -0,0 +1,73 @@
+//===- ActionCacheTest.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/ActionCache.h"
+#include "CASTestConfig.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+TEST_P(CASTest, ActionCacheHit) {
+  std::shared_ptr<ObjectStore> CAS = createObjectStore();
+  std::unique_ptr<ActionCache> Cache = createActionCache();
+
+  std::optional<ObjectProxy> ID;
+  ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID),
+                    Succeeded());
+  std::optional<CASID> ResultID;
+  ASSERT_THAT_ERROR(Cache->put(*ID, *ID), Succeeded());
+  ASSERT_THAT_ERROR(Cache->get(*ID).moveInto(ResultID), Succeeded());
+  ASSERT_TRUE(ResultID);
+  std::optional<ObjectRef> Result = CAS->getReference(*ResultID);
+  ASSERT_TRUE(Result);
+  ASSERT_EQ(*ID, *Result);
+}
+
+TEST_P(CASTest, ActionCacheMiss) {
+  std::shared_ptr<ObjectStore> CAS = createObjectStore();
+  std::unique_ptr<ActionCache> Cache = createActionCache();
+
+  std::optional<ObjectProxy> ID1, ID2;
+  ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+                    Succeeded());
+  ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+                    Succeeded());
+  ASSERT_THAT_ERROR(Cache->put(*ID1, *ID2), Succeeded());
+  // This is a cache miss for looking up a key that doesn't exist.
+  std::optional<CASID> Result1;
+  ASSERT_THAT_ERROR(Cache->get(*ID2).moveInto(Result1), Succeeded());
+  ASSERT_FALSE(Result1);
+
+  ASSERT_THAT_ERROR(Cache->put(*ID2, *ID1), Succeeded());
+  // Cache hit after adding the value.
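+  // The same lookup that missed above should now return ID1.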
+  std::optional<CASID> Result2;
+  ASSERT_THAT_ERROR(Cache->get(*ID2).moveInto(Result2), Succeeded());
+  ASSERT_TRUE(Result2);
+  std::optional<ObjectRef> Ref = CAS->getReference(*Result2);
+  ASSERT_TRUE(Ref);
+  ASSERT_EQ(*ID1, *Ref);
+}
+
+TEST_P(CASTest, ActionCacheRewrite) {
+  std::shared_ptr<ObjectStore> CAS = createObjectStore();
+  std::unique_ptr<ActionCache> Cache = createActionCache();
+
+  std::optional<ObjectProxy> ID1, ID2;
+  ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+                    Succeeded());
+  ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+                    Succeeded());
+  ASSERT_THAT_ERROR(Cache->put(*ID1, *ID1), Succeeded());
+  // Writing a different value to the same key is an error.
+  ASSERT_THAT_ERROR(Cache->put(*ID1, *ID2), Failed());
+  // Writing the same value multiple times to the same key is fine.
+  ASSERT_THAT_ERROR(Cache->put(*ID1, *ID1), Succeeded());
+}
diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
new file mode 100644
index 0000000..1cb7b43
--- /dev/null
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -0,0 +1,23 @@
+//===- CASTestConfig.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CASTestConfig.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+CASTestingEnv createInMemory(int I) {
+  std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+  std::unique_ptr<ActionCache> Cache = createInMemoryActionCache();
+  return CASTestingEnv{std::move(CAS), std::move(Cache)};
+}
+
+INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest,
+                         ::testing::Values(createInMemory));
diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
new file mode 100644
index 0000000..8093a0b
--- /dev/null
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -0,0 +1,38 @@
+//===- CASTestConfig.h ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "gtest/gtest.h" + +#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H +#define LLVM_UNITTESTS_CASTESTCONFIG_H + +struct CASTestingEnv { + std::unique_ptr<llvm::cas::ObjectStore> CAS; + std::unique_ptr<llvm::cas::ActionCache> Cache; +}; + +class CASTest + : public testing::TestWithParam<std::function<CASTestingEnv(int)>> { +protected: + std::optional<int> NextCASIndex; + + std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() { + auto TD = GetParam()(++(*NextCASIndex)); + return std::move(TD.CAS); + } + std::unique_ptr<llvm::cas::ActionCache> createActionCache() { + auto TD = GetParam()(++(*NextCASIndex)); + return std::move(TD.Cache); + } + void SetUp() { NextCASIndex = 0; } + void TearDown() { NextCASIndex = std::nullopt; } +}; + +#endif diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt new file mode 100644 index 0000000..1550fc0 --- /dev/null +++ b/llvm/unittests/CAS/CMakeLists.txt @@ -0,0 +1,19 @@ +if (LLVM_ENABLE_ONDISK_CAS) + add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1) +endif() + +set(LLVM_LINK_COMPONENTS + Support + CAS + TestingSupport + ) + +add_llvm_unittest(CASTests + ActionCacheTest.cpp + CASTestConfig.cpp + ObjectStoreTest.cpp + OnDiskHashMappedTrieTest.cpp + ProgramTest.cpp + ) + +target_link_libraries(CASTests PRIVATE LLVMTestingSupport) diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp new file mode 100644 index 0000000..0d94731 --- /dev/null +++ b/llvm/unittests/CAS/ObjectStoreTest.cpp @@ -0,0 +1,360 @@ +//===- ObjectStoreTest.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/ThreadPool.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" + +#include "CASTestConfig.h" + +using namespace llvm; +using namespace llvm::cas; + +TEST_P(CASTest, PrintIDs) { + std::unique_ptr<ObjectStore> CAS = createObjectStore(); + + std::optional<CASID> ID1, ID2; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2), + Succeeded()); + EXPECT_NE(ID1, ID2); + std::string PrintedID1 = ID1->toString(); + std::string PrintedID2 = ID2->toString(); + EXPECT_NE(PrintedID1, PrintedID2); + + std::optional<CASID> ParsedID1, ParsedID2; + ASSERT_THAT_ERROR(CAS->parseID(PrintedID1).moveInto(ParsedID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->parseID(PrintedID2).moveInto(ParsedID2), Succeeded()); + EXPECT_EQ(ID1, ParsedID1); + EXPECT_EQ(ID2, ParsedID2); +} + +TEST_P(CASTest, Blobs) { + std::unique_ptr<ObjectStore> CAS1 = createObjectStore(); + StringRef ContentStrings[] = { + "word", + "some longer text std::string's local memory", + R"(multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text)", + }; + + SmallVector<CASID> IDs; + for (StringRef Content : ContentStrings) { + // Use StringRef::str() to create a temporary std::string. This could cause + // problems if the CAS is storing references to the input string instead of + // copying it. + std::optional<ObjectProxy> Blob; + ASSERT_THAT_ERROR(CAS1->createProxy(std::nullopt, Content).moveInto(Blob), + Succeeded()); + IDs.push_back(Blob->getID()); + + // Check basic printing of IDs. + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + if (IDs.size() > 2) + EXPECT_NE(IDs.front().toString(), IDs.back().toString()); + } + + // Check that the blobs give the same IDs later. + for (int I = 0, E = IDs.size(); I != E; ++I) { + std::optional<ObjectProxy> Blob; + ASSERT_THAT_ERROR( + CAS1->createProxy(std::nullopt, ContentStrings[I]).moveInto(Blob), + Succeeded()); + EXPECT_EQ(IDs[I], Blob->getID()); + } + + // Run validation on all CASIDs. + for (int I = 0, E = IDs.size(); I != E; ++I) + ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded()); + + // Check that the blobs can be retrieved multiple times. + for (int I = 0, E = IDs.size(); I != E; ++I) { + for (int J = 0, JE = 3; J != JE; ++J) { + std::optional<ObjectProxy> Buffer; + ASSERT_THAT_ERROR(CAS1->getProxy(IDs[I]).moveInto(Buffer), Succeeded()); + EXPECT_EQ(ContentStrings[I], Buffer->getData()); + } + } + + // Confirm these blobs don't exist in a fresh CAS instance. + std::unique_ptr<ObjectStore> CAS2 = createObjectStore(); + for (int I = 0, E = IDs.size(); I != E; ++I) { + std::optional<ObjectProxy> Proxy; + EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Proxy), Failed()); + } + + // Insert into the second CAS and confirm the IDs are stable. Getting them + // should work now. 
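+  // (Note that insertion here runs in reverse of the original order.)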
+ for (int I = IDs.size(), E = 0; I != E; --I) { + auto &ID = IDs[I - 1]; + auto &Content = ContentStrings[I - 1]; + std::optional<ObjectProxy> Blob; + ASSERT_THAT_ERROR(CAS2->createProxy(std::nullopt, Content).moveInto(Blob), + Succeeded()); + EXPECT_EQ(ID, Blob->getID()); + + std::optional<ObjectProxy> Buffer; + ASSERT_THAT_ERROR(CAS2->getProxy(ID).moveInto(Buffer), Succeeded()); + EXPECT_EQ(Content, Buffer->getData()); + } +} + +TEST_P(CASTest, BlobsBig) { + // A little bit of validation that bigger blobs are okay. Climb up to 1MB. + std::unique_ptr<ObjectStore> CAS = createObjectStore(); + SmallString<256> String1 = StringRef("a few words"); + SmallString<256> String2 = StringRef("others"); + while (String1.size() < 1024U * 1024U) { + std::optional<CASID> ID1; + std::optional<CASID> ID2; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String1).moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String1).moveInto(ID2), + Succeeded()); + ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_EQ(ID1, ID2); + + String1.append(String2); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String2).moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String2).moveInto(ID2), + Succeeded()); + ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_EQ(ID1, ID2); + String2.append(String1); + } + + // Specifically check near 1MB for objects large enough they're likely to be + // stored externally in an on-disk CAS and will be near a page boundary. + SmallString<0> Storage; + const size_t InterestingSize = 1024U * 1024ULL; + const size_t SizeE = InterestingSize + 2; + if (Storage.size() < SizeE) + Storage.resize(SizeE, '\01'); + for (size_t Size = InterestingSize - 2; Size != SizeE; ++Size) { + StringRef Data(Storage.data(), Size); + std::optional<ObjectProxy> Blob; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, Data).moveInto(Blob), + Succeeded()); + ASSERT_EQ(Data, Blob->getData()); + ASSERT_EQ(0, Blob->getData().end()[0]); + } +} + +TEST_P(CASTest, LeafNodes) { + std::unique_ptr<ObjectStore> CAS1 = createObjectStore(); + StringRef ContentStrings[] = { + "word", + "some longer text std::string's local memory", + R"(multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text)", + }; + + SmallVector<ObjectRef> Nodes; + SmallVector<CASID> IDs; + for (StringRef Content : ContentStrings) { + // Use StringRef::str() to create a temporary std::string. This could cause + // problems if the CAS is storing references to the input string instead of + // copying it. + std::optional<ObjectRef> Node; + ASSERT_THAT_ERROR( + CAS1->store(std::nullopt, arrayRefFromStringRef<char>(Content)) + .moveInto(Node), + Succeeded()); + Nodes.push_back(*Node); + + // Check basic printing of IDs. 
+ IDs.push_back(CAS1->getID(*Node)); + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + EXPECT_EQ(Nodes.front(), Nodes.front()); + EXPECT_EQ(Nodes.back(), Nodes.back()); + EXPECT_EQ(IDs.front(), IDs.front()); + EXPECT_EQ(IDs.back(), IDs.back()); + if (Nodes.size() <= 1) + continue; + EXPECT_NE(Nodes.front(), Nodes.back()); + EXPECT_NE(IDs.front(), IDs.back()); + } + + // Check that the blobs give the same IDs later. + for (int I = 0, E = IDs.size(); I != E; ++I) { + std::optional<ObjectRef> Node; + ASSERT_THAT_ERROR(CAS1->store(std::nullopt, arrayRefFromStringRef<char>( + ContentStrings[I])) + .moveInto(Node), + Succeeded()); + EXPECT_EQ(IDs[I], CAS1->getID(*Node)); + } + + // Check that the blobs can be retrieved multiple times. + for (int I = 0, E = IDs.size(); I != E; ++I) { + for (int J = 0, JE = 3; J != JE; ++J) { + std::optional<ObjectProxy> Object; + ASSERT_THAT_ERROR(CAS1->getProxy(IDs[I]).moveInto(Object), Succeeded()); + ASSERT_TRUE(Object); + EXPECT_EQ(ContentStrings[I], Object->getData()); + } + } + + // Confirm these blobs don't exist in a fresh CAS instance. + std::unique_ptr<ObjectStore> CAS2 = createObjectStore(); + for (int I = 0, E = IDs.size(); I != E; ++I) { + std::optional<ObjectProxy> Object; + EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Object), Failed()); + } + + // Insert into the second CAS and confirm the IDs are stable. Getting them + // should work now. + for (int I = IDs.size(), E = 0; I != E; --I) { + auto &ID = IDs[I - 1]; + auto &Content = ContentStrings[I - 1]; + std::optional<ObjectRef> Node; + ASSERT_THAT_ERROR( + CAS2->store(std::nullopt, arrayRefFromStringRef<char>(Content)) + .moveInto(Node), + Succeeded()); + EXPECT_EQ(ID, CAS2->getID(*Node)); + + std::optional<ObjectProxy> Object; + ASSERT_THAT_ERROR(CAS2->getProxy(ID).moveInto(Object), Succeeded()); + ASSERT_TRUE(Object); + EXPECT_EQ(Content, Object->getData()); + } +} + +TEST_P(CASTest, NodesBig) { + std::unique_ptr<ObjectStore> CAS = createObjectStore(); + + // Specifically check near 1MB for objects large enough they're likely to be + // stored externally in an on-disk CAS, and such that one of them will be + // near a page boundary. + SmallString<0> Storage; + constexpr size_t InterestingSize = 1024U * 1024ULL; + constexpr size_t WordSize = sizeof(void *); + + // Start much smaller to account for headers. + constexpr size_t SizeB = InterestingSize - 8 * WordSize; + constexpr size_t SizeE = InterestingSize + 1; + if (Storage.size() < SizeE) + Storage.resize(SizeE, '\01'); + + SmallVector<ObjectRef, 4> CreatedNodes; + // Avoid checking every size because this is an expensive test. Just check + // for data that is 8B-word-aligned, and one less. Also appending the created + // nodes as the references in the next block to check references are created + // correctly. + for (size_t Size = SizeB; Size < SizeE; Size += WordSize) { + for (bool IsAligned : {false, true}) { + StringRef Data(Storage.data(), Size - (IsAligned ? 0 : 1)); + std::optional<ObjectProxy> Node; + ASSERT_THAT_ERROR(CAS->createProxy(CreatedNodes, Data).moveInto(Node), + Succeeded()); + ASSERT_EQ(Data, Node->getData()); + ASSERT_EQ(0, Node->getData().end()[0]); + ASSERT_EQ(Node->getNumReferences(), CreatedNodes.size()); + CreatedNodes.emplace_back(Node->getRef()); + } + } + + for (auto ID : CreatedNodes) + ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded()); +} + +/// Common test functionality for creating blobs in parallel. 
You can vary which +/// cas instances are the same or different, and the size of the created blobs. +static void testBlobsParallel(ObjectStore &Read1, ObjectStore &Read2, + ObjectStore &Write1, ObjectStore &Write2, + uint64_t BlobSize) { + SCOPED_TRACE(testBlobsParallel); + unsigned BlobCount = 100; + std::vector<std::string> Blobs; + Blobs.reserve(BlobCount); + for (unsigned I = 0; I < BlobCount; ++I) { + std::string Blob; + Blob.reserve(BlobSize); + while (Blob.size() < BlobSize) { + auto R = sys::Process::GetRandomNumber(); + Blob.append((char *)&R, sizeof(R)); + } + assert(Blob.size() >= BlobSize); + Blob.resize(BlobSize); + Blobs.push_back(std::move(Blob)); + } + + std::mutex NodesMtx; + std::vector<std::optional<CASID>> CreatedNodes(BlobCount); + + auto Producer = [&](unsigned I, ObjectStore *CAS) { + std::optional<ObjectProxy> Node; + EXPECT_THAT_ERROR(CAS->createProxy({}, Blobs[I]).moveInto(Node), + Succeeded()); + { + std::lock_guard<std::mutex> L(NodesMtx); + CreatedNodes[I] = Node ? Node->getID() : CASID::getDenseMapTombstoneKey(); + } + }; + + auto Consumer = [&](unsigned I, ObjectStore *CAS) { + std::optional<CASID> ID; + while (!ID) { + // Busy wait. + std::lock_guard<std::mutex> L(NodesMtx); + ID = CreatedNodes[I]; + } + if (ID == CASID::getDenseMapTombstoneKey()) + // Producer failed; already reported. + return; + + std::optional<ObjectProxy> Node; + ASSERT_THAT_ERROR(CAS->getProxy(*ID).moveInto(Node), Succeeded()); + EXPECT_EQ(Node->getData(), Blobs[I]); + }; + + DefaultThreadPool Threads; + for (unsigned I = 0; I < BlobCount; ++I) { + Threads.async(Consumer, I, &Read1); + Threads.async(Consumer, I, &Read2); + Threads.async(Producer, I, &Write1); + Threads.async(Producer, I, &Write2); + } + + Threads.wait(); +} + +static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) { + SCOPED_TRACE(testBlobsParallel1); + testBlobsParallel(CAS, CAS, CAS, CAS, BlobSize); +} + +TEST_P(CASTest, BlobsParallel) { + std::shared_ptr<ObjectStore> CAS = createObjectStore(); + uint64_t Size = 1ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); +} + +#ifdef EXPENSIVE_CHECKS +TEST_P(CASTest, BlobsBigParallel) { + std::shared_ptr<ObjectStore> CAS = createObjectStore(); + // 100k is large enough to be standalone files in our on-disk cas. + uint64_t Size = 100ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); +} +#endif diff --git a/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp b/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp new file mode 100644 index 0000000..b8e5bf6 --- /dev/null +++ b/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp @@ -0,0 +1,146 @@ +//===- OnDiskHashMappedTrieTest.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskHashMappedTrie.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +#if LLVM_ENABLE_ONDISK_CAS +using namespace llvm; +using namespace llvm::cas; + +namespace { + +TEST(OnDiskHashMappedTrieTest, Insertion) { + unittest::TempDir Temp("on-disk-hash-mapped-trie", /*Unique=*/true); + + // Create tries with various sizes of hash and with data. 
+ // + // NOTE: The check related to \a recoverFromFileOffset() catches a potential + // off-by-one bounds-checking bug when the trie record size (data + hash) add + // up to a multiple of 8B. Iterate through a few different hash sizes to + // check it both ways. + constexpr size_t MB = 1024u * 1024u; + constexpr size_t DataSize = 8; // Multiple of 8B. + for (size_t NumHashBytes : {1, 2, 4, 8}) { + size_t NumHashBits = NumHashBytes * 8; + + auto createTrie = [&]() { + return OnDiskHashMappedTrie::create( + Temp.path((Twine(NumHashBytes) + "B").str()), "index", + /*NumHashBits=*/NumHashBits, DataSize, /*MaxFileSize=*/MB, + /*NewInitialFileSize=*/std::nullopt); + }; + + std::optional<OnDiskHashMappedTrie> Trie1; + ASSERT_THAT_ERROR(createTrie().moveInto(Trie1), Succeeded()); + std::optional<OnDiskHashMappedTrie> Trie2; + ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded()); + + uint8_t Hash0Bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + uint8_t Hash1Bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0}; + auto Hash0 = ArrayRef(Hash0Bytes).take_front(NumHashBytes); + auto Hash1 = ArrayRef(Hash1Bytes).take_front(NumHashBytes); + constexpr StringLiteral Data0v1Bytes = "data0.v1"; + constexpr StringLiteral Data0v2Bytes = "data0.v2"; + constexpr StringLiteral Data1Bytes = "data1..."; + static_assert(Data0v1Bytes.size() == DataSize, "math error"); + static_assert(Data0v2Bytes.size() == DataSize, "math error"); + static_assert(Data1Bytes.size() == DataSize, "math error"); + ArrayRef<char> Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size()); + ArrayRef<char> Data0v2 = ArrayRef(Data0v2Bytes.data(), Data0v2Bytes.size()); + ArrayRef<char> Data1 = ArrayRef(Data1Bytes.data(), Data1Bytes.size()); + + // Lookup when trie is empty. + EXPECT_FALSE(Trie1->find(Hash0)); + + // Insert. + std::optional<FileOffset> Offset; + std::optional<MutableArrayRef<char>> Data; + { + auto Insertion = Trie1->insert({Hash0, Data0v1}); + ASSERT_TRUE(Insertion); + EXPECT_EQ(Hash0, Insertion->Hash); + EXPECT_EQ(Data0v1, Insertion->Data); + EXPECT_TRUE(isAddrAligned(Align(8), Insertion->Data.data())); + + Offset = Insertion.getOffset(); + Data = Insertion->Data; + } + + // Find. + { + auto Lookup = Trie1->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Find in a different instance of the same on-disk trie that existed + // before the insertion. + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Create a new instance and check that too. + Trie2.reset(); + ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded()); + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Change the data. + llvm::copy(Data0v2, Data->data()); + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v2, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Find different hash. + EXPECT_FALSE(Trie1->find(Hash1)); + EXPECT_FALSE(Trie2->find(Hash1)); + + // Recover from an offset. 
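+    // recoverFromFileOffset() should hand back the same record that was
+    // returned at insertion time.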
+ { + auto Recovered = Trie1->recoverFromFileOffset(*Offset); + ASSERT_TRUE(Recovered); + EXPECT_EQ(Offset->get(), Recovered.getOffset().get()); + EXPECT_EQ(Hash0, Recovered->Hash); + EXPECT_EQ(Data0v2, Recovered->Data); + } + + // Insert another thing. + { + auto Insertion = Trie1->insert({Hash1, Data1}); + ASSERT_TRUE(Insertion); + EXPECT_EQ(Hash1, Insertion->Hash); + EXPECT_EQ(Data1, Insertion->Data); + EXPECT_TRUE(isAddrAligned(Align(8), Insertion->Data.data())); + EXPECT_NE(Offset->get(), Insertion.getOffset().get()); + } + } +} + +} // namespace + +#endif diff --git a/llvm/unittests/CAS/ProgramTest.cpp b/llvm/unittests/CAS/ProgramTest.cpp new file mode 100644 index 0000000..95cc359 --- /dev/null +++ b/llvm/unittests/CAS/ProgramTest.cpp @@ -0,0 +1,151 @@ +//===- MappedFileRegionBumpPtrTest.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Program.h" +#include "llvm/CAS/MappedFileRegionBumpPtr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/ThreadPool.h" +#include "gtest/gtest.h" +#if defined(__APPLE__) +#include <crt_externs.h> +#elif !defined(_MSC_VER) +// Forward declare environ in case it's not provided by stdlib.h. +extern char **environ; +#endif + +using namespace llvm; +using namespace llvm::cas; + +extern const char *TestMainArgv0; +static char ProgramID = 0; + +class CASProgramTest : public testing::Test { + std::vector<StringRef> EnvTable; + std::vector<std::string> EnvStorage; + +protected: + void SetUp() override { + auto EnvP = [] { +#if defined(_WIN32) + _wgetenv(L"TMP"); // Populate _wenviron, initially is null + return _wenviron; +#elif defined(__APPLE__) + return *_NSGetEnviron(); +#else + return environ; +#endif + }(); + ASSERT_TRUE(EnvP); + + auto prepareEnvVar = [this](decltype(*EnvP) Var) -> StringRef { +#if defined(_WIN32) + // On Windows convert UTF16 encoded variable to UTF8 + auto Len = wcslen(Var); + ArrayRef<char> Ref{reinterpret_cast<char const *>(Var), + Len * sizeof(*Var)}; + EnvStorage.emplace_back(); + auto convStatus = convertUTF16ToUTF8String(Ref, EnvStorage.back()); + EXPECT_TRUE(convStatus); + return EnvStorage.back(); +#else + (void)this; + return StringRef(Var); +#endif + }; + + while (*EnvP != nullptr) { + auto S = prepareEnvVar(*EnvP); + if (!StringRef(S).starts_with("GTEST_")) + EnvTable.emplace_back(S); + ++EnvP; + } + } + + void TearDown() override { + EnvTable.clear(); + EnvStorage.clear(); + } + + void addEnvVar(StringRef Var) { EnvTable.emplace_back(Var); } + + ArrayRef<StringRef> getEnviron() const { return EnvTable; } +}; + +#if LLVM_ENABLE_ONDISK_CAS + +TEST_F(CASProgramTest, MappedFileRegionBumpPtrTest) { + auto TestAllocator = [](StringRef Path) { + auto NewFileConstructor = [&](MappedFileRegionBumpPtr &Alloc) -> Error { + Alloc.initializeBumpPtr(0); + return Error::success(); + }; + + auto Alloc = MappedFileRegionBumpPtr::create( + Path, /*Capacity=*/10 * 1024 * 1024, + /*BumpPtrOffset=*/0, NewFileConstructor); + if (!Alloc) + ASSERT_TRUE(false); + + std::vector<unsigned *> AllocatedPtr; + AllocatedPtr.resize(100); + DefaultThreadPool Threads; + for (unsigned I = 0; I < 100; ++I) { + Threads.async( + [&](unsigned Idx) { + // Allocate a buffer that is larger than needed so 
+ // additional pages for test coverage.
+ unsigned *P = (unsigned *)Alloc->allocate(100);
+ *P = Idx;
+ AllocatedPtr[Idx] = P;
+ },
+ I);
+ }
+
+ Threads.wait();
+ for (unsigned I = 0; I < 100; ++I)
+ EXPECT_EQ(*AllocatedPtr[I], I);
+ };
+
+ if (const char *File = getenv("LLVM_CAS_TEST_MAPPED_FILE_REGION")) {
+ TestAllocator(File);
+ exit(0);
+ }
+
+ SmallString<128> FilePath;
+ sys::fs::createUniqueDirectory("MappedFileRegionBumpPtr", FilePath);
+ sys::path::append(FilePath, "allocation-file");
+
+ std::string Executable =
+ sys::fs::getMainExecutable(TestMainArgv0, &ProgramID);
+ StringRef Argv[] = {
+ Executable, "--gtest_filter=CASProgramTest.MappedFileRegionBumpPtrTest"};
+
+ // Add LLVM_CAS_TEST_MAPPED_FILE_REGION to the environment of the child.
+ std::string EnvVar = "LLVM_CAS_TEST_MAPPED_FILE_REGION=";
+ EnvVar += FilePath.str();
+ addEnvVar(EnvVar);
+
+ std::string Error;
+ bool ExecutionFailed;
+ sys::ProcessInfo PI = sys::ExecuteNoWait(Executable, Argv, getEnviron(), {},
+ 0, &Error, &ExecutionFailed);
+ TestAllocator(FilePath);
+
+ ASSERT_FALSE(ExecutionFailed) << Error;
+ ASSERT_TRUE(Error.empty());
+ ASSERT_NE(PI.Pid, sys::ProcessInfo::InvalidPid) << "Invalid process id";
+ llvm::sys::Wait(PI, /*SecondsToWait=*/5, &Error);
+ ASSERT_TRUE(Error.empty());
+
+ // Clean up after both processes finish testing.
+ sys::fs::remove(FilePath);
+ sys::fs::remove_directories(sys::path::parent_path(FilePath));
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt
index 8892f3e..5ebdc3b 100644
--- a/llvm/unittests/CMakeLists.txt
+++ b/llvm/unittests/CMakeLists.txt
@@ -34,6 +34,7 @@ add_subdirectory(AsmParser)
 add_subdirectory(BinaryFormat)
 add_subdirectory(Bitcode)
 add_subdirectory(Bitstream)
+add_subdirectory(CAS)
 add_subdirectory(CGData)
 add_subdirectory(CodeGen)
 add_subdirectory(DebugInfo)
diff --git a/llvm/unittests/Support/ProgramTest.cpp b/llvm/unittests/Support/ProgramTest.cpp
index b1b35ea..93db38b 100644
--- a/llvm/unittests/Support/ProgramTest.cpp
+++ b/llvm/unittests/Support/ProgramTest.cpp
@@ -10,6 +10,7 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/ExponentialBackoff.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Signals.h"
@@ -561,6 +562,81 @@ TEST_F(ProgramEnvTest, TestLockFile) {
 sys::fs::remove(LockedFile);
 }
+TEST_F(ProgramEnvTest, TestLockFileExclusive) {
+ using namespace llvm::sys;
+ using namespace std::chrono_literals;
+
+ if (const char *LockedFile = getenv("LLVM_PROGRAM_TEST_LOCKED_FILE")) {
+ // Child process.
+ int FD2;
+ ASSERT_NO_ERROR(fs::openFileForReadWrite(LockedFile, FD2,
+ fs::CD_OpenExisting, fs::OF_None));
+
+ std::error_code ErrC =
+ fs::tryLockFile(FD2, std::chrono::seconds(0), /*Exclusive=*/true);
+ EXPECT_TRUE(ErrC);
+ close(FD2);
+ // Write a file to indicate the child has finished.
+ std::string FinishFile = std::string(LockedFile) + "-finished";
+ int FD3;
+ ASSERT_NO_ERROR(fs::openFileForReadWrite(FinishFile, FD3, fs::CD_CreateNew,
+ fs::OF_None));
+ close(FD3);
+ exit(0);
+ }
+
+ // Create the file that will be locked.
+ SmallString<64> LockedFile;
+ int FD1;
+ ASSERT_NO_ERROR(
+ fs::createUniqueDirectory("TestLockFileExclusive", LockedFile));
+ sys::path::append(LockedFile, "file");
+ ASSERT_NO_ERROR(
+ fs::openFileForReadWrite(LockedFile, FD1, fs::CD_CreateNew, fs::OF_None));
+
+ std::string Executable =
+ sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1);
+ StringRef argv[] = {Executable,
+ "--gtest_filter=ProgramEnvTest.TestLockFileExclusive"};
+
+ // Add LLVM_PROGRAM_TEST_LOCKED_FILE to the environment of the child.
+ std::string EnvVar = "LLVM_PROGRAM_TEST_LOCKED_FILE=";
+ EnvVar += LockedFile.str();
+ addEnvVar(EnvVar);
+
+ // Lock the file.
+ ASSERT_NO_ERROR(fs::tryLockFile(FD1));
+
+ std::string Error;
+ bool ExecutionFailed;
+ ProcessInfo PI2 = ExecuteNoWait(Executable, argv, getEnviron(), {}, 0, &Error,
+ &ExecutionFailed);
+ ASSERT_FALSE(ExecutionFailed) << Error;
+ ASSERT_TRUE(Error.empty());
+ ASSERT_NE(PI2.Pid, ProcessInfo::InvalidPid) << "Invalid process id";
+
+ std::string FinishFile = std::string(LockedFile) + "-finished";
+ // Wait until the child process writes the file indicating it has finished.
+ bool Finished = false;
+ ExponentialBackoff Backoff(5s); // 5s timeout.
+ do {
+ if (fs::exists(FinishFile)) {
+ Finished = true;
+ break;
+ }
+ } while (Backoff.waitForNextAttempt());
+
+ ASSERT_TRUE(Finished);
+ ASSERT_NO_ERROR(fs::unlockFile(FD1));
+ ProcessInfo WaitResult = llvm::sys::Wait(PI2, /*SecondsToWait=*/1, &Error);
+ ASSERT_TRUE(Error.empty());
+ ASSERT_EQ(0, WaitResult.ReturnCode);
+ ASSERT_EQ(WaitResult.Pid, PI2.Pid);
+ sys::fs::remove(LockedFile);
+ sys::fs::remove(FinishFile);
+ sys::fs::remove_directories(sys::path::parent_path(LockedFile));
+}
+
 TEST_F(ProgramEnvTest, TestExecuteWithNoStacktraceHandler) {
 using namespace llvm::sys;
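Both new tests follow the same multi-process pattern: the parent re-executes the unit-test binary with a `--gtest_filter` naming the same test plus a private environment variable, and the child takes the early-return branch keyed on that variable before signalling completion. Below is a minimal sketch of that pattern using only `llvm::sys` calls that appear in the patch above; the test name, environment-variable name, and file path are hypothetical placeholders, not part of the patch.

```
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Program.h"
#include "gtest/gtest.h"
#include <cstdlib>

using namespace llvm;

extern const char *TestMainArgv0; // Defined by the unit-test main().
static char ID = 0;               // Address anchor for getMainExecutable().

TEST(MultiProcessSketch, ParentChildRoundTrip) {
  // Child: the parent re-launches this same test with the variable set.
  if (const char *Path = getenv("SKETCH_TEST_CHILD_FILE")) {
    // ... perform the child's half of the test against Path ...
    exit(0);
  }

  // Parent: re-execute this binary, filtered down to this single test.
  std::string Executable = sys::fs::getMainExecutable(TestMainArgv0, &ID);
  StringRef Argv[] = {Executable,
                      "--gtest_filter=MultiProcessSketch.ParentChildRoundTrip"};
  // The real tests forward the sanitized parent environment; a single
  // variable is enough to show the handshake here.
  StringRef Env[] = {"SKETCH_TEST_CHILD_FILE=/tmp/sketch-file"};

  std::string Error;
  bool ExecutionFailed;
  sys::ProcessInfo PI = sys::ExecuteNoWait(Executable, Argv, Env, {}, 0, &Error,
                                           &ExecutionFailed);
  ASSERT_FALSE(ExecutionFailed) << Error;

  // ... perform the parent's half of the test concurrently ...

  sys::Wait(PI, /*SecondsToWait=*/5, &Error);
  ASSERT_TRUE(Error.empty());
}
```

The `-finished` sentinel file polled with `ExponentialBackoff` in `TestLockFileExclusive` follows the same idea: the parent cannot release the lock until it knows the child has already observed the lock as contended.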