aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSteven Wu <stevenwu@apple.com>2024-10-29 10:37:31 -0700
committerSteven Wu <stevenwu@apple.com>2024-10-29 10:37:31 -0700
commit4f0c6be7aa773c5ec54ca3dcd1d201bfd397c546 (patch)
tree5d27a4360e82a6655f1684ac57456250b3c856f1
parentb510cdb895b9188e5819c4c85a6dab22a4d14385 (diff)
parent424bd23894010780c0477da4324fd4e97194339f (diff)
downloadllvm-users/cachemeifyoucan/spr/cas-add-ondiskgraphdb-and-ondiskkeyvaluedb.zip
llvm-users/cachemeifyoucan/spr/cas-add-ondiskgraphdb-and-ondiskkeyvaluedb.tar.gz
llvm-users/cachemeifyoucan/spr/cas-add-ondiskgraphdb-and-ondiskkeyvaluedb.tar.bz2
Created using spr 1.3.5
-rw-r--r--llvm/CMakeLists.txt7
-rw-r--r--llvm/docs/ContentAddressableStorage.md120
-rw-r--r--llvm/docs/Reference.rst4
-rw-r--r--llvm/include/llvm/CAS/ActionCache.h94
-rw-r--r--llvm/include/llvm/CAS/BuiltinCASContext.h88
-rw-r--r--llvm/include/llvm/CAS/BuiltinObjectHasher.h81
-rw-r--r--llvm/include/llvm/CAS/CASID.h156
-rw-r--r--llvm/include/llvm/CAS/CASReference.h207
-rw-r--r--llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h110
-rw-r--r--llvm/include/llvm/CAS/ObjectStore.h302
-rw-r--r--llvm/include/llvm/CAS/OnDiskGraphDB.h428
-rw-r--r--llvm/include/llvm/CAS/OnDiskHashMappedTrie.h375
-rw-r--r--llvm/include/llvm/CAS/OnDiskKeyValueDB.h63
-rw-r--r--llvm/include/llvm/Support/FileSystem.h8
-rw-r--r--llvm/include/module.modulemap6
-rw-r--r--llvm/lib/CAS/ActionCache.cpp22
-rw-r--r--llvm/lib/CAS/ActionCaches.cpp99
-rw-r--r--llvm/lib/CAS/BuiltinCAS.cpp94
-rw-r--r--llvm/lib/CAS/BuiltinCAS.h74
-rw-r--r--llvm/lib/CAS/CMakeLists.txt19
-rw-r--r--llvm/lib/CAS/InMemoryCAS.cpp320
-rw-r--r--llvm/lib/CAS/MappedFileRegionBumpPtr.cpp233
-rw-r--r--llvm/lib/CAS/ObjectStore.cpp168
-rw-r--r--llvm/lib/CAS/OnDiskCommon.cpp108
-rw-r--r--llvm/lib/CAS/OnDiskCommon.h48
-rw-r--r--llvm/lib/CAS/OnDiskGraphDB.cpp1548
-rw-r--r--llvm/lib/CAS/OnDiskHashMappedTrie.cpp1352
-rw-r--r--llvm/lib/CAS/OnDiskKeyValueDB.cpp79
-rw-r--r--llvm/lib/CMakeLists.txt1
-rw-r--r--llvm/lib/Support/Unix/Path.inc11
-rw-r--r--llvm/lib/Support/Windows/Path.inc12
-rw-r--r--llvm/unittests/CAS/ActionCacheTest.cpp73
-rw-r--r--llvm/unittests/CAS/CASTestConfig.cpp23
-rw-r--r--llvm/unittests/CAS/CASTestConfig.h38
-rw-r--r--llvm/unittests/CAS/CMakeLists.txt21
-rw-r--r--llvm/unittests/CAS/ObjectStoreTest.cpp360
-rw-r--r--llvm/unittests/CAS/OnDiskCommonUtils.h69
-rw-r--r--llvm/unittests/CAS/OnDiskGraphDBTest.cpp284
-rw-r--r--llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp146
-rw-r--r--llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp54
-rw-r--r--llvm/unittests/CAS/ProgramTest.cpp151
-rw-r--r--llvm/unittests/CMakeLists.txt1
-rw-r--r--llvm/unittests/Support/ProgramTest.cpp76
43 files changed, 7523 insertions, 10 deletions
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index cde4a99..3ac4dde 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -837,6 +837,13 @@ option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF)
option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON)
option (LLVM_ENABLE_BINDINGS "Build bindings." ON)
+if(UNIX AND CMAKE_SIZEOF_VOID_P GREATER_EQUAL 8)
+ set(LLVM_ENABLE_ONDISK_CAS_default ON)
+else()
+ set(LLVM_ENABLE_ONDISK_CAS_default OFF)
+endif()
+option(LLVM_ENABLE_ONDISK_CAS "Build OnDiskCAS." ${LLVM_ENABLE_ONDISK_CAS_default})
+
set(LLVM_INSTALL_DOXYGEN_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/doxygen-html"
CACHE STRING "Doxygen-generated HTML documentation install directory")
set(LLVM_INSTALL_OCAMLDOC_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/ocaml-html"
diff --git a/llvm/docs/ContentAddressableStorage.md b/llvm/docs/ContentAddressableStorage.md
new file mode 100644
index 0000000..4f2d9a6
--- /dev/null
+++ b/llvm/docs/ContentAddressableStorage.md
@@ -0,0 +1,120 @@
+# Content Addressable Storage
+
+## Introduction to CAS
+
+Content Addressable Storage, or `CAS`, is a storage system where it assigns
+unique addresses to the data stored. It is very useful for data deduplication
+and creating unique identifiers.
+
+Unlike other kinds of storage systems like file systems, CAS is immutable. It
+is more reliable to model a computation when representing the inputs and outputs
+of the computation using objects stored in CAS.
+
+The basic unit of the CAS library is a CASObject, where it contains:
+
+* Data: arbitrary data
+* References: references to other CASObject
+
+It can be conceptually modeled as something like:
+
+```
+struct CASObject {
+ ArrayRef<char> Data;
+ ArrayRef<CASObject*> Refs;
+}
+```
+
+Such abstraction can allow simple composition of CASObjects into a DAG to
+represent complicated data structures while still allowing data deduplication.
+Note you can compare two DAGs by just comparing the CASObject hash of two
+root nodes.
+
+
+
+## LLVM CAS Library User Guide
+
+The CAS-like storage provided in LLVM is `llvm::cas::ObjectStore`.
+To reference a CASObject, there are a few different abstractions provided
+with different trade-offs:
+
+### ObjectRef
+
+`ObjectRef` is a lightweight reference to a CASObject stored in the CAS.
+This is the most commonly used abstraction and it is cheap to copy/pass
+along. It has following properties:
+
+* `ObjectRef` is only meaningful within the `ObjectStore` that created the ref.
+`ObjectRef` created by different `ObjectStore` cannot be cross-referenced or
+compared.
+* `ObjectRef` doesn't guarantee the existence of the CASObject it points to. An
+explicit load is required before accessing the data stored in CASObject.
+This load can also fail, for reasons like but not limited to: object does
+not exist, corrupted CAS storage, operation timeout, etc.
+* If two `ObjectRef` are equal, it is guaranteed that the objects they point
+to (if they exist) are identical. If they are not equal, the underlying
+objects are guaranteed not to be the same.
+
+### ObjectProxy
+
+`ObjectProxy` represents a loaded CASObject. With an `ObjectProxy`, the
+underlying stored data and references can be accessed without the need
+of error handling. The class APIs also provide convenient methods to
+access underlying data. The lifetime of the underlying data is equal to
+the lifetime of the instance of `ObjectStore` unless explicitly copied.
+
+### CASID
+
+`CASID` is the hash identifier for CASObjects. It owns the underlying
+storage for hash value so it can be expensive to copy and compare depending
+on the hash algorithm. `CASID` is generally only useful in rare situations
+like printing raw hash value or exchanging hash values between different
+CAS instances with the same hashing schema.
+
+### ObjectStore
+
+`ObjectStore` is the CAS-like object storage. It provides API to save
+and load CASObjects, for example:
+
+```
+ObjectRef A, B, C;
+Expected<ObjectRef> Stored = ObjectStore.store("data", {A, B});
+Expected<ObjectProxy> Loaded = ObjectStore.getProxy(C);
+```
+
+It also provides APIs to convert between `ObjectRef`, `ObjectProxy` and
+`CASID`.
+
+
+
+## CAS Library Implementation Guide
+
+The LLVM ObjectStore APIs are designed so that it is easy to add
+customized CAS implementations that are interchangeable with builtin
+CAS implementations.
+
+To add your own implementation, you just need to add a subclass to
+`llvm::cas::ObjectStore` and implement all its pure virtual methods.
+To be interchangeable with LLVM ObjectStore, the new CAS implementation
+needs to conform to following contracts:
+
+* Different CASObject stored in the ObjectStore needs to have a different hash
+and result in a different `ObjectRef`. Vice versa, same CASObject should have
+same hash and same `ObjectRef`. Note two different CASObjects with identical
+data but different references are considered different objects.
+* `ObjectRef`s are comparable within the same `ObjectStore` instance, and can
+be used to determine the equality of the underlying CASObjects.
+* The loaded objects from the ObjectStore need to have a lifetime that is at
+least as long as the ObjectStore itself.
+
+If not specified, the behavior can be implementation defined. For example,
+`ObjectRef` can be used to point to a loaded CASObject so
+`ObjectStore` never fails to load. It is also legal to use a stricter model
+than required. For example, an `ObjectRef` that can be used to compare
+objects between different `ObjectStore` instances is legal but user
+of the ObjectStore should not depend on this behavior.
+
+For CAS library implementer, there is also a `ObjectHandle` class that
+is an internal representation of a loaded CASObject reference.
+`ObjectProxy` is just a pair of `ObjectHandle` and `ObjectStore`, because
+just like `ObjectRef`, `ObjectHandle` is only useful when paired with
+the ObjectStore that knows about the loaded CASObject.
diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst
index df61628..ae03a3a 100644
--- a/llvm/docs/Reference.rst
+++ b/llvm/docs/Reference.rst
@@ -15,6 +15,7 @@ LLVM and API reference documentation.
BranchWeightMetadata
Bugpoint
CommandGuide/index
+ ContentAddressableStorage
ConvergenceAndUniformity
ConvergentOperations
Coroutines
@@ -232,3 +233,6 @@ Additional Topics
:doc:`ConvergenceAndUniformity`
A description of uniformity analysis in the presence of irreducible
control flow, and its implementation.
+
+:doc:`ContentAddressableStorage`
+ A reference guide for using LLVM's CAS library.
diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h
new file mode 100644
index 0000000..e6dfdd4
--- /dev/null
+++ b/llvm/include/llvm/CAS/ActionCache.h
@@ -0,0 +1,94 @@
+//===- llvm/CAS/ActionCache.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_CASACTIONCACHE_H
+#define LLVM_CAS_CASACTIONCACHE_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/CASID.h"
+#include "llvm/CAS/CASReference.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas {
+
+class ObjectStore;
+class CASID;
+class ObjectProxy;
+
+/// A key for caching an operation.
+/// It is implemented as a bag of bytes and provides a convenient constructor
+/// for CAS types.
+class CacheKey {
+public:
+ StringRef getKey() const { return Key; }
+
+ // TODO: Support CacheKey other than a CASID but rather any array of bytes.
+ // To do that, ActionCache needs to be able to rehash the key into the index,
+ // which then `getOrCompute` method can be used to avoid multiple calls to
+ // hash function.
+ CacheKey(const CASID &ID);
+ CacheKey(const ObjectProxy &Proxy);
+ CacheKey(const ObjectStore &CAS, const ObjectRef &Ref);
+
+private:
+ std::string Key;
+};
+
+/// A cache from a key describing an action to the result of doing it.
+///
+/// Actions are expected to be pure (collision is an error).
+class ActionCache {
+ virtual void anchor();
+
+public:
+ /// Get a previously computed result for \p ActionKey.
+ ///
+ /// \param Globally if true it is a hint to the underlying implementation that
+ /// the lookup is profitable to be done on a distributed caching level, not
+ /// just locally. The implementation is free to ignore this flag.
+ Expected<std::optional<CASID>> get(const CacheKey &ActionKey,
+ bool Globally = false) const {
+ return getImpl(arrayRefFromStringRef(ActionKey.getKey()), Globally);
+ }
+
+ /// Cache \p Result for the \p ActionKey computation.
+ ///
+ /// \param Globally if true it is a hint to the underlying implementation that
+ /// the association is profitable to be done on a distributed caching level,
+ /// not just locally. The implementation is free to ignore this flag.
+ Error put(const CacheKey &ActionKey, const CASID &Result,
+ bool Globally = false) {
+ assert(Result.getContext().getHashSchemaIdentifier() ==
+ getContext().getHashSchemaIdentifier() &&
+ "Hash schema mismatch");
+ return putImpl(arrayRefFromStringRef(ActionKey.getKey()), Result, Globally);
+ }
+
+ virtual ~ActionCache() = default;
+
+protected:
+ virtual Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ResolvedKey,
+ bool Globally) const = 0;
+
+ virtual Error putImpl(ArrayRef<uint8_t> ResolvedKey, const CASID &Result,
+ bool Globally) = 0;
+
+ ActionCache(const CASContext &Context) : Context(Context) {}
+
+ const CASContext &getContext() const { return Context; }
+
+private:
+ const CASContext &Context;
+};
+
+/// Create an action cache in memory.
+std::unique_ptr<ActionCache> createInMemoryActionCache();
+
+} // end namespace llvm::cas
+
+#endif // LLVM_CAS_CASACTIONCACHE_H
diff --git a/llvm/include/llvm/CAS/BuiltinCASContext.h b/llvm/include/llvm/CAS/BuiltinCASContext.h
new file mode 100644
index 0000000..ebc4ca8
--- /dev/null
+++ b/llvm/include/llvm/CAS/BuiltinCASContext.h
@@ -0,0 +1,88 @@
+//===- BuiltinCASContext.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_BUILTINCASCONTEXT_H
+#define LLVM_CAS_BUILTINCASCONTEXT_H
+
+#include "llvm/CAS/CASID.h"
+#include "llvm/Support/BLAKE3.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas::builtin {
+
+/// Current hash type for the builtin CAS.
+///
+/// FIXME: This should be configurable via an enum to allow configuring the hash
+/// function. The enum should be sent into \a createInMemoryCAS() and \a
+/// createOnDiskCAS().
+///
+/// This is important (at least) for future-proofing, when we want to make new
+/// CAS instances use BLAKE7, but still know how to read/write BLAKE3.
+///
+/// Even just for BLAKE3, it would be useful to have these values:
+///
+/// BLAKE3 => 32B hash from BLAKE3
+/// BLAKE3_16B => 16B hash from BLAKE3 (truncated)
+///
+/// ... where BLAKE3_16 uses \a TruncatedBLAKE3<16>.
+///
+/// Motivation for a truncated hash is that it's cheaper to store. It's not
+/// clear if we always (or ever) need the full 32B, and for an ephemeral
+/// in-memory CAS, we almost certainly don't need it.
+///
+/// Note that the cost is linear in the number of objects for the builtin CAS,
+/// since we're using internal offsets and/or pointers as an optimization.
+///
+/// However, it's possible we'll want to hook up a local builtin CAS to, e.g.,
+/// a distributed generic hash map to use as an ActionCache. In that scenario,
+/// the transitive closure of the structured objects that are the results of
+/// the cached actions would need to be serialized into the map, something
+/// like:
+///
+/// "action:<schema>:<key>" -> "0123"
+/// "object:<schema>:0123" -> "3,4567,89AB,CDEF,9,some data"
+/// "object:<schema>:4567" -> ...
+/// "object:<schema>:89AB" -> ...
+/// "object:<schema>:CDEF" -> ...
+///
+/// These references would be full cost.
+using HasherT = BLAKE3;
+using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+
+class BuiltinCASContext : public CASContext {
+ void printIDImpl(raw_ostream &OS, const CASID &ID) const final;
+ void anchor() override;
+
+public:
+ /// Get the name of the hash for any table identifiers.
+ ///
+ /// FIXME: This should be configurable via an enum, with at the following
+ /// values:
+ ///
+ /// "BLAKE3" => 32B hash from BLAKE3
+ /// "BLAKE3.16" => 16B hash from BLAKE3 (truncated)
+ ///
+ /// Enum can be sent into \a createInMemoryCAS() and \a createOnDiskCAS().
+ static StringRef getHashName() { return "BLAKE3"; }
+ StringRef getHashSchemaIdentifier() const final {
+ static const std::string ID =
+ ("llvm.cas.builtin.v2[" + getHashName() + "]").str();
+ return ID;
+ }
+
+ static const BuiltinCASContext &getDefaultContext();
+
+ BuiltinCASContext() = default;
+
+ static Expected<HashType> parseID(StringRef PrintedDigest);
+ static void printID(ArrayRef<uint8_t> Digest, raw_ostream &OS);
+};
+
+} // namespace llvm::cas::builtin
+
+#endif // LLVM_CAS_BUILTINCASCONTEXT_H
diff --git a/llvm/include/llvm/CAS/BuiltinObjectHasher.h b/llvm/include/llvm/CAS/BuiltinObjectHasher.h
new file mode 100644
index 0000000..22e556c
--- /dev/null
+++ b/llvm/include/llvm/CAS/BuiltinObjectHasher.h
@@ -0,0 +1,81 @@
+//===- BuiltinObjectHasher.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_BUILTINOBJECTHASHER_H
+#define LLVM_CAS_BUILTINOBJECTHASHER_H
+
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Endian.h"
+
+namespace llvm::cas {
+
+template <class HasherT> class BuiltinObjectHasher {
+public:
+ using HashT = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+
+ static HashT hashObject(const ObjectStore &CAS, ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ BuiltinObjectHasher H;
+ H.updateSize(Refs.size());
+ for (const ObjectRef &Ref : Refs)
+ H.updateRef(CAS, Ref);
+ H.updateArray(Data);
+ return H.finish();
+ }
+
+ static HashT hashObject(ArrayRef<ArrayRef<uint8_t>> Refs,
+ ArrayRef<char> Data) {
+ BuiltinObjectHasher H;
+ H.updateSize(Refs.size());
+ for (const ArrayRef<uint8_t> &Ref : Refs)
+ H.updateID(Ref);
+ H.updateArray(Data);
+ return H.finish();
+ }
+
+private:
+ HashT finish() { return Hasher.final(); }
+
+ void updateRef(const ObjectStore &CAS, ObjectRef Ref) {
+ updateID(CAS.getID(Ref));
+ }
+
+ void updateID(const CASID &ID) { updateID(ID.getHash()); }
+
+ void updateID(ArrayRef<uint8_t> Hash) {
+ // NOTE: Does not hash the size of the hash. That's a CAS implementation
+ // detail that shouldn't leak into the UUID for an object.
+ assert(Hash.size() == sizeof(HashT) &&
+ "Expected object ref to match the hash size");
+ Hasher.update(Hash);
+ }
+
+ void updateArray(ArrayRef<uint8_t> Bytes) {
+ updateSize(Bytes.size());
+ Hasher.update(Bytes);
+ }
+
+ void updateArray(ArrayRef<char> Bytes) {
+ updateArray(ArrayRef(reinterpret_cast<const uint8_t *>(Bytes.data()),
+ Bytes.size()));
+ }
+
+ void updateSize(uint64_t Size) {
+ Size = support::endian::byte_swap(Size, endianness::little);
+ Hasher.update(
+ ArrayRef(reinterpret_cast<const uint8_t *>(&Size), sizeof(Size)));
+ }
+
+ BuiltinObjectHasher() = default;
+ ~BuiltinObjectHasher() = default;
+ HasherT Hasher;
+};
+
+} // namespace llvm::cas
+
+#endif // LLVM_CAS_BUILTINOBJECTHASHER_H
diff --git a/llvm/include/llvm/CAS/CASID.h b/llvm/include/llvm/CAS/CASID.h
new file mode 100644
index 0000000..5f9110a
--- /dev/null
+++ b/llvm/include/llvm/CAS/CASID.h
@@ -0,0 +1,156 @@
+//===- llvm/CAS/CASID.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_CASID_H
+#define LLVM_CAS_CASID_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace cas {
+
+class CASID;
+
+/// Context for CAS identifiers.
+class CASContext {
+ virtual void anchor();
+
+public:
+ virtual ~CASContext() = default;
+
+ /// Get an identifier for the schema used by this CAS context. Two CAS
+ /// instances should return \c true for this identifier if and only if their
+ /// CASIDs are safe to compare by hash. This is used by \a
+ /// CASID::equalsImpl().
+ virtual StringRef getHashSchemaIdentifier() const = 0;
+
+protected:
+ /// Print \p ID to \p OS.
+ virtual void printIDImpl(raw_ostream &OS, const CASID &ID) const = 0;
+
+ friend class CASID;
+};
+
+/// Unique identifier for a CAS object.
+///
+/// Locally, stores an internal CAS identifier that's specific to a single CAS
+/// instance. It's guaranteed not to change across the view of that CAS, but
+/// might change between runs.
+///
+/// It also has \a CASIDContext pointer to allow comparison of these
+/// identifiers. If two CASIDs are from the same CASIDContext, they can be
+/// compared directly. If they are, then \a
+/// CASIDContext::getHashSchemaIdentifier() is compared to see if they can be
+/// compared by hash, in which case the result of \a getHash() is compared.
+class CASID {
+public:
+ void dump() const;
+ void print(raw_ostream &OS) const {
+ return getContext().printIDImpl(OS, *this);
+ }
+ friend raw_ostream &operator<<(raw_ostream &OS, const CASID &ID) {
+ ID.print(OS);
+ return OS;
+ }
+ std::string toString() const;
+
+ ArrayRef<uint8_t> getHash() const {
+ return arrayRefFromStringRef<uint8_t>(Hash);
+ }
+
+ friend bool operator==(const CASID &LHS, const CASID &RHS) {
+ if (LHS.Context == RHS.Context)
+ return LHS.Hash == RHS.Hash;
+
+ // EmptyKey or TombstoneKey.
+ if (!LHS.Context || !RHS.Context)
+ return false;
+
+ // CASIDs are equal when they have the same hash schema and same hash value.
+ return LHS.Context->getHashSchemaIdentifier() ==
+ RHS.Context->getHashSchemaIdentifier() &&
+ LHS.Hash == RHS.Hash;
+ }
+
+ friend bool operator!=(const CASID &LHS, const CASID &RHS) {
+ return !(LHS == RHS);
+ }
+
+ friend hash_code hash_value(const CASID &ID) {
+ ArrayRef<uint8_t> Hash = ID.getHash();
+ return hash_combine_range(Hash.begin(), Hash.end());
+ }
+
+ const CASContext &getContext() const {
+ assert(Context && "Tombstone or empty key for DenseMap?");
+ return *Context;
+ }
+
+ static CASID getDenseMapEmptyKey() {
+ return CASID(nullptr, DenseMapInfo<StringRef>::getEmptyKey());
+ }
+ static CASID getDenseMapTombstoneKey() {
+ return CASID(nullptr, DenseMapInfo<StringRef>::getTombstoneKey());
+ }
+
+ CASID() = delete;
+
+ static CASID create(const CASContext *Context, StringRef Hash) {
+ return CASID(Context, Hash);
+ }
+
+private:
+ CASID(const CASContext *Context, StringRef Hash)
+ : Context(Context), Hash(Hash) {}
+
+ const CASContext *Context;
+ SmallString<32> Hash;
+};
+
+/// This is used to workaround the issue of MSVC needing default-constructible
+/// types for \c std::promise/future.
+template <typename T> struct AsyncValue {
+ Expected<std::optional<T>> take() { return std::move(Value); }
+
+ AsyncValue() : Value(std::nullopt) {}
+ AsyncValue(Error &&E) : Value(std::move(E)) {}
+ AsyncValue(T &&V) : Value(std::move(V)) {}
+ AsyncValue(std::nullopt_t) : Value(std::nullopt) {}
+ AsyncValue(Expected<std::optional<T>> &&Obj) : Value(std::move(Obj)) {}
+
+private:
+ Expected<std::optional<T>> Value;
+};
+
+} // namespace cas
+
+template <> struct DenseMapInfo<cas::CASID> {
+ static cas::CASID getEmptyKey() { return cas::CASID::getDenseMapEmptyKey(); }
+
+ static cas::CASID getTombstoneKey() {
+ return cas::CASID::getDenseMapTombstoneKey();
+ }
+
+ static unsigned getHashValue(cas::CASID ID) {
+ return (unsigned)hash_value(ID);
+ }
+
+ static bool isEqual(cas::CASID LHS, cas::CASID RHS) { return LHS == RHS; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_CAS_CASID_H
diff --git a/llvm/include/llvm/CAS/CASReference.h b/llvm/include/llvm/CAS/CASReference.h
new file mode 100644
index 0000000..1f435cf
--- /dev/null
+++ b/llvm/include/llvm/CAS/CASReference.h
@@ -0,0 +1,207 @@
+//===- llvm/CAS/CASReference.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_CASREFERENCE_H
+#define LLVM_CAS_CASREFERENCE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace cas {
+
+class ObjectStore;
+
+class ObjectHandle;
+class ObjectRef;
+
+/// Base class for references to things in \a ObjectStore.
+class ReferenceBase {
+protected:
+ struct DenseMapEmptyTag {};
+ struct DenseMapTombstoneTag {};
+ static constexpr uint64_t getDenseMapEmptyRef() { return -1ULL; }
+ static constexpr uint64_t getDenseMapTombstoneRef() { return -2ULL; }
+
+public:
+ /// Get an internal reference.
+ uint64_t getInternalRef(const ObjectStore &ExpectedCAS) const {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ assert(CAS == &ExpectedCAS && "Extracting reference for the wrong CAS");
+#endif
+ return InternalRef;
+ }
+
+ unsigned getDenseMapHash() const {
+ return (unsigned)llvm::hash_value(InternalRef);
+ }
+ bool isDenseMapEmpty() const { return InternalRef == getDenseMapEmptyRef(); }
+ bool isDenseMapTombstone() const {
+ return InternalRef == getDenseMapTombstoneRef();
+ }
+ bool isDenseMapSentinel() const {
+ return isDenseMapEmpty() || isDenseMapTombstone();
+ }
+
+protected:
+ void print(raw_ostream &OS, const ObjectHandle &This) const;
+ void print(raw_ostream &OS, const ObjectRef &This) const;
+
+ bool hasSameInternalRef(const ReferenceBase &RHS) const {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ assert(
+ (isDenseMapSentinel() || RHS.isDenseMapSentinel() || CAS == RHS.CAS) &&
+ "Cannot compare across CAS instances");
+#endif
+ return InternalRef == RHS.InternalRef;
+ }
+
+protected:
+ friend class ObjectStore;
+ ReferenceBase(const ObjectStore *CAS, uint64_t InternalRef, bool IsHandle)
+ : InternalRef(InternalRef) {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ this->CAS = CAS;
+#endif
+ assert(InternalRef != getDenseMapEmptyRef() && "Reserved for DenseMapInfo");
+ assert(InternalRef != getDenseMapTombstoneRef() &&
+ "Reserved for DenseMapInfo");
+ }
+ explicit ReferenceBase(DenseMapEmptyTag)
+ : InternalRef(getDenseMapEmptyRef()) {}
+ explicit ReferenceBase(DenseMapTombstoneTag)
+ : InternalRef(getDenseMapTombstoneRef()) {}
+
+private:
+ uint64_t InternalRef;
+
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ const ObjectStore *CAS = nullptr;
+#endif
+};
+
+/// Reference to an object in a \a ObjectStore instance.
+///
+/// If you have an ObjectRef, you know the object exists, and you can point at
+/// it from new nodes with \a ObjectStore::store(), but you don't know anything
+/// about it. "Loading" the object is a separate step that may not have
+/// happened yet, and which can fail (due to filesystem corruption) or
+/// introduce latency (if downloading from a remote store).
+///
+/// \a ObjectStore::store() takes a list of these, and these are returned by \a
+/// ObjectStore::forEachRef() and \a ObjectStore::readRef(), which are accessors
+/// for nodes, and \a ObjectStore::getReference().
+///
+/// \a ObjectStore::load() will load the referenced object, and returns \a
+/// ObjectHandle, a variant that knows what kind of entity it is. \a
+/// ObjectStore::getReferenceKind() can expect the type of reference without
+/// asking for unloaded objects to be loaded.
+///
+/// This is a wrapper around a \c uint64_t (and a \a ObjectStore instance when
+/// assertions are on). If necessary, it can be deconstructed and reconstructed
+/// using \a Reference::getInternalRef() and \a
+/// Reference::getFromInternalRef(), but clients aren't expected to need to do
+/// this. These both require the right \a ObjectStore instance.
+class ObjectRef : public ReferenceBase {
+ struct DenseMapTag {};
+
+public:
+ friend bool operator==(const ObjectRef &LHS, const ObjectRef &RHS) {
+ return LHS.hasSameInternalRef(RHS);
+ }
+ friend bool operator!=(const ObjectRef &LHS, const ObjectRef &RHS) {
+ return !(LHS == RHS);
+ }
+
+ /// Allow a reference to be recreated after it's deconstructed.
+ static ObjectRef getFromInternalRef(const ObjectStore &CAS,
+ uint64_t InternalRef) {
+ return ObjectRef(CAS, InternalRef);
+ }
+
+ static ObjectRef getDenseMapEmptyKey() {
+ return ObjectRef(DenseMapEmptyTag{});
+ }
+ static ObjectRef getDenseMapTombstoneKey() {
+ return ObjectRef(DenseMapTombstoneTag{});
+ }
+
+ /// Print internal ref and/or CASID. Only suitable for debugging.
+ void print(raw_ostream &OS) const { return ReferenceBase::print(OS, *this); }
+
+ LLVM_DUMP_METHOD void dump() const;
+
+private:
+ friend class ObjectStore;
+ friend class ReferenceBase;
+ using ReferenceBase::ReferenceBase;
+ ObjectRef(const ObjectStore &CAS, uint64_t InternalRef)
+ : ReferenceBase(&CAS, InternalRef, /*IsHandle=*/false) {
+ assert(InternalRef != -1ULL && "Reserved for DenseMapInfo");
+ assert(InternalRef != -2ULL && "Reserved for DenseMapInfo");
+ }
+ explicit ObjectRef(DenseMapEmptyTag T) : ReferenceBase(T) {}
+ explicit ObjectRef(DenseMapTombstoneTag T) : ReferenceBase(T) {}
+ explicit ObjectRef(ReferenceBase) = delete;
+};
+
+/// Handle to a loaded object in a \a ObjectStore instance.
+///
+/// ObjectHandle encapsulates a *loaded* object in the CAS. You need one
+/// of these to inspect the content of an object: to look at its stored
+/// data and references.
+class ObjectHandle : public ReferenceBase {
+public:
+ friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+ return LHS.hasSameInternalRef(RHS);
+ }
+ friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+ return !(LHS == RHS);
+ }
+
+ /// Print internal ref and/or CASID. Only suitable for debugging.
+ void print(raw_ostream &OS) const { return ReferenceBase::print(OS, *this); }
+
+ LLVM_DUMP_METHOD void dump() const;
+
+private:
+ friend class ObjectStore;
+ friend class ReferenceBase;
+ using ReferenceBase::ReferenceBase;
+ explicit ObjectHandle(ReferenceBase) = delete;
+ ObjectHandle(const ObjectStore &CAS, uint64_t InternalRef)
+ : ReferenceBase(&CAS, InternalRef, /*IsHandle=*/true) {}
+};
+
+} // namespace cas
+
+template <> struct DenseMapInfo<cas::ObjectRef> {
+ static cas::ObjectRef getEmptyKey() {
+ return cas::ObjectRef::getDenseMapEmptyKey();
+ }
+
+ static cas::ObjectRef getTombstoneKey() {
+ return cas::ObjectRef::getDenseMapTombstoneKey();
+ }
+
+ static unsigned getHashValue(cas::ObjectRef Ref) {
+ return Ref.getDenseMapHash();
+ }
+
+ static bool isEqual(cas::ObjectRef LHS, cas::ObjectRef RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // namespace llvm
+
+#endif // LLVM_CAS_CASREFERENCE_H
diff --git a/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h
new file mode 100644
index 0000000..0df8163
--- /dev/null
+++ b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h
@@ -0,0 +1,110 @@
+//===- MappedFileRegionBumpPtr.h --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H
+#define LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H
+
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/FileSystem.h"
+#include <atomic>
+
+namespace llvm::cas {
+
+/// Allocator for an owned mapped file region that supports thread-safe and
+/// process-safe bump pointer allocation.
+///
+/// This allocator is designed to create a sparse file when supported by the
+/// filesystem's \c ftruncate so that it can be used with a large maximum size.
+/// It will also attempt to shrink the underlying file down to its current
+/// allocation size when the last concurrent mapping is closed.
+///
+/// Process-safe. Uses file locks when resizing the file during initialization
+/// and destruction.
+///
+/// Thread-safe, assuming all threads use the same instance to talk to a given
+/// file/mapping. Unsafe to have multiple instances talking to the same file
+/// in the same process since file locks will misbehave. Clients should
+/// coordinate (somehow).
+///
+/// \note Currently we allocate the whole file without sparseness on Windows.
+///
+/// Provides 8-byte alignment for all allocations.
+class MappedFileRegionBumpPtr {
+public:
+ using RegionT = sys::fs::mapped_file_region;
+
+ /// Create a \c MappedFileRegionBumpPtr.
+ ///
+ /// \param Path the path to open the mapped region.
+ /// \param Capacity the maximum size for the mapped file region.
+ /// \param BumpPtrOffset the offset at which to store the bump pointer.
+ /// \param NewFileConstructor is for constructing new files. It has exclusive
+ /// access to the file. Must call \c initializeBumpPtr.
+ static Expected<MappedFileRegionBumpPtr>
+ create(const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset,
+ function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor);
+
+ /// Finish initializing the bump pointer. Must be called by
+ /// \c NewFileConstructor.
+ void initializeBumpPtr(int64_t BumpPtrOffset);
+
+ /// Minimum alignment for allocations, currently hardcoded to 8B.
+ static constexpr Align getAlign() {
+ // Trick Align into giving us '8' as a constexpr.
+ struct alignas(8) T {};
+ static_assert(alignof(T) == 8, "Tautology failed?");
+ return Align::Of<T>();
+ }
+
+ /// Allocate at least \p AllocSize. Rounds up to \a getAlign().
+ char *allocate(uint64_t AllocSize) {
+ return data() + allocateOffset(AllocSize);
+ }
+ /// Allocate, returning the offset from \a data() instead of a pointer.
+ int64_t allocateOffset(uint64_t AllocSize);
+
+ char *data() const { return Region.data(); }
+ uint64_t size() const { return *BumpPtr; }
+ uint64_t capacity() const { return Region.size(); }
+
+ RegionT &getRegion() { return Region; }
+
+ ~MappedFileRegionBumpPtr() { destroyImpl(); }
+
+ MappedFileRegionBumpPtr() = default;
+ MappedFileRegionBumpPtr(MappedFileRegionBumpPtr &&RHS) { moveImpl(RHS); }
+ MappedFileRegionBumpPtr &operator=(MappedFileRegionBumpPtr &&RHS) {
+ destroyImpl();
+ moveImpl(RHS);
+ return *this;
+ }
+
+ MappedFileRegionBumpPtr(const MappedFileRegionBumpPtr &) = delete;
+ MappedFileRegionBumpPtr &operator=(const MappedFileRegionBumpPtr &) = delete;
+
+private:
+ void destroyImpl();
+ void moveImpl(MappedFileRegionBumpPtr &RHS) {
+ std::swap(Region, RHS.Region);
+ std::swap(BumpPtr, RHS.BumpPtr);
+ std::swap(Path, RHS.Path);
+ std::swap(FD, RHS.FD);
+ std::swap(SharedLockFD, RHS.SharedLockFD);
+ }
+
+private:
+ RegionT Region;
+ std::atomic<int64_t> *BumpPtr = nullptr;
+ std::string Path;
+ std::optional<int> FD;
+ std::optional<int> SharedLockFD;
+};
+
+} // namespace llvm::cas
+
+#endif // LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H
diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h
new file mode 100644
index 0000000..b4720c7
--- /dev/null
+++ b/llvm/include/llvm/CAS/ObjectStore.h
@@ -0,0 +1,302 @@
+//===- llvm/CAS/ObjectStore.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_OBJECTSTORE_H
+#define LLVM_CAS_OBJECTSTORE_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/CASID.h"
+#include "llvm/CAS/CASReference.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include <cstddef>
+
+namespace llvm {
+
+class MemoryBuffer;
+template <typename T> class unique_function;
+
+namespace cas {
+
+class ObjectStore;
+class ObjectProxy;
+
+/// Content-addressable storage for objects.
+///
+/// Conceptually, objects are stored in a "unique set".
+///
+/// - Objects are immutable ("value objects") that are defined by their
+/// content. They are implicitly deduplicated by content.
+/// - Each object has a unique identifier (UID) that's derived from its content,
+/// called a \a CASID.
+/// - This UID is a fixed-size (strong) hash of the transitive content of a
+/// CAS object.
+/// - It's comparable between any two CAS instances that have the same \a
+/// CASIDContext::getHashSchemaIdentifier().
+///   - The UID can be printed (e.g., \a CASID::toString()) and can be parsed
+/// by the same or a different CAS instance with \a
+/// ObjectStore::parseID().
+/// - An object can be looked up by content or by UID.
+///   - \a store() is a "get-or-create" method, writing an object if it
+///     doesn't exist yet, and returning a ref to it in any case.
+/// - \a loadObject(const CASID&) looks up an object by its UID.
+/// - Objects can reference other objects, forming an arbitrary DAG.
+///
+/// The \a ObjectStore interface has a few ways of referencing objects:
+///
+/// - \a ObjectRef encapsulates a reference to something in the CAS. It is an
+/// opaque type that references an object inside a specific CAS. It is
+/// implementation defined if the underlying object exists or not for an
+///   ObjectRef, and it can be used to speed up CAS lookup as an implementation
+/// detail. However, you don't know anything about the underlying objects.
+/// "Loading" the object is a separate step that may not have happened
+/// yet, and which can fail (e.g. due to filesystem corruption) or introduce
+/// latency (if downloading from a remote store).
+/// - \a ObjectHandle encapsulates a *loaded* object in the CAS. You need one of
+/// these to inspect the content of an object: to look at its stored
+/// data and references. This is internal to CAS implementation and not
+///   available from CAS public APIs.
+/// - \a CASID: the UID for an object in the CAS, obtained through \a
+/// ObjectStore::getID() or \a ObjectStore::parseID(). This is a valid CAS
+/// identifier, but may reference an object that is unknown to this CAS
+/// instance.
+/// - \a ObjectProxy pairs an ObjectHandle (subclass) with a ObjectStore, and
+/// wraps access APIs to avoid having to pass extra parameters. It is the
+/// object used for accessing underlying data and refs by CAS users.
+///
+/// Both ObjectRef and ObjectHandle are lightweight, wrapping a `uint64_t` and
+/// are only valid with the associated ObjectStore instance.
+///
+/// There are a few options for accessing content of objects, with different
+/// lifetime tradeoffs:
+///
+/// - \a getData() accesses data without exposing lifetime at all.
+/// - \a getMemoryBuffer() returns a \a MemoryBuffer whose lifetime
+/// is independent of the CAS (it can live longer).
+/// - \a getDataString() returns a StringRef whose lifetime is guaranteed to last
+/// long as \a ObjectStore.
+/// - \a readRef() and \a forEachRef() iterate through the references in an
+/// object. There is no lifetime assumption.
+class ObjectStore {
+ friend class ObjectProxy;
+ void anchor();
+
+public:
+ /// Get a \p CASID from a \p ID, which should have been generated by \a
+ /// CASID::print(). This succeeds as long as \a validateID() would pass. The
+ /// object may be unknown to this CAS instance.
+ ///
+ /// TODO: Remove, and update callers to use \a validateID() or \a
+ /// extractHashFromID().
+ virtual Expected<CASID> parseID(StringRef ID) = 0;
+
+ /// Store object into ObjectStore.
+ virtual Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) = 0;
+ /// Get an ID for \p Ref.
+ virtual CASID getID(ObjectRef Ref) const = 0;
+
+ /// Get an existing reference to the object called \p ID.
+ ///
+  /// Returns \c std::nullopt if the object is not stored in this CAS.
+ virtual std::optional<ObjectRef> getReference(const CASID &ID) const = 0;
+
+ /// \returns true if the object is directly available from the local CAS, for
+ /// implementations that have this kind of distinction.
+ virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0;
+
+ /// Validate the underlying object referred by CASID.
+ virtual Error validate(const CASID &ID) = 0;
+
+protected:
+ /// Load the object referenced by \p Ref.
+ ///
+ /// Errors if the object cannot be loaded.
+ /// \returns \c std::nullopt if the object is missing from the CAS.
+ virtual Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) = 0;
+
+ /// Like \c loadIfExists but returns an error if the object is missing.
+ Expected<ObjectHandle> load(ObjectRef Ref);
+
+ /// Get the size of some data.
+ virtual uint64_t getDataSize(ObjectHandle Node) const = 0;
+
+ /// Methods for handling objects.
+ virtual Error forEachRef(ObjectHandle Node,
+ function_ref<Error(ObjectRef)> Callback) const = 0;
+ virtual ObjectRef readRef(ObjectHandle Node, size_t I) const = 0;
+ virtual size_t getNumRefs(ObjectHandle Node) const = 0;
+ virtual ArrayRef<char> getData(ObjectHandle Node,
+ bool RequiresNullTerminator = false) const = 0;
+
+ /// Get ObjectRef from open file.
+ virtual Expected<ObjectRef>
+ storeFromOpenFileImpl(sys::fs::file_t FD,
+ std::optional<sys::fs::file_status> Status);
+
+ /// Get a lifetime-extended StringRef pointing at \p Data.
+ ///
+ /// Depending on the CAS implementation, this may involve in-memory storage
+ /// overhead.
+ StringRef getDataString(ObjectHandle Node) {
+ return toStringRef(getData(Node));
+ }
+
+ /// Get a lifetime-extended MemoryBuffer pointing at \p Data.
+ ///
+ /// Depending on the CAS implementation, this may involve in-memory storage
+ /// overhead.
+ std::unique_ptr<MemoryBuffer>
+ getMemoryBuffer(ObjectHandle Node, StringRef Name = "",
+ bool RequiresNullTerminator = true);
+
+ /// Read all the refs from object in a SmallVector.
+ virtual void readRefs(ObjectHandle Node,
+ SmallVectorImpl<ObjectRef> &Refs) const;
+
+ /// Allow ObjectStore implementations to create internal handles.
+#define MAKE_CAS_HANDLE_CONSTRUCTOR(HandleKind) \
+ HandleKind make##HandleKind(uint64_t InternalRef) const { \
+ return HandleKind(*this, InternalRef); \
+ }
+ MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectHandle)
+ MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectRef)
+#undef MAKE_CAS_HANDLE_CONSTRUCTOR
+
+public:
+ /// Helper functions to store object and returns a ObjectProxy.
+ Expected<ObjectProxy> createProxy(ArrayRef<ObjectRef> Refs, StringRef Data);
+
+ /// Store object from StringRef.
+ Expected<ObjectRef> storeFromString(ArrayRef<ObjectRef> Refs,
+ StringRef String) {
+ return store(Refs, arrayRefFromStringRef<char>(String));
+ }
+
+ /// Default implementation reads \p FD and calls \a storeNode(). Does not
+ /// take ownership of \p FD; the caller is responsible for closing it.
+ ///
+ /// If \p Status is sent in it is to be treated as a hint. Implementations
+ /// must protect against the file size potentially growing after the status
+ /// was taken (i.e., they cannot assume that an mmap will be null-terminated
+ /// where \p Status implies).
+ ///
+ /// Returns the \a CASID and the size of the file.
+ Expected<ObjectRef>
+ storeFromOpenFile(sys::fs::file_t FD,
+ std::optional<sys::fs::file_status> Status = std::nullopt) {
+ return storeFromOpenFileImpl(FD, Status);
+ }
+
+ static Error createUnknownObjectError(const CASID &ID);
+
+ /// Create ObjectProxy from CASID. If the object doesn't exist, get an error.
+ Expected<ObjectProxy> getProxy(const CASID &ID);
+ /// Create ObjectProxy from ObjectRef. If the object can't be loaded, get an
+ /// error.
+ Expected<ObjectProxy> getProxy(ObjectRef Ref);
+
+ /// \returns \c std::nullopt if the object is missing from the CAS.
+ Expected<std::optional<ObjectProxy>> getProxyIfExists(ObjectRef Ref);
+
+ /// Read the data from \p Data into \p OS.
+ uint64_t readData(ObjectHandle Node, raw_ostream &OS, uint64_t Offset = 0,
+ uint64_t MaxBytes = -1ULL) const {
+ ArrayRef<char> Data = getData(Node);
+ assert(Offset < Data.size() && "Expected valid offset");
+ Data = Data.drop_front(Offset).take_front(MaxBytes);
+ OS << toStringRef(Data);
+ return Data.size();
+ }
+
+ /// Validate the whole node tree.
+ Error validateTree(ObjectRef Ref);
+
+ /// Print the ObjectStore internals for debugging purpose.
+ virtual void print(raw_ostream &) const {}
+ void dump() const;
+
+ /// Get CASContext
+ const CASContext &getContext() const { return Context; }
+
+ virtual ~ObjectStore() = default;
+
+protected:
+ ObjectStore(const CASContext &Context) : Context(Context) {}
+
+private:
+ const CASContext &Context;
+};
+
+/// Reference to an abstract hierarchical node, with data and references.
+/// Reference is passed by value and is expected to be valid as long as the \a
+/// ObjectStore is.
+class ObjectProxy {
+public:
+ const ObjectStore &getCAS() const { return *CAS; }
+ ObjectStore &getCAS() { return *CAS; }
+ CASID getID() const { return CAS->getID(Ref); }
+ ObjectRef getRef() const { return Ref; }
+ size_t getNumReferences() const { return CAS->getNumRefs(H); }
+ ObjectRef getReference(size_t I) const { return CAS->readRef(H, I); }
+
+ operator CASID() const { return getID(); }
+ CASID getReferenceID(size_t I) const {
+ std::optional<CASID> ID = getCAS().getID(getReference(I));
+ assert(ID && "Expected reference to be first-class object");
+ return *ID;
+ }
+
+ /// Visit each reference in order, returning an error from \p Callback to
+ /// stop early.
+ Error forEachReference(function_ref<Error(ObjectRef)> Callback) const {
+ return CAS->forEachRef(H, Callback);
+ }
+
+ std::unique_ptr<MemoryBuffer>
+ getMemoryBuffer(StringRef Name = "",
+ bool RequiresNullTerminator = true) const;
+
+ /// Get the content of the node. Valid as long as the CAS is valid.
+ StringRef getData() const { return CAS->getDataString(H); }
+
+ friend bool operator==(const ObjectProxy &Proxy, ObjectRef Ref) {
+ return Proxy.getRef() == Ref;
+ }
+ friend bool operator==(ObjectRef Ref, const ObjectProxy &Proxy) {
+ return Proxy.getRef() == Ref;
+ }
+ friend bool operator!=(const ObjectProxy &Proxy, ObjectRef Ref) {
+ return !(Proxy.getRef() == Ref);
+ }
+ friend bool operator!=(ObjectRef Ref, const ObjectProxy &Proxy) {
+ return !(Proxy.getRef() == Ref);
+ }
+
+public:
+ ObjectProxy() = delete;
+
+ static ObjectProxy load(ObjectStore &CAS, ObjectRef Ref, ObjectHandle Node) {
+ return ObjectProxy(CAS, Ref, Node);
+ }
+
+private:
+ ObjectProxy(ObjectStore &CAS, ObjectRef Ref, ObjectHandle H)
+ : CAS(&CAS), Ref(Ref), H(H) {}
+
+ ObjectStore *CAS;
+ ObjectRef Ref;
+ ObjectHandle H;
+};
+
+std::unique_ptr<ObjectStore> createInMemoryCAS();
+
+} // namespace cas
+} // namespace llvm
+
+#endif // LLVM_CAS_OBJECTSTORE_H
diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h
new file mode 100644
index 0000000..52cef5d3
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h
@@ -0,0 +1,428 @@
+//===- OnDiskGraphDB.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKGRAPHDB_H
+#define LLVM_CAS_ONDISKGRAPHDB_H
+
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+
+namespace llvm::cas::ondisk {
+
+/// 8B reference.
+class InternalRef {
+public:
+ FileOffset getFileOffset() const { return FileOffset(getRawOffset()); }
+
+ uint64_t getRawData() const { return Data; }
+ uint64_t getRawOffset() const { return Data; }
+
+ static InternalRef getFromRawData(uint64_t Data) { return InternalRef(Data); }
+
+ static InternalRef getFromOffset(FileOffset Offset) {
+ return InternalRef(Offset.get());
+ }
+
+ friend bool operator==(InternalRef LHS, InternalRef RHS) {
+ return LHS.Data == RHS.Data;
+ }
+
+private:
+ InternalRef(FileOffset Offset) : Data((uint64_t)Offset.get()) {}
+ InternalRef(uint64_t Data) : Data(Data) {}
+ uint64_t Data;
+};
+
+/// 4B reference.
+class InternalRef4B {
+public:
+ FileOffset getFileOffset() const { return FileOffset(Data); }
+
+ uint32_t getRawData() const { return Data; }
+
+ /// Shrink to 4B reference.
+ static std::optional<InternalRef4B> tryToShrink(InternalRef Ref) {
+ uint64_t Offset = Ref.getRawOffset();
+ if (Offset > UINT32_MAX)
+ return std::nullopt;
+
+ return InternalRef4B(Offset);
+ }
+
+ operator InternalRef() const {
+ return InternalRef::getFromOffset(getFileOffset());
+ }
+
+private:
+ friend class InternalRef;
+ InternalRef4B(uint32_t Data) : Data(Data) {}
+ uint32_t Data;
+};
+
+/// Array of internal node references.
+class InternalRefArrayRef {
+public:
+ size_t size() const { return Size; }
+ bool empty() const { return !Size; }
+
+ class iterator
+ : public iterator_facade_base<iterator, std::random_access_iterator_tag,
+ const InternalRef> {
+ public:
+ bool operator==(const iterator &RHS) const { return I == RHS.I; }
+ InternalRef operator*() const {
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ return *Ref;
+ return InternalRef(*I.get<const InternalRef4B *>());
+ }
+ bool operator<(const iterator &RHS) const {
+ assert(I.is<const InternalRef *>() == RHS.I.is<const InternalRef *>());
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ return Ref < RHS.I.get<const InternalRef *>();
+ return I.get<const InternalRef4B *>() -
+ RHS.I.get<const InternalRef4B *>();
+ }
+ ptrdiff_t operator-(const iterator &RHS) const {
+ assert(I.is<const InternalRef *>() == RHS.I.is<const InternalRef *>());
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ return Ref - RHS.I.get<const InternalRef *>();
+ return I.get<const InternalRef4B *>() -
+ RHS.I.get<const InternalRef4B *>();
+ }
+ iterator &operator+=(ptrdiff_t N) {
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ I = Ref + N;
+ else
+ I = I.get<const InternalRef4B *>() + N;
+ return *this;
+ }
+ iterator &operator-=(ptrdiff_t N) {
+ if (auto *Ref = dyn_cast<const InternalRef *>(I))
+ I = Ref - N;
+ else
+ I = I.get<const InternalRef4B *>() - N;
+ return *this;
+ }
+ InternalRef operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
+
+ iterator() = default;
+
+ uint64_t getOpaqueData() const { return uintptr_t(I.getOpaqueValue()); }
+
+ static iterator fromOpaqueData(uint64_t Opaque) {
+ return iterator(
+ PointerUnion<const InternalRef *,
+ const InternalRef4B *>::getFromOpaqueValue((void *)
+ Opaque));
+ }
+
+ private:
+ friend class InternalRefArrayRef;
+ explicit iterator(
+ PointerUnion<const InternalRef *, const InternalRef4B *> I)
+ : I(I) {}
+ PointerUnion<const InternalRef *, const InternalRef4B *> I;
+ };
+
+ bool operator==(const InternalRefArrayRef &RHS) const {
+ return size() == RHS.size() && std::equal(begin(), end(), RHS.begin());
+ }
+
+ iterator begin() const { return iterator(Begin); }
+ iterator end() const { return begin() + Size; }
+
+ /// Array accessor.
+ InternalRef operator[](ptrdiff_t N) const { return begin()[N]; }
+
+ bool is4B() const { return Begin.is<const InternalRef4B *>(); }
+ bool is8B() const { return Begin.is<const InternalRef *>(); }
+
+ ArrayRef<uint8_t> getBuffer() const {
+ if (is4B()) {
+ auto *B = Begin.get<const InternalRef4B *>();
+ return ArrayRef((const uint8_t *)B, sizeof(InternalRef4B) * Size);
+ } else {
+ auto *B = Begin.get<const InternalRef *>();
+ return ArrayRef((const uint8_t *)B, sizeof(InternalRef) * Size);
+ }
+ }
+
+ InternalRefArrayRef(std::nullopt_t = std::nullopt) {
+ // This is useful so that all the casts in the \p iterator functions can
+ // operate without needing to check for a null value.
+ static InternalRef PlaceHolder = InternalRef::getFromRawData(0);
+ Begin = &PlaceHolder;
+ }
+
+ InternalRefArrayRef(ArrayRef<InternalRef> Refs)
+ : Begin(Refs.begin()), Size(Refs.size()) {}
+
+ InternalRefArrayRef(ArrayRef<InternalRef4B> Refs)
+ : Begin(Refs.begin()), Size(Refs.size()) {}
+
+private:
+ PointerUnion<const InternalRef *, const InternalRef4B *> Begin;
+ size_t Size = 0;
+};
+
+struct OnDiskContent;
+
+/// Reference to a node. The node's data may not be stored in the database.
+/// An \p ObjectID instance can only be used with the \p OnDiskGraphDB instance
+/// it came from. \p ObjectIDs from different \p OnDiskGraphDB instances are not
+/// comparable.
+class ObjectID {
+public:
+ uint64_t getOpaqueData() const { return Opaque; }
+
+ static ObjectID fromOpaqueData(uint64_t Opaque) { return ObjectID(Opaque); }
+
+ friend bool operator==(const ObjectID &LHS, const ObjectID &RHS) {
+ return LHS.Opaque == RHS.Opaque;
+ }
+ friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS) {
+ return !(LHS == RHS);
+ }
+
+private:
+ explicit ObjectID(uint64_t Opaque) : Opaque(Opaque) {}
+ uint64_t Opaque;
+};
+
+/// Handle for a loaded node object.
+class ObjectHandle {
+public:
+ uint64_t getOpaqueData() const { return Opaque; }
+
+ static ObjectHandle fromOpaqueData(uint64_t Opaque) {
+ return ObjectHandle(Opaque);
+ }
+
+ friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+ return LHS.Opaque == RHS.Opaque;
+ }
+ friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+ return !(LHS == RHS);
+ }
+
+private:
+ explicit ObjectHandle(uint64_t Opaque) : Opaque(Opaque) {}
+ uint64_t Opaque;
+};
+
+class object_refs_iterator
+ : public iterator_facade_base<object_refs_iterator,
+ std::random_access_iterator_tag, ObjectID> {
+public:
+ bool operator==(const object_refs_iterator &RHS) const { return I == RHS.I; }
+ ObjectID operator*() const {
+ return ObjectID::fromOpaqueData((*I).getRawData());
+ }
+ bool operator<(const object_refs_iterator &RHS) const { return I < RHS.I; }
+ ptrdiff_t operator-(const object_refs_iterator &RHS) const {
+ return I - RHS.I;
+ }
+ object_refs_iterator &operator+=(ptrdiff_t N) {
+ I += N;
+ return *this;
+ }
+ object_refs_iterator &operator-=(ptrdiff_t N) {
+ I -= N;
+ return *this;
+ }
+ ObjectID operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
+
+ object_refs_iterator() = default;
+ object_refs_iterator(InternalRefArrayRef::iterator I) : I(I) {}
+
+ uint64_t getOpaqueData() const { return I.getOpaqueData(); }
+
+ static object_refs_iterator fromOpaqueData(uint64_t Opaque) {
+ return InternalRefArrayRef::iterator::fromOpaqueData(Opaque);
+ }
+
+private:
+ InternalRefArrayRef::iterator I;
+};
+
+using object_refs_range = llvm::iterator_range<object_refs_iterator>;
+
+/// On-disk CAS nodes database, independent of a particular hashing algorithm.
+class OnDiskGraphDB {
+public:
+ /// Associate data & references with a particular object ID. If there is
+ /// already a record for this object the operation is a no-op. \param ID the
+ /// object ID to associate the data & references with. \param Refs references
+ /// \param Data data buffer.
+ Error store(ObjectID ID, ArrayRef<ObjectID> Refs, ArrayRef<char> Data);
+
+ /// \returns \p nullopt if the object associated with \p Ref does not exist.
+ Expected<std::optional<ObjectHandle>> load(ObjectID Ref);
+
+ /// \returns the hash bytes digest for the object reference.
+ ArrayRef<uint8_t> getDigest(ObjectID Ref) const {
+ return getDigest(getInternalRef(Ref));
+ }
+
+ /// Form a reference for the provided hash. The reference can be used as part
+ /// of a CAS object even if it's not associated with an object yet.
+ ObjectID getReference(ArrayRef<uint8_t> Hash);
+
+ /// Get an existing reference to the object \p Digest.
+ ///
+ /// Returns \p nullopt if the object is not stored in this CAS.
+ std::optional<ObjectID> getExistingReference(ArrayRef<uint8_t> Digest);
+
+ /// Check whether the object associated with \p Ref is stored in the CAS.
+ /// Note that this function will fault-in according to the policy.
+ Expected<bool> isMaterialized(ObjectID Ref);
+
+ /// Check whether the object associated with \p Ref is stored in the CAS.
+ /// Note that this function does not fault-in.
+ bool containsObject(ObjectID Ref) const {
+ return containsObject(Ref, /*CheckUpstream=*/true);
+ }
+
+ /// \returns the data part of the provided object handle.
+ ArrayRef<char> getObjectData(ObjectHandle Node) const;
+
+ object_refs_range getObjectRefs(ObjectHandle Node) const {
+ InternalRefArrayRef Refs = getInternalRefs(Node);
+ return make_range(Refs.begin(), Refs.end());
+ }
+
+ /// \returns Total size of stored objects.
+ ///
+  /// NOTE: There's a possibility that the returned size does not include a
+ /// large object if the process crashed right at the point of inserting it.
+ size_t getStorageSize() const;
+
+ void print(raw_ostream &OS) const;
+
+ /// How to fault-in nodes if an upstream database is used.
+ enum class FaultInPolicy {
+ /// Copy only the requested node.
+ SingleNode,
+    /// Copy the entire graph of a node.
+ FullTree,
+ };
+
+ /// Open the on-disk store from a directory.
+ ///
+ /// \param Path directory for the on-disk store. The directory will be created
+ /// if it doesn't exist.
+ /// \param HashName Identifier name for the hashing algorithm that is going to
+ /// be used.
+ /// \param HashByteSize Size for the object digest hash bytes.
+ /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
+ /// if they don't exist in the primary store. The upstream store is only used
+ /// for reading nodes, new nodes are only written to the primary store.
+ /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
+ /// to primary store. This is recorded at creation time and subsequent opens
+ /// need to pass the same policy otherwise the \p open will fail.
+ static Expected<std::unique_ptr<OnDiskGraphDB>>
+ open(StringRef Path, StringRef HashName, unsigned HashByteSize,
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr,
+ FaultInPolicy Policy = FaultInPolicy::FullTree);
+
+ ~OnDiskGraphDB();
+
+private:
+ struct IndexProxy;
+ class TempFile;
+ class MappedTempFile;
+
+ enum class ObjectPresence {
+ Missing,
+ InPrimaryDB,
+ OnlyInUpstreamDB,
+ };
+
+ ObjectPresence getObjectPresence(ObjectID Ref, bool CheckUpstream) const;
+
+ bool containsObject(ObjectID Ref, bool CheckUpstream) const {
+ switch (getObjectPresence(Ref, CheckUpstream)) {
+ case ObjectPresence::Missing:
+ return false;
+ case ObjectPresence::InPrimaryDB:
+ return true;
+ case ObjectPresence::OnlyInUpstreamDB:
+ return true;
+ }
+ }
+
+ /// When \p load is called for a node that doesn't exist, this function tries
+ /// to load it from the upstream store and copy it to the primary one.
+ Expected<std::optional<ObjectHandle>> faultInFromUpstream(ObjectID PrimaryID);
+ Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+ Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+
+ IndexProxy indexHash(ArrayRef<uint8_t> Hash);
+
+ Error createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data);
+
+ Expected<MappedTempFile> createTempFile(StringRef FinalPath, uint64_t Size);
+
+ OnDiskContent getContentFromHandle(ObjectHandle H) const;
+
+ static InternalRef getInternalRef(ObjectID Ref) {
+ return InternalRef::getFromRawData(Ref.getOpaqueData());
+ }
+ static ObjectID getExternalReference(InternalRef Ref) {
+ return ObjectID::fromOpaqueData(Ref.getRawData());
+ }
+
+ static ObjectID getExternalReference(const IndexProxy &I);
+
+ void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
+ SmallVectorImpl<char> &Path) const;
+
+ ArrayRef<uint8_t> getDigest(InternalRef Ref) const;
+ ArrayRef<uint8_t> getDigest(const IndexProxy &I) const;
+
+ IndexProxy getIndexProxyFromRef(InternalRef Ref) const;
+
+ static InternalRef makeInternalRef(FileOffset IndexOffset);
+
+ IndexProxy
+ getIndexProxyFromPointer(OnDiskHashMappedTrie::const_pointer P) const;
+
+ InternalRefArrayRef getInternalRefs(ObjectHandle Node) const;
+
+ void recordStandaloneSizeIncrease(size_t SizeIncrease);
+
+ std::atomic<uint64_t> &getStandaloneStorageSize();
+ uint64_t getStandaloneStorageSize() const;
+
+ OnDiskGraphDB(StringRef RootPath, OnDiskHashMappedTrie Index,
+ OnDiskDataAllocator DataPool,
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+ FaultInPolicy Policy);
+
+ /// Mapping from hash to object reference.
+ ///
+ /// Data type is TrieRecord.
+ OnDiskHashMappedTrie Index;
+
+ /// Storage for most objects.
+ ///
+ /// Data type is DataRecordHandle.
+ OnDiskDataAllocator DataPool;
+
+ void *StandaloneData; // a StandaloneDataMap.
+
+ std::string RootPath;
+
+ /// Optional on-disk store to be used for faulting-in nodes.
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+ FaultInPolicy FIPolicy;
+};
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_CAS_ONDISKGRAPHDB_H
diff --git a/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h b/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h
new file mode 100644
index 0000000..9fe911f
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h
@@ -0,0 +1,375 @@
+//===- OnDiskHashMappedTrie.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKHASHMAPPEDTRIE_H
+#define LLVM_CAS_ONDISKHASHMAPPEDTRIE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <optional>
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace cas {
+
+class FileOffset {
+public:
+ int64_t get() const { return Offset; }
+
+ explicit operator bool() const { return Offset; }
+
+ FileOffset() = default;
+ explicit FileOffset(int64_t Offset) : Offset(Offset) { assert(Offset >= 0); }
+
+private:
+ int64_t Offset = 0;
+};
+
+/// On-disk hash-mapped trie. Thread-safe / lock-free.
+///
+/// This is an on-disk, thread-safe key-value store that is (mostly) lock-free.
+/// The keys are fixed length, and are expected to be binary hashes with a
+/// normal distribution.
+///
+/// - Thread-safety is achieved through the use of atomics within a shared
+/// memory mapping. Atomic access does not work on networked filesystems.
+/// - Filesystem locks are used, but only sparingly:
+/// - during initialization, for creating / opening an existing store;
+///   - for the lifetime of the instance, a shared/reader lock is held;
+/// - during destruction, if there are no concurrent readers, to shrink the
+/// files to their minimum size.
+/// - Path is used as a directory:
+/// - "index" stores the root trie and subtries.
+/// - "data" stores (most of) the entries, like a bump-ptr-allocator.
+/// - Large entries are stored externally in a file named by the key.
+/// - Code is system-dependent (Windows not yet implemented), and binary format
+/// itself is not portable. These are not artifacts that can/should be moved
+/// between different systems; they are only appropriate for local storage.
+class OnDiskHashMappedTrie {
+public:
+ LLVM_DUMP_METHOD void dump() const;
+ void
+ print(raw_ostream &OS,
+ function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+
+public:
+ struct ConstValueProxy {
+ ConstValueProxy() = default;
+ ConstValueProxy(ArrayRef<uint8_t> Hash, ArrayRef<char> Data)
+ : Hash(Hash), Data(Data) {}
+ ConstValueProxy(ArrayRef<uint8_t> Hash, StringRef Data)
+ : Hash(Hash), Data(Data.begin(), Data.size()) {}
+
+ ArrayRef<uint8_t> Hash;
+ ArrayRef<char> Data;
+ };
+
+ struct ValueProxy {
+ operator ConstValueProxy() const { return ConstValueProxy(Hash, Data); }
+
+ ValueProxy() = default;
+ ValueProxy(ArrayRef<uint8_t> Hash, MutableArrayRef<char> Data)
+ : Hash(Hash), Data(Data) {}
+
+ ArrayRef<uint8_t> Hash;
+ MutableArrayRef<char> Data;
+ };
+
+private:
+ struct HintT {
+ explicit operator ValueProxy() const {
+ ValueProxy Value;
+ Value.Data = MutableArrayRef<char>(
+ const_cast<char *>(reinterpret_cast<const char *>(P)), I);
+ Value.Hash = ArrayRef<uint8_t>(nullptr, B);
+ return Value;
+ }
+
+ explicit HintT(ConstValueProxy Value)
+ : P(Value.Data.data()), I(Value.Data.size()), B(Value.Hash.size()) {
+ // Spot-check that this really was a hint.
+ assert(Value.Data.size() <= UINT16_MAX);
+ assert(Value.Hash.size() <= UINT16_MAX);
+ assert(Value.Hash.data() == nullptr);
+ }
+
+ HintT(const void *P, uint16_t I, uint16_t B) : P(P), I(I), B(B) {}
+
+ const void *P = nullptr;
+ uint16_t I = 0;
+ uint16_t B = 0;
+ };
+
+public:
+ template <class ProxyT> class PointerImpl {
+ public:
+ FileOffset getOffset() const {
+ return FileOffset(OffsetLow32 | (uint64_t)OffsetHigh16 << 32);
+ }
+
+ explicit operator bool() const { return IsValue; }
+
+ const ProxyT &operator*() const {
+ assert(IsValue);
+ return ValueOrHint;
+ }
+ const ProxyT *operator->() const {
+ assert(IsValue);
+ return &ValueOrHint;
+ }
+
+ PointerImpl() = default;
+
+ protected:
+ PointerImpl(FileOffset Offset, ProxyT Value)
+ : PointerImpl(Value, Offset, /*IsHint=*/false, /*IsValue=*/true) {}
+
+ explicit PointerImpl(FileOffset Offset, HintT H)
+ : PointerImpl(ValueProxy(H), Offset, /*IsHint=*/true,
+ /*IsValue=*/false) {}
+
+ PointerImpl(ProxyT ValueOrHint, FileOffset Offset, bool IsHint,
+ bool IsValue)
+ : ValueOrHint(ValueOrHint), OffsetLow32((uint64_t)Offset.get()),
+ OffsetHigh16((uint64_t)Offset.get() >> 32), IsHint(IsHint),
+ IsValue(IsValue) {
+ checkOffset(Offset);
+ }
+
+ static void checkOffset(FileOffset Offset) {
+ assert(Offset.get() > 0);
+ assert((uint64_t)Offset.get() < (1LL << 48));
+ }
+
+ std::optional<HintT> getHint(const OnDiskHashMappedTrie &This) const {
+ if (!IsHint)
+ return std::nullopt;
+ HintT H(ValueOrHint);
+ assert(H.P == &This && "Expected hint to be for This");
+ if (H.P != &This)
+ return std::nullopt;
+ return H;
+ }
+
+ ProxyT ValueOrHint;
+ uint32_t OffsetLow32 = 0;
+ uint16_t OffsetHigh16 = 0;
+ bool IsHint = false;
+ bool IsValue = false;
+ };
+
+ class pointer;
+ class const_pointer : public PointerImpl<ConstValueProxy> {
+ public:
+ const_pointer() = default;
+
+ private:
+ friend class pointer;
+ friend class OnDiskHashMappedTrie;
+ using const_pointer::PointerImpl::PointerImpl;
+ };
+
+ class pointer : public PointerImpl<ValueProxy> {
+ public:
+ operator const_pointer() const {
+ return const_pointer(ValueOrHint, getOffset(), IsHint, IsValue);
+ }
+
+ pointer() = default;
+
+ private:
+ friend class OnDiskHashMappedTrie;
+ using pointer::PointerImpl::PointerImpl;
+ };
+
+ pointer getMutablePointer(const_pointer CP) {
+ if (std::optional<HintT> H = CP.getHint(*this))
+ return pointer(CP.getOffset(), *H);
+ if (!CP)
+ return pointer();
+ ValueProxy V{CP->Hash, MutableArrayRef(const_cast<char *>(CP->Data.data()),
+ CP->Data.size())};
+ return pointer(CP.getOffset(), V);
+ }
+
+ const_pointer find(ArrayRef<uint8_t> Hash) const;
+ pointer find(ArrayRef<uint8_t> Hash) {
+ return getMutablePointer(
+ const_cast<const OnDiskHashMappedTrie *>(this)->find(Hash));
+ }
+
+ const_pointer recoverFromHashPointer(const uint8_t *HashBegin) const;
+ pointer recoverFromHashPointer(const uint8_t *HashBegin) {
+ return getMutablePointer(
+ const_cast<const OnDiskHashMappedTrie *>(this)->recoverFromHashPointer(
+ HashBegin));
+ }
+
+ const_pointer recoverFromFileOffset(FileOffset Offset) const;
+ pointer recoverFromFileOffset(FileOffset Offset) {
+ return getMutablePointer(
+ const_cast<const OnDiskHashMappedTrie *>(this)->recoverFromFileOffset(
+ Offset));
+ }
+
+ using LazyInsertOnConstructCB =
+ function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>;
+ using LazyInsertOnLeakCB =
+ function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue,
+ FileOffset FinalOffset, ValueProxy FinalValue)>;
+
+ /// Insert lazily.
+ ///
+ /// \p OnConstruct is called when ready to insert a value, after allocating
+ /// space for the data. It is called at most once.
+ ///
+ /// \p OnLeak is called only if \p OnConstruct has been called and a race
+ /// occurred before insertion, causing the tentative offset and data to be
+ /// abandoned. This allows clients to clean up other results or update any
+ /// references.
+ ///
+ /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success.
+ /// The in-memory \a HashMappedTrie uses LazyAtomicPointer to synchronize
+ /// simultaneous writes, but that seems dangerous to use in a memory-mapped
+ /// file in case a process crashes in the busy state.
+ pointer insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
+ LazyInsertOnConstructCB OnConstruct = nullptr,
+ LazyInsertOnLeakCB OnLeak = nullptr);
+ pointer insertLazy(ArrayRef<uint8_t> Hash,
+ LazyInsertOnConstructCB OnConstruct = nullptr,
+ LazyInsertOnLeakCB OnLeak = nullptr) {
+ return insertLazy(const_pointer(), Hash, OnConstruct, OnLeak);
+ }
+
+ pointer insert(const_pointer Hint, const ConstValueProxy &Value) {
+ return insertLazy(Hint, Value.Hash, [&](FileOffset, ValueProxy Allocated) {
+ assert(Allocated.Hash == Value.Hash);
+ assert(Allocated.Data.size() == Value.Data.size());
+ llvm::copy(Value.Data, Allocated.Data.begin());
+ });
+ }
+ pointer insert(const ConstValueProxy &Value) {
+ return insert(const_pointer(), Value);
+ }
+
+ size_t size() const;
+
+ /// Gets or creates a file at \p Path with a hash-mapped trie named \p
+ /// TrieName. The hash size is \p NumHashBits (in bits) and the records store
+ /// data of size \p DataSize (in bytes).
+ ///
+ /// \p MaxFileSize controls the maximum file size to support, limiting the
+ /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting
+ /// size if a new file is created.
+ ///
+ /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to
+ /// configure the trie, if it doesn't already exist.
+ ///
+ /// \pre NumHashBits is a multiple of 8 (byte-aligned).
+ static Expected<OnDiskHashMappedTrie>
+ create(const Twine &Path, const Twine &TrieName, size_t NumHashBits,
+ uint64_t DataSize, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ std::optional<size_t> NewTableNumRootBits = std::nullopt,
+ std::optional<size_t> NewTableNumSubtrieBits = std::nullopt);
+
+ OnDiskHashMappedTrie(OnDiskHashMappedTrie &&RHS);
+ OnDiskHashMappedTrie &operator=(OnDiskHashMappedTrie &&RHS);
+ ~OnDiskHashMappedTrie();
+
+private:
+ struct ImplType;
+ explicit OnDiskHashMappedTrie(std::unique_ptr<ImplType> Impl);
+ std::unique_ptr<ImplType> Impl;
+};
+
+/// Sink for data. Stores variable length data with 8-byte alignment. Does not
+/// track size of data, which is assumed to be known from context, or embedded.
+/// Uses 0-padding but does not guarantee 0-termination.
+class OnDiskDataAllocator {
+public:
+ using ValueProxy = MutableArrayRef<char>;
+
+ /// An iterator-like return value for data insertion. Maybe it should be
+ /// called \c iterator, but it has no increment.
+ class pointer {
+ public:
+ FileOffset getOffset() const { return Offset; }
+ explicit operator bool() const { return bool(getOffset()); }
+ const ValueProxy &operator*() const {
+ assert(Offset && "Null dereference");
+ return Value;
+ }
+ const ValueProxy *operator->() const {
+ assert(Offset && "Null dereference");
+ return &Value;
+ }
+
+ pointer() = default;
+
+ private:
+ friend class OnDiskDataAllocator;
+ pointer(FileOffset Offset, ValueProxy Value)
+ : Offset(Offset), Value(Value) {}
+ FileOffset Offset;
+ ValueProxy Value;
+ };
+
+ // Look up the data stored at the given offset.
+ const char *beginData(FileOffset Offset) const;
+ char *beginData(FileOffset Offset) {
+ return const_cast<char *>(
+ const_cast<const OnDiskDataAllocator *>(this)->beginData(Offset));
+ }
+
+ pointer allocate(size_t Size);
+ pointer save(ArrayRef<char> Data) {
+ pointer P = allocate(Data.size());
+ llvm::copy(Data, P->begin());
+ return P;
+ }
+ pointer save(StringRef Data) {
+ return save(ArrayRef<char>(Data.begin(), Data.size()));
+ }
+
+ /// \returns the buffer that was allocated at \p create time, with size
+ /// \p UserHeaderSize.
+ MutableArrayRef<uint8_t> getUserHeader();
+
+ size_t size() const;
+
+ static Expected<OnDiskDataAllocator>
+ create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ uint32_t UserHeaderSize = 0,
+ function_ref<void(void *)> UserHeaderInit = nullptr);
+
+ OnDiskDataAllocator(OnDiskDataAllocator &&RHS);
+ OnDiskDataAllocator &operator=(OnDiskDataAllocator &&RHS);
+
+ // No copy. Just call \a create() again.
+ OnDiskDataAllocator(const OnDiskDataAllocator &) = delete;
+ OnDiskDataAllocator &operator=(const OnDiskDataAllocator &) = delete;
+
+ ~OnDiskDataAllocator();
+
+private:
+ struct ImplType;
+ explicit OnDiskDataAllocator(std::unique_ptr<ImplType> Impl);
+ std::unique_ptr<ImplType> Impl;
+};
+
+} // namespace cas
+} // namespace llvm
+
+#endif // LLVM_CAS_ONDISKHASHMAPPEDTRIE_H
diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
new file mode 100644
index 0000000..a6b4b12
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
@@ -0,0 +1,63 @@
+//===- OnDiskKeyValueDB.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKKEYVALUEDB_H
+#define LLVM_CAS_ONDISKKEYVALUEDB_H
+
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+
+namespace llvm::cas::ondisk {
+
+/// An on-disk key-value data store with the following properties:
+/// * Keys are fixed length binary hashes with expected normal distribution.
+/// * Values are buffers of the same size, specified at creation time.
+/// * The value of a key cannot be changed once it is set.
+/// * The value buffers returned from a key lookup have 8-byte alignment.
+class OnDiskKeyValueDB {
+public:
+ /// Associate a value with a key.
+ ///
+ /// \param Key the hash bytes for the key
+ /// \param Value the value bytes, same size as \p ValueSize parameter of
+ /// \p open call.
+ ///
+/// \returns the value associated with the \p Key. It may be different from
+/// \p Value if another value is already associated with this key.
+ Expected<ArrayRef<char>> put(ArrayRef<uint8_t> Key, ArrayRef<char> Value);
+
+ /// \returns the value associated with the \p Key, or \p std::nullopt if the
+ /// key does not exist.
+ Expected<std::optional<ArrayRef<char>>> get(ArrayRef<uint8_t> Key);
+
+ /// \returns Total size of stored data.
+ size_t getStorageSize() const { return Cache.size(); }
+
+ /// Open the on-disk store from a directory.
+ ///
+ /// \param Path directory for the on-disk store. The directory will be created
+ /// if it doesn't exist.
+ /// \param HashName Identifier name for the hashing algorithm that is going to
+ /// be used.
+ /// \param KeySize Size for the key hash bytes.
+ /// \param ValueName Identifier name for the values.
+ /// \param ValueSize Size for the value bytes.
+ static Expected<std::unique_ptr<OnDiskKeyValueDB>>
+ open(StringRef Path, StringRef HashName, unsigned KeySize,
+ StringRef ValueName, size_t ValueSize);
+
+private:
+ OnDiskKeyValueDB(size_t ValueSize, OnDiskHashMappedTrie Cache)
+ : ValueSize(ValueSize), Cache(std::move(Cache)) {}
+
+ const size_t ValueSize;
+ OnDiskHashMappedTrie Cache;
+};
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_CAS_ONDISKKEYVALUEDB_H
diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h
index 9cf5336..38ad0e7 100644
--- a/llvm/include/llvm/Support/FileSystem.h
+++ b/llvm/include/llvm/Support/FileSystem.h
@@ -1184,12 +1184,16 @@ openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None,
/// descriptor.
std::error_code
tryLockFile(int FD,
- std::chrono::milliseconds Timeout = std::chrono::milliseconds(0));
+ std::chrono::milliseconds Timeout = std::chrono::milliseconds(0),
+ bool Exclusive = true);
/// Lock the file.
///
/// This function acts as @ref tryLockFile but it waits infinitely.
-std::error_code lockFile(int FD);
+/// \param FD file descriptor to use for locking.
+/// \param Exclusive if \p true use exclusive/writer lock, otherwise use
+/// shared/reader lock.
+std::error_code lockFile(int FD, bool Exclusive = true);
/// Unlock the file.
///
diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap
index b00da6d..d44d395 100644
--- a/llvm/include/module.modulemap
+++ b/llvm/include/module.modulemap
@@ -105,6 +105,12 @@ module LLVM_BinaryFormat {
textual header "llvm/BinaryFormat/MsgPack.def"
}
+module LLVM_CAS {
+ requires cplusplus
+ umbrella "llvm/CAS"
+ module * { export * }
+}
+
module LLVM_Config {
requires cplusplus
umbrella "llvm/Config"
diff --git a/llvm/lib/CAS/ActionCache.cpp b/llvm/lib/CAS/ActionCache.cpp
new file mode 100644
index 0000000..851fee5
--- /dev/null
+++ b/llvm/lib/CAS/ActionCache.cpp
@@ -0,0 +1,22 @@
+//===- ActionCache.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/CASID.h"
+#include "llvm/CAS/ObjectStore.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+void ActionCache::anchor() {}
+
+CacheKey::CacheKey(const CASID &ID) : Key(toStringRef(ID.getHash()).str()) {}
+CacheKey::CacheKey(const ObjectProxy &Proxy)
+ : CacheKey(Proxy.getCAS(), Proxy.getRef()) {}
+CacheKey::CacheKey(const ObjectStore &CAS, const ObjectRef &Ref)
+ : Key(toStringRef(CAS.getID(Ref).getHash())) {}
diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp
new file mode 100644
index 0000000..4fd5499
--- /dev/null
+++ b/llvm/lib/CAS/ActionCaches.cpp
@@ -0,0 +1,99 @@
+//===- ActionCaches.cpp -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BuiltinCAS.h"
+#include "llvm/ADT/TrieRawHashMap.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/Support/BLAKE3.h"
+
+#define DEBUG_TYPE "action-caches"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+namespace {
+
+using HasherT = BLAKE3;
+using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+
+template <size_t Size> class CacheEntry {
+public:
+ CacheEntry() = default;
+ CacheEntry(ArrayRef<uint8_t> Hash) { llvm::copy(Hash, Value.data()); }
+ CacheEntry(const CacheEntry &Entry) { llvm::copy(Entry.Value, Value.data()); }
+ ArrayRef<uint8_t> getValue() const { return Value; }
+
+private:
+ std::array<uint8_t, Size> Value;
+};
+
+class InMemoryActionCache final : public ActionCache {
+public:
+ InMemoryActionCache()
+ : ActionCache(builtin::BuiltinCASContext::getDefaultContext()) {}
+
+ Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result,
+ bool Globally) final;
+ Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
+ bool Globally) const final;
+
+private:
+ using DataT = CacheEntry<sizeof(HashType)>;
+ using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>;
+
+ InMemoryCacheT Cache;
+};
+} // end namespace
+
+static std::string hashToString(ArrayRef<uint8_t> Hash) {
+ SmallString<64> Str;
+ toHex(Hash, /*LowerCase=*/true, Str);
+ return Str.str().str();
+}
+
+static Error createResultCachePoisonedError(StringRef Key,
+ const CASContext &Context,
+ CASID Output,
+ ArrayRef<uint8_t> ExistingOutput) {
+ std::string Existing =
+ CASID::create(&Context, toStringRef(ExistingOutput)).toString();
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "cache poisoned for '" + Key + "' (new='" +
+ Output.toString() + "' vs. existing '" +
+ Existing + "')");
+}
+
+Expected<std::optional<CASID>>
+InMemoryActionCache::getImpl(ArrayRef<uint8_t> Key, bool /*Globally*/) const {
+ auto Result = Cache.find(Key);
+ if (!Result)
+ return std::nullopt;
+ return CASID::create(&getContext(), toStringRef(Result->Data.getValue()));
+}
+
+Error InMemoryActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result,
+ bool /*Globally*/) {
+ DataT Expected(Result.getHash());
+ const InMemoryCacheT::value_type &Cached = *Cache.insertLazy(
+ Key, [&](auto ValueConstructor) { ValueConstructor.emplace(Expected); });
+
+ const DataT &Observed = Cached.Data;
+ if (Expected.getValue() == Observed.getValue())
+ return Error::success();
+
+ return createResultCachePoisonedError(hashToString(Key), getContext(), Result,
+ Observed.getValue());
+}
+
+namespace llvm::cas {
+
+std::unique_ptr<ActionCache> createInMemoryActionCache() {
+ return std::make_unique<InMemoryActionCache>();
+}
+
+} // namespace llvm::cas
diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp
new file mode 100644
index 0000000..73646ad
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinCAS.cpp
@@ -0,0 +1,94 @@
+//===- BuiltinCAS.cpp -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BuiltinCAS.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/Support/Process.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+
+static StringRef getCASIDPrefix() { return "llvmcas://"; }
+void BuiltinCASContext::anchor() {}
+
+Expected<HashType> BuiltinCASContext::parseID(StringRef Reference) {
+ if (!Reference.consume_front(getCASIDPrefix()))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "invalid cas-id '" + Reference + "'");
+
+ // FIXME: Allow shortened references?
+ if (Reference.size() != 2 * sizeof(HashType))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "wrong size for cas-id hash '" + Reference + "'");
+
+ std::string Binary;
+ if (!tryGetFromHex(Reference, Binary))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "invalid hash in cas-id '" + Reference + "'");
+
+ assert(Binary.size() == sizeof(HashType));
+ HashType Digest;
+ llvm::copy(Binary, Digest.data());
+ return Digest;
+}
+
+Expected<CASID> BuiltinCAS::parseID(StringRef Reference) {
+ Expected<HashType> Digest = BuiltinCASContext::parseID(Reference);
+ if (!Digest)
+ return Digest.takeError();
+
+ return CASID::create(&getContext(), toStringRef(*Digest));
+}
+
+void BuiltinCASContext::printID(ArrayRef<uint8_t> Digest, raw_ostream &OS) {
+ SmallString<64> Hash;
+ toHex(Digest, /*LowerCase=*/true, Hash);
+ OS << getCASIDPrefix() << Hash;
+}
+
+void BuiltinCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const {
+ BuiltinCASContext::printID(ID.getHash(), OS);
+}
+
+const BuiltinCASContext &BuiltinCASContext::getDefaultContext() {
+ static BuiltinCASContext DefaultContext;
+ return DefaultContext;
+}
+
+Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) {
+ return storeImpl(BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data),
+ Refs, Data);
+}
+
+Error BuiltinCAS::validate(const CASID &ID) {
+ auto Ref = getReference(ID);
+ if (!Ref)
+ return createUnknownObjectError(ID);
+
+ auto Handle = load(*Ref);
+ if (!Handle)
+ return Handle.takeError();
+
+ auto Proxy = ObjectProxy::load(*this, *Ref, *Handle);
+ SmallVector<ObjectRef> Refs;
+ if (auto E = Proxy.forEachReference([&](ObjectRef Ref) -> Error {
+ Refs.push_back(Ref);
+ return Error::success();
+ }))
+ return E;
+
+ ArrayRef<char> Data(Proxy.getData().data(), Proxy.getData().size());
+ auto Hash = BuiltinObjectHasher<HasherT>::hashObject(*this, Refs, Data);
+ if (!ID.getHash().equals(Hash))
+ return createCorruptObjectError(ID);
+
+ return Error::success();
+}
diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h
new file mode 100644
index 0000000..1a4f640
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinCAS.h
@@ -0,0 +1,74 @@
+//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_BUILTINCAS_H
+#define LLVM_LIB_CAS_BUILTINCAS_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/BuiltinCASContext.h"
+#include "llvm/CAS/ObjectStore.h"
+
+namespace llvm::cas {
+class ActionCache;
+namespace builtin {
+
+class BuiltinCAS : public ObjectStore {
+public:
+ BuiltinCAS() : ObjectStore(BuiltinCASContext::getDefaultContext()) {}
+
+ Expected<CASID> parseID(StringRef Reference) final;
+
+ Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) final;
+ virtual Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
+ ArrayRef<ObjectRef> Refs,
+ ArrayRef<char> Data) = 0;
+
+ virtual Expected<ObjectRef>
+ storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
+ sys::fs::mapped_file_region Map) {
+ return storeImpl(ComputedHash, std::nullopt,
+ ArrayRef(Map.data(), Map.size()));
+ }
+
+ /// Both builtin CAS implementations provide lifetime for free, so this can
+ /// be const, and readData() and getDataSize() can be implemented on top of
+ /// it.
+ virtual ArrayRef<char> getDataConst(ObjectHandle Node) const = 0;
+
+ ArrayRef<char> getData(ObjectHandle Node,
+ bool RequiresNullTerminator) const final {
+ // BuiltinCAS Objects are always null terminated.
+ return getDataConst(Node);
+ }
+ uint64_t getDataSize(ObjectHandle Node) const final {
+ return getDataConst(Node).size();
+ }
+
+ Error createUnknownObjectError(const CASID &ID) const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "unknown object '" + ID.toString() + "'");
+ }
+
+ Error createCorruptObjectError(const CASID &ID) const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "corrupt object '" + ID.toString() + "'");
+ }
+
+ Error createCorruptStorageError() const {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "corrupt storage");
+ }
+
+ Error validate(const CASID &ID) final;
+};
+
+} // end namespace builtin
+} // end namespace llvm::cas
+
+#endif // LLVM_LIB_CAS_BUILTINCAS_H
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
new file mode 100644
index 0000000..66a5579
--- /dev/null
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -0,0 +1,19 @@
+if (LLVM_ENABLE_ONDISK_CAS)
+ add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1)
+endif()
+
+add_llvm_component_library(LLVMCAS
+ ActionCache.cpp
+ ActionCaches.cpp
+ BuiltinCAS.cpp
+ InMemoryCAS.cpp
+ MappedFileRegionBumpPtr.cpp
+ ObjectStore.cpp
+ OnDiskCommon.cpp
+ OnDiskKeyValueDB.cpp
+ OnDiskHashMappedTrie.cpp
+ OnDiskGraphDB.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS
+)
diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp
new file mode 100644
index 0000000..abdd7ed
--- /dev/null
+++ b/llvm/lib/CAS/InMemoryCAS.cpp
@@ -0,0 +1,320 @@
+//===- InMemoryCAS.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BuiltinCAS.h"
+#include "llvm/ADT/LazyAtomicPointer.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/TrieRawHashMap.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ThreadSafeAllocator.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+
+namespace {
+
+class InMemoryObject;
+
+/// Index of referenced IDs (map: Hash -> InMemoryObject*). Uses
+/// LazyAtomicPointer to coordinate creation of objects.
+using InMemoryIndexT =
+ ThreadSafeTrieRawHashMap<LazyAtomicPointer<const InMemoryObject>,
+ sizeof(HashType)>;
+
+/// Values in \a InMemoryIndexT. \a InMemoryObject's point at this to access
+/// their hash.
+using InMemoryIndexValueT = InMemoryIndexT::value_type;
+
/// Base class for objects stored in the in-memory CAS. An object does not own
/// its hash; it points back into the index trie entry that created it.
class InMemoryObject {
public:
  /// Discriminator for the two concrete storage layouts.
  enum class Kind {
    /// Node with refs and data.
    RefNode,

    /// Node with refs and data co-allocated.
    InlineNode,

    Max = InlineNode,
  };

  Kind getKind() const { return IndexAndKind.getInt(); }
  /// The index trie slot this object belongs to; set at construction and
  /// never null.
  const InMemoryIndexValueT &getIndex() const {
    assert(IndexAndKind.getPointer());
    return *IndexAndKind.getPointer();
  }

  /// The hash is stored in the index entry, not in the object itself.
  ArrayRef<uint8_t> getHash() const { return getIndex().Hash; }

  // Objects are created in place via placement new and never moved or copied.
  InMemoryObject() = delete;
  InMemoryObject(InMemoryObject &&) = delete;
  InMemoryObject(const InMemoryObject &) = delete;

protected:
  InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {}

private:
  enum Counts : int {
    NumKindBits = 2,
  };
  // The kind is packed into the low bits of the index pointer; the
  // static_asserts verify the packing cannot corrupt either field.
  PointerIntPair<const InMemoryIndexValueT *, NumKindBits, Kind> IndexAndKind;
  static_assert((1U << NumKindBits) <= alignof(InMemoryIndexValueT),
                "Kind will clobber pointer");
  static_assert(((int)Kind::Max >> NumKindBits) == 0, "Kind will be truncated");

public:
  // Defined out-of-line (after the derived classes) and dispatch manually on
  // the kind.
  inline ArrayRef<char> getData() const;

  inline ArrayRef<const InMemoryObject *> getRefs() const;
};
+
/// Object whose refs and data live elsewhere (e.g. a memory-mapped file); the
/// object itself only stores views of them.
class InMemoryRefObject : public InMemoryObject {
public:
  static constexpr Kind KindValue = Kind::RefNode;
  static bool classof(const InMemoryObject *O) {
    return O->getKind() == KindValue;
  }

  ArrayRef<const InMemoryObject *> getRefsImpl() const { return Refs; }
  ArrayRef<const InMemoryObject *> getRefs() const { return Refs; }
  ArrayRef<char> getDataImpl() const { return Data; }
  ArrayRef<char> getData() const { return Data; }

  /// Construct a node in memory obtained from \p Allocate. \p Refs and
  /// \p Data are captured as views, so their backing storage must outlive
  /// the object.
  static InMemoryRefObject &create(function_ref<void *(size_t Size)> Allocate,
                                   const InMemoryIndexValueT &I,
                                   ArrayRef<const InMemoryObject *> Refs,
                                   ArrayRef<char> Data) {
    void *Mem = Allocate(sizeof(InMemoryRefObject));
    return *new (Mem) InMemoryRefObject(I, Refs, Data);
  }

private:
  InMemoryRefObject(const InMemoryIndexValueT &I,
                    ArrayRef<const InMemoryObject *> Refs, ArrayRef<char> Data)
      : InMemoryObject(KindValue, I), Refs(Refs), Data(Data) {
    assert(isAddrAligned(Align(8), this) && "Expected 8-byte alignment");
    assert(isAddrAligned(Align(8), Data.data()) && "Expected 8-byte alignment");
    // NOTE: reads one byte past Data's range; the caller's contract is that
    // the buffer carries a NUL terminator just past the end (e.g. a
    // null-terminated mapped region).
    assert(*Data.end() == 0 && "Expected null-termination");
  }

  ArrayRef<const InMemoryObject *> Refs;
  ArrayRef<char> Data;
};
+
/// Object whose refs and data are co-allocated immediately after the object
/// header. Memory layout: [InMemoryInlineObject][refs...][data...][NUL].
class InMemoryInlineObject : public InMemoryObject {
public:
  static constexpr Kind KindValue = Kind::InlineNode;
  static bool classof(const InMemoryObject *O) {
    return O->getKind() == KindValue;
  }

  /// Refs start directly after the object header (this + 1).
  ArrayRef<const InMemoryObject *> getRefs() const { return getRefsImpl(); }
  ArrayRef<const InMemoryObject *> getRefsImpl() const {
    return ArrayRef(reinterpret_cast<const InMemoryObject *const *>(this + 1),
                    NumRefs);
  }

  /// Data starts directly after the trailing refs array.
  ArrayRef<char> getData() const { return getDataImpl(); }
  ArrayRef<char> getDataImpl() const {
    ArrayRef<const InMemoryObject *> Refs = getRefs();
    return ArrayRef(reinterpret_cast<const char *>(Refs.data() + Refs.size()),
                    DataSize);
  }

  /// Allocate header + refs + data + 1 byte for the NUL terminator, then
  /// construct in place; refs and data are copied into the allocation.
  static InMemoryInlineObject &
  create(function_ref<void *(size_t Size)> Allocate,
         const InMemoryIndexValueT &I, ArrayRef<const InMemoryObject *> Refs,
         ArrayRef<char> Data) {
    void *Mem = Allocate(sizeof(InMemoryInlineObject) +
                         sizeof(uintptr_t) * Refs.size() + Data.size() + 1);
    return *new (Mem) InMemoryInlineObject(I, Refs, Data);
  }

private:
  InMemoryInlineObject(const InMemoryIndexValueT &I,
                       ArrayRef<const InMemoryObject *> Refs,
                       ArrayRef<char> Data)
      : InMemoryObject(KindValue, I), NumRefs(Refs.size()),
        DataSize(Data.size()) {
    auto *BeginRefs = reinterpret_cast<const InMemoryObject **>(this + 1);
    llvm::copy(Refs, BeginRefs);
    auto *BeginData = reinterpret_cast<char *>(BeginRefs + NumRefs);
    llvm::copy(Data, BeginData);
    // Guarantee null-termination so getData() callers never need a copy.
    BeginData[Data.size()] = 0;
  }
  uint32_t NumRefs;
  uint32_t DataSize;
};
+
/// In-memory CAS database and action cache (the latter should be separated).
class InMemoryCAS : public BuiltinCAS {
public:
  Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
                                ArrayRef<ObjectRef> Refs,
                                ArrayRef<char> Data) final;

  Expected<ObjectRef>
  storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
                                sys::fs::mapped_file_region Map) override;

  /// Build a CASID from an index entry's hash bytes.
  CASID getID(const InMemoryIndexValueT &I) const {
    StringRef Hash = toStringRef(I.Hash);
    return CASID::create(&getContext(), Hash);
  }
  CASID getID(const InMemoryObject &O) const { return getID(O.getIndex()); }

  /// Handles are just the object's address; the low bit must be clear so the
  /// value can be distinguished/packed by the reference encoding.
  ObjectHandle getObjectHandle(const InMemoryObject &Node) const {
    assert(!(reinterpret_cast<uintptr_t>(&Node) & 0x1ULL));
    return makeObjectHandle(reinterpret_cast<uintptr_t>(&Node));
  }

  // In-memory refs always point at a live object, so loading cannot fail.
  Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) override {
    return getObjectHandle(asInMemoryObject(Ref));
  }

  /// Get-or-create the trie slot for \p Hash; a fresh slot starts as nullptr
  /// until an object is generated for it.
  InMemoryIndexValueT &indexHash(ArrayRef<uint8_t> Hash) {
    return *Index.insertLazy(
        Hash, [](auto ValueConstructor) { ValueConstructor.emplace(nullptr); });
  }

  /// TODO: Consider callers to actually do an insert and to return a handle to
  /// the slot in the trie.
  const InMemoryObject *getInMemoryObject(CASID ID) const {
    assert(ID.getContext().getHashSchemaIdentifier() ==
               getContext().getHashSchemaIdentifier() &&
           "Expected ID from same hash schema");
    if (InMemoryIndexT::const_pointer P = Index.find(ID.getHash()))
      return P->Data;
    return nullptr;
  }

  // Inverse of getObjectHandle(): decode the address stored in the handle.
  const InMemoryObject &getInMemoryObject(ObjectHandle OH) const {
    return *reinterpret_cast<const InMemoryObject *>(
        (uintptr_t)OH.getInternalRef(*this));
  }

  const InMemoryObject &asInMemoryObject(ReferenceBase Ref) const {
    uintptr_t P = Ref.getInternalRef(*this);
    return *reinterpret_cast<const InMemoryObject *>(P);
  }
  ObjectRef toReference(const InMemoryObject &O) const {
    return makeObjectRef(reinterpret_cast<uintptr_t>(&O));
  }

  CASID getID(ObjectRef Ref) const final { return getIDImpl(Ref); }
  CASID getIDImpl(ReferenceBase Ref) const {
    return getID(asInMemoryObject(Ref));
  }

  std::optional<ObjectRef> getReference(const CASID &ID) const final {
    if (const InMemoryObject *Object = getInMemoryObject(ID))
      return toReference(*Object);
    return std::nullopt;
  }

  // Everything in an in-memory CAS is materialized by construction.
  Expected<bool> isMaterialized(ObjectRef Ref) const final { return true; }

  ArrayRef<char> getDataConst(ObjectHandle Node) const final {
    return cast<InMemoryObject>(asInMemoryObject(Node)).getData();
  }

  InMemoryCAS() = default;

private:
  size_t getNumRefs(ObjectHandle Node) const final {
    return getInMemoryObject(Node).getRefs().size();
  }
  ObjectRef readRef(ObjectHandle Node, size_t I) const final {
    return toReference(*getInMemoryObject(Node).getRefs()[I]);
  }
  Error forEachRef(ObjectHandle Node,
                   function_ref<Error(ObjectRef)> Callback) const final;

  /// Index of referenced IDs (map: Hash -> InMemoryObject*). Mapped to nullptr
  /// as a convenient way to store hashes.
  ///
  /// - Insert nullptr on lookups.
  /// - InMemoryObject points back to here.
  InMemoryIndexT Index;

  // Thread-safe arenas: object storage, and mapped regions kept alive because
  // a winning RefNode points into them (see storeFromNullTerminatedRegion).
  ThreadSafeAllocator<BumpPtrAllocator> Objects;
  ThreadSafeAllocator<SpecificBumpPtrAllocator<sys::fs::mapped_file_region>>
      MemoryMaps;
};
+
+} // end anonymous namespace
+
+ArrayRef<char> InMemoryObject::getData() const {
+ if (auto *Derived = dyn_cast<InMemoryRefObject>(this))
+ return Derived->getDataImpl();
+ return cast<InMemoryInlineObject>(this)->getDataImpl();
+}
+
+ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const {
+ if (auto *Derived = dyn_cast<InMemoryRefObject>(this))
+ return Derived->getRefsImpl();
+ return cast<InMemoryInlineObject>(this)->getRefsImpl();
+}
+
/// Store a leaf object backed directly by a null-terminated mapped file
/// region, avoiding a copy of the contents. Thread-safe: racing stores of the
/// same hash produce exactly one object via loadOrGenerate, and the losing
/// thread's Map is simply dropped.
Expected<ObjectRef>
InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,
                                           sys::fs::mapped_file_region Map) {
  // Look up the hash in the index, initializing to nullptr if it's new.
  ArrayRef<char> Data(Map.data(), Map.size());
  auto &I = indexHash(ComputedHash);

  // Load or generate.
  auto Allocator = [&](size_t Size) -> void * {
    return Objects.Allocate(Size, alignof(InMemoryObject));
  };
  auto Generator = [&]() -> const InMemoryObject * {
    return &InMemoryRefObject::create(Allocator, I, std::nullopt, Data);
  };
  const InMemoryObject &Node =
      cast<InMemoryObject>(I.Data.loadOrGenerate(Generator));

  // Save Map if the winning node uses it.
  // (If another thread won the race, its node points at different storage and
  // our Map is released when it goes out of scope.)
  if (auto *RefNode = dyn_cast<InMemoryRefObject>(&Node))
    if (RefNode->getData().data() == Map.data())
      new (MemoryMaps.Allocate(1)) sys::fs::mapped_file_region(std::move(Map));

  return toReference(Node);
}
+
/// Store an object with the given (already computed) hash, refs and data,
/// co-allocating refs and data inline. Idempotent under races: loadOrGenerate
/// ensures only one object is ever created per hash.
Expected<ObjectRef> InMemoryCAS::storeImpl(ArrayRef<uint8_t> ComputedHash,
                                           ArrayRef<ObjectRef> Refs,
                                           ArrayRef<char> Data) {
  // Look up the hash in the index, initializing to nullptr if it's new.
  auto &I = indexHash(ComputedHash);

  // Create the node.
  // Translate external refs into internal object pointers first.
  SmallVector<const InMemoryObject *> InternalRefs;
  for (ObjectRef Ref : Refs)
    InternalRefs.push_back(&asInMemoryObject(Ref));
  auto Allocator = [&](size_t Size) -> void * {
    return Objects.Allocate(Size, alignof(InMemoryObject));
  };
  auto Generator = [&]() -> const InMemoryObject * {
    return &InMemoryInlineObject::create(Allocator, I, InternalRefs, Data);
  };
  return toReference(cast<InMemoryObject>(I.Data.loadOrGenerate(Generator)));
}
+
+Error InMemoryCAS::forEachRef(ObjectHandle Handle,
+ function_ref<Error(ObjectRef)> Callback) const {
+ auto &Node = getInMemoryObject(Handle);
+ for (const InMemoryObject *Ref : Node.getRefs())
+ if (Error E = Callback(toReference(*Ref)))
+ return E;
+ return Error::success();
+}
+
/// Create a fresh, empty in-memory CAS instance.
std::unique_ptr<ObjectStore> cas::createInMemoryCAS() {
  return std::make_unique<InMemoryCAS>();
}
diff --git a/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp
new file mode 100644
index 0000000..2222644
--- /dev/null
+++ b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp
@@ -0,0 +1,233 @@
+//===- MappedFileRegionBumpPtr.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A bump pointer allocator, backed by a memory-mapped file.
+///
+/// The effect we want is:
+///
+/// 1. If it doesn't exist, create the file with an initial size.
+/// 2. Reserve virtual memory large enough for the max file size.
+/// 3. Map the file into memory in the reserved region.
+/// 4. Increase the file size and update the mapping when necessary.
+///
+/// However, updating the mapping is challenging when it needs to work portably,
+/// and across multiple processes without locking for every read. Our current
+/// implementation strategy is:
+///
+/// 1. Use \c ftruncate (\c sys::fs::resize_file) to grow the file to its max
+/// size (typically several GB). Many modern filesystems will create a sparse
+/// file, so that the trailing unused pages do not take space on disk.
+/// 2. Call \c mmap (\c sys::fs::mapped_file_region)
+/// 3. [Automatic as part of 2.]
+/// 4. [Automatic as part of 2.]
+///
+/// Additionally, we attempt to resize the file to its actual data size when
+/// closing the mapping, if this is the only concurrent instance. This is done
+/// using file locks. Shrinking the file mitigates problems with having large
+/// files: on filesystems without sparse files it avoids unnecessary space use;
+/// it also avoids allocating the full size if another process copies the file,
+/// which typically loses sparseness. These mitigations only work while the file
+/// is not in use.
+///
+/// FIXME: we assume that all concurrent users of the file will use the same
+/// value for Capacity. Otherwise a process with a larger capacity can write
+/// data that is "out of bounds" for processes with smaller capacity. Currently
+/// this is true in the CAS.
+///
+/// To support resizing, we use two separate file locks:
+/// 1. We use a shared reader lock on a ".shared" file until destruction.
+/// 2. We use a lock on the main file during initialization - shared to check
+/// the status, upgraded to exclusive to resize/initialize the file.
+///
+/// Then during destruction we attempt to get exclusive access on (1), which
+/// requires no concurrent readers. If so, we shrink the file. Using two
+/// separate locks simplifies the implementation and enables it to work on
+/// platforms (e.g. Windows) where a shared/reader lock prevents writing.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/MappedFileRegionBumpPtr.h"
+#include "OnDiskCommon.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+namespace {
/// RAII wrapper for a file lock on \c FD. Tracks whether a lock is currently
/// held (and of which kind) so the destructor only unlocks when needed; any
/// unlock error in the destructor is consumed.
struct FileLockRAII {
  std::string Path;
  int FD;
  enum LockKind { Shared, Exclusive };
  // The currently held lock, if any. Calling lock() again (e.g. to upgrade
  // Shared -> Exclusive) overwrites this.
  std::optional<LockKind> Locked;

  FileLockRAII(std::string Path, int FD) : Path(std::move(Path)), FD(FD) {}
  ~FileLockRAII() { consumeError(unlock()); }

  /// Take (or change to) a lock of kind \p LK; blocks until acquired.
  Error lock(LockKind LK) {
    if (std::error_code EC = lockFileThreadSafe(FD, LK == Exclusive))
      return createFileError(Path, EC);
    Locked = LK;
    return Error::success();
  }

  /// Release the lock if held. Safe to call multiple times.
  Error unlock() {
    if (Locked) {
      Locked = std::nullopt;
      if (std::error_code EC = unlockFileThreadSafe(FD))
        return createFileError(Path, EC);
    }
    return Error::success();
  }
};
+} // end anonymous namespace
+
/// Open (creating if necessary) the file at \p Path, grow it to \p Capacity,
/// and map it read-write. See the file comment for the locking scheme: a
/// shared lock on "<Path>.shared" is held until destruction, and the main
/// file is locked (shared, upgraded to exclusive if resizing is needed)
/// during initialization only.
Expected<MappedFileRegionBumpPtr> MappedFileRegionBumpPtr::create(
    const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset,
    function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor) {
  MappedFileRegionBumpPtr Result;
  Result.Path = Path.str();
  // Open the main file.
  int FD;
  if (std::error_code EC = sys::fs::openFileForReadWrite(
          Result.Path, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
    return createFileError(Path, EC);
  Result.FD = FD;

  // Open the shared lock file. See file comment for details of locking scheme.
  SmallString<128> SharedLockPath(Result.Path);
  SharedLockPath.append(".shared");
  int SharedLockFD;
  if (std::error_code EC = sys::fs::openFileForReadWrite(
          SharedLockPath, SharedLockFD, sys::fs::CD_OpenAlways,
          sys::fs::OF_None))
    return createFileError(SharedLockPath, EC);
  Result.SharedLockFD = SharedLockFD;

  // Take shared/reader lock that will be held until we close the file; unlocked
  // by destroyImpl.
  if (std::error_code EC =
          lockFileThreadSafe(SharedLockFD, /*Exclusive=*/false))
    return createFileError(Path, EC);

  // Take shared/reader lock for initialization.
  FileLockRAII InitLock(Result.Path, FD);
  if (Error E = InitLock.lock(FileLockRAII::Shared))
    return std::move(E);

  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
  sys::fs::file_status Status;
  if (std::error_code EC = sys::fs::status(File, Status))
    return createFileError(Result.Path, EC);

  if (Status.getSize() < Capacity) {
    // Lock the file exclusively so only one process will do the initialization.
    if (Error E = InitLock.unlock())
      return std::move(E);
    if (Error E = InitLock.lock(FileLockRAII::Exclusive))
      return std::move(E);
    // Retrieve the current size now that we have exclusive access.
    // (Another process may have resized it between the unlock and the
    // exclusive lock.)
    if (std::error_code EC = sys::fs::status(File, Status))
      return createFileError(Result.Path, EC);
  }

  // At this point either the file is still under-sized, or we have the size for
  // the completely initialized file.

  if (Status.getSize() < Capacity) {
    // We are initializing the file; it may be empty, or may have been shrunk
    // during a previous close.
    // FIXME: Detect a case where someone opened it with a smaller capacity.
    // FIXME: On Windows we should use FSCTL_SET_SPARSE and FSCTL_SET_ZERO_DATA
    // to make this a sparse region, if supported.
    if (std::error_code EC = sys::fs::resize_file(FD, Capacity))
      return createFileError(Result.Path, EC);
  } else {
    // Someone else initialized it.
    Capacity = Status.getSize();
  }

  // Create the mapped region.
  {
    std::error_code EC;
    sys::fs::mapped_file_region Map(
        File, sys::fs::mapped_file_region::readwrite, Capacity, 0, EC);
    if (EC)
      return createFileError(Result.Path, EC);
    Result.Region = std::move(Map);
  }

  if (Status.getSize() == 0) {
    // We are creating a new file; run the constructor.
    if (Error E = NewFileConstructor(Result))
      return std::move(E);
  } else {
    Result.initializeBumpPtr(BumpPtrOffset);
  }

  return Result;
}
+
/// Tear down the mapping: drop our shared lock, opportunistically shrink the
/// file to its used size if no other process holds the shared lock, then
/// close both file descriptors. All errors are deliberately ignored — this
/// runs from the destructor.
void MappedFileRegionBumpPtr::destroyImpl() {
  if (!FD)
    return;

  // Drop the shared lock indicating we are no longer accessing the file.
  if (SharedLockFD)
    (void)unlockFileThreadSafe(*SharedLockFD);

  // Attempt to truncate the file if we can get exclusive access. Ignore any
  // errors.
  if (BumpPtr) {
    assert(SharedLockFD && "Must have shared lock file open");
    // tryLockFileThreadSafe succeeds only when there are no concurrent
    // readers, so the shrink cannot pull pages out from under anyone.
    if (tryLockFileThreadSafe(*SharedLockFD) == std::error_code()) {
      assert(size() <= capacity());
      (void)sys::fs::resize_file(*FD, size());
      (void)unlockFileThreadSafe(*SharedLockFD);
    }
  }

  auto Close = [](std::optional<int> &FD) {
    if (FD) {
      sys::fs::file_t File = sys::fs::convertFDToNativeFile(*FD);
      sys::fs::closeFile(File);
      FD = std::nullopt;
    }
  };

  // Close the file and shared lock.
  Close(FD);
  Close(SharedLockFD);
}
+
/// Point BumpPtr at the atomic allocation counter stored inside the mapped
/// region at \p BumpPtrOffset, and initialize it (once, across processes) to
/// the first offset past the counter itself.
void MappedFileRegionBumpPtr::initializeBumpPtr(int64_t BumpPtrOffset) {
  assert(capacity() < (uint64_t)INT64_MAX && "capacity must fit in int64_t");
  int64_t BumpPtrEndOffset = BumpPtrOffset + sizeof(decltype(*BumpPtr));
  assert(BumpPtrEndOffset <= (int64_t)capacity() &&
         "Expected end offset to be pre-allocated");
  assert(isAligned(Align::Of<decltype(*BumpPtr)>(), BumpPtrOffset) &&
         "Expected end offset to be aligned");
  BumpPtr = reinterpret_cast<decltype(BumpPtr)>(data() + BumpPtrOffset);

  // A freshly-truncated file reads as zero; CAS it to the initial value. If
  // the exchange fails another user already initialized (or advanced) it, and
  // the current value must already be at least past the counter.
  int64_t ExistingValue = 0;
  if (!BumpPtr->compare_exchange_strong(ExistingValue, BumpPtrEndOffset))
    assert(ExistingValue >= BumpPtrEndOffset &&
           "Expected 0, or past the end of the BumpPtr itself");
}
+
+int64_t MappedFileRegionBumpPtr::allocateOffset(uint64_t AllocSize) {
+ AllocSize = alignTo(AllocSize, getAlign());
+ int64_t OldEnd = BumpPtr->fetch_add(AllocSize);
+ int64_t NewEnd = OldEnd + AllocSize;
+ if (LLVM_UNLIKELY(NewEnd > (int64_t)capacity())) {
+ // Try to return the allocation.
+ (void)BumpPtr->compare_exchange_strong(OldEnd, NewEnd);
+ report_fatal_error(
+ errorCodeToError(std::make_error_code(std::errc::not_enough_memory)));
+ }
+ return OldEnd;
+}
diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp
new file mode 100644
index 0000000..a938c4e
--- /dev/null
+++ b/llvm/lib/CAS/ObjectStore.cpp
@@ -0,0 +1,168 @@
+//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
// Out-of-line anchors pin the vtables for these classes to this translation
// unit.
void CASContext::anchor() {}
void ObjectStore::anchor() {}

// Debugger-invokable dumps; all forward to print() on dbgs().
LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); }
LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); }
LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); }
LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); }
+
+std::string CASID::toString() const {
+ std::string S;
+ raw_string_ostream(S) << *this;
+ return S;
+}
+
/// Shared printing helper: emits "<kind>=<internal-ref>" and, when an ID is
/// available, appends "[<cas-id>]".
static void printReferenceBase(raw_ostream &OS, StringRef Kind,
                               uint64_t InternalRef, std::optional<CASID> ID) {
  OS << Kind << "=" << InternalRef;
  if (ID)
    OS << "[" << *ID << "]";
}

void ReferenceBase::print(raw_ostream &OS, const ObjectHandle &This) const {
  assert(this == &This);
  // Handles never carry a CASID.
  printReferenceBase(OS, "object-handle", InternalRef, std::nullopt);
}

void ReferenceBase::print(raw_ostream &OS, const ObjectRef &This) const {
  assert(this == &This);

  // The owning CAS pointer is only tracked in ABI-breaking-checks builds, so
  // the printed ID is best-effort.
  std::optional<CASID> ID;
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
  if (CAS)
    ID = CAS->getID(This);
#endif
  printReferenceBase(OS, "object-ref", InternalRef, ID);
}
+
+Expected<ObjectHandle> ObjectStore::load(ObjectRef Ref) {
+ std::optional<ObjectHandle> Handle;
+ if (Error E = loadIfExists(Ref).moveInto(Handle))
+ return std::move(E);
+ if (!Handle)
+ return createStringError(errc::invalid_argument,
+ "missing object '" + getID(Ref).toString() + "'");
+ return *Handle;
+}
+
/// Wrap \p Node's data in a (non-owning) MemoryBuffer named \p Name. The
/// buffer borrows the CAS-owned bytes; it must not outlive this store.
std::unique_ptr<MemoryBuffer>
ObjectStore::getMemoryBuffer(ObjectHandle Node, StringRef Name,
                             bool RequiresNullTerminator) {
  return MemoryBuffer::getMemBuffer(
      toStringRef(getData(Node, RequiresNullTerminator)), Name,
      RequiresNullTerminator);
}

/// Append all of \p Node's refs to \p Refs.
void ObjectStore::readRefs(ObjectHandle Node,
                           SmallVectorImpl<ObjectRef> &Refs) const {
  // The callback never fails, so the consumed error is always "success".
  consumeError(forEachRef(Node, [&Refs](ObjectRef Ref) -> Error {
    Refs.push_back(Ref);
    return Error::success();
  }));
}
+
+Expected<ObjectProxy> ObjectStore::getProxy(const CASID &ID) {
+ std::optional<ObjectRef> Ref = getReference(ID);
+ if (!Ref)
+ return createUnknownObjectError(ID);
+
+ return getProxy(*Ref);
+}
+
+Expected<ObjectProxy> ObjectStore::getProxy(ObjectRef Ref) {
+ std::optional<ObjectHandle> H;
+ if (Error E = load(Ref).moveInto(H))
+ return std::move(E);
+
+ return ObjectProxy::load(*this, Ref, *H);
+}
+
+Expected<std::optional<ObjectProxy>>
+ObjectStore::getProxyIfExists(ObjectRef Ref) {
+ std::optional<ObjectHandle> H;
+ if (Error E = loadIfExists(Ref).moveInto(H))
+ return std::move(E);
+ if (!H)
+ return std::nullopt;
+ return ObjectProxy::load(*this, Ref, *H);
+}
+
+Error ObjectStore::createUnknownObjectError(const CASID &ID) {
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ "unknown object '" + ID.toString() + "'");
+}
+
+Expected<ObjectProxy> ObjectStore::createProxy(ArrayRef<ObjectRef> Refs,
+ StringRef Data) {
+ Expected<ObjectRef> Ref = store(Refs, arrayRefFromStringRef<char>(Data));
+ if (!Ref)
+ return Ref.takeError();
+ return getProxy(*Ref);
+}
+
/// Store the contents of an open file as a leaf object by reading it fully
/// into memory first.
Expected<ObjectRef>
ObjectStore::storeFromOpenFileImpl(sys::fs::file_t FD,
                                   std::optional<sys::fs::file_status> Status) {
  // Copy the file into an immutable memory buffer and call \c store on that.
  // Using \c mmap would be unsafe because there's a race window between when we
  // get the digest hash for the \c mmap contents and when we store the data; if
  // the file changes in-between we will create an invalid object.

  // FIXME: For the on-disk CAS implementation use cloning to store it as a
  // standalone file if the file-system supports it and the file is large.

  // NOTE(review): Status is currently unused on this path — presumably kept
  // for overrides that can pre-size or clone; confirm against callers.
  constexpr size_t ChunkSize = 4 * 4096;
  SmallString<0> Data;
  Data.reserve(ChunkSize * 2);
  if (Error E = sys::fs::readNativeFileToEOF(FD, Data, ChunkSize))
    return std::move(E);
  // Stored with no refs: file contents become a leaf node.
  return store(std::nullopt, ArrayRef(Data.data(), Data.size()));
}
+
+Error ObjectStore::validateTree(ObjectRef Root) {
+ SmallDenseSet<ObjectRef> ValidatedRefs;
+ SmallVector<ObjectRef, 16> RefsToValidate;
+ RefsToValidate.push_back(Root);
+
+ while (!RefsToValidate.empty()) {
+ ObjectRef Ref = RefsToValidate.pop_back_val();
+ auto [I, Inserted] = ValidatedRefs.insert(Ref);
+ if (!Inserted)
+ continue; // already validated.
+ if (Error E = validate(getID(Ref)))
+ return E;
+ Expected<ObjectHandle> Obj = load(Ref);
+ if (!Obj)
+ return Obj.takeError();
+ if (Error E = forEachRef(*Obj, [&RefsToValidate](ObjectRef R) -> Error {
+ RefsToValidate.push_back(R);
+ return Error::success();
+ }))
+ return E;
+ }
+ return Error::success();
+}
+
/// Convenience forwarder to ObjectStore::getMemoryBuffer for this proxy's
/// handle; the returned buffer borrows CAS-owned memory.
std::unique_ptr<MemoryBuffer>
ObjectProxy::getMemoryBuffer(StringRef Name,
                             bool RequiresNullTerminator) const {
  return CAS->getMemoryBuffer(H, Name, RequiresNullTerminator);
}
diff --git a/llvm/lib/CAS/OnDiskCommon.cpp b/llvm/lib/CAS/OnDiskCommon.cpp
new file mode 100644
index 0000000..da3a424
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskCommon.cpp
@@ -0,0 +1,108 @@
+//===- OnDiskCommon.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "OnDiskCommon.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Process.h"
+#include <thread>
+
+#if __has_include(<sys/file.h>)
+#include <sys/file.h>
+#ifdef LOCK_SH
+#define HAVE_FLOCK 1
+#else
+#define HAVE_FLOCK 0
+#endif
+#endif
+
+using namespace llvm;
+
+static uint64_t OnDiskCASMaxMappingSize = 0;
+
+Expected<std::optional<uint64_t>> cas::ondisk::getOverriddenMaxMappingSize() {
+ static std::once_flag Flag;
+ Error Err = Error::success();
+ std::call_once(Flag, [&Err] {
+ ErrorAsOutParameter EAO(&Err);
+ constexpr const char *EnvVar = "LLVM_CAS_MAX_MAPPING_SIZE";
+ auto Value = sys::Process::GetEnv(EnvVar);
+ if (!Value)
+ return;
+
+ uint64_t Size;
+ if (StringRef(*Value).getAsInteger(/*auto*/ 0, Size))
+ Err = createStringError(inconvertibleErrorCode(),
+ "invalid value for %s: expected integer", EnvVar);
+ OnDiskCASMaxMappingSize = Size;
+ });
+
+ if (Err)
+ return std::move(Err);
+
+ if (OnDiskCASMaxMappingSize == 0)
+ return std::nullopt;
+
+ return OnDiskCASMaxMappingSize;
+}
+
/// Programmatically override the max mapping size. Not thread-safe; per the
/// header, call before creating any on-disk CAS. Pass 0 to restore the
/// default.
void cas::ondisk::setMaxMappingSize(uint64_t Size) {
  OnDiskCASMaxMappingSize = Size;
}
+
/// Blocking lock on \p FD: flock() where available (thread-safe, unlike POSIX
/// fcntl locks which are per-process), sys::fs::lockFile on Windows, and
/// "no lock available" elsewhere.
std::error_code cas::ondisk::lockFileThreadSafe(int FD, bool Exclusive) {
#if HAVE_FLOCK
  if (flock(FD, Exclusive ? LOCK_EX : LOCK_SH) == 0)
    return std::error_code();
  return std::error_code(errno, std::generic_category());
#elif defined(_WIN32)
  // On Windows this implementation is thread-safe.
  return sys::fs::lockFile(FD, Exclusive);
#else
  return make_error_code(std::errc::no_lock_available);
#endif
}

/// Release a lock taken by lockFileThreadSafe/tryLockFileThreadSafe.
std::error_code cas::ondisk::unlockFileThreadSafe(int FD) {
#if HAVE_FLOCK
  if (flock(FD, LOCK_UN) == 0)
    return std::error_code();
  return std::error_code(errno, std::generic_category());
#elif defined(_WIN32)
  // On Windows this implementation is thread-safe.
  return sys::fs::unlockFile(FD);
#else
  return make_error_code(std::errc::no_lock_available);
#endif
}

/// Non-blocking lock with a polling timeout. With the default zero timeout
/// the do-while still makes exactly one attempt.
std::error_code
cas::ondisk::tryLockFileThreadSafe(int FD, std::chrono::milliseconds Timeout,
                                   bool Exclusive) {
#if HAVE_FLOCK
  auto Start = std::chrono::steady_clock::now();
  auto End = Start + Timeout;
  do {
    if (flock(FD, (Exclusive ? LOCK_EX : LOCK_SH) | LOCK_NB) == 0)
      return std::error_code();
    int Error = errno;
    if (Error == EWOULDBLOCK) {
      // Match sys::fs::tryLockFile, which sleeps for 1 ms per attempt.
      // NOTE(review): this sleeps before re-checking the deadline, so a
      // contended call can overshoot Timeout by up to ~1 ms.
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
      continue;
    }
    return std::error_code(Error, std::generic_category());
  } while (std::chrono::steady_clock::now() < End);
  return make_error_code(std::errc::no_lock_available);
#elif defined(_WIN32)
  // On Windows this implementation is thread-safe.
  return sys::fs::tryLockFile(FD, Timeout, Exclusive);
#else
  return make_error_code(std::errc::no_lock_available);
#endif
}
diff --git a/llvm/lib/CAS/OnDiskCommon.h b/llvm/lib/CAS/OnDiskCommon.h
new file mode 100644
index 0000000..c8dc0c9
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskCommon.h
@@ -0,0 +1,48 @@
+//===- OnDiskCommon.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_ONDISKCOMMON_H
+#define LLVM_LIB_CAS_ONDISKCOMMON_H
+
+#include "llvm/Support/Error.h"
+#include <chrono>
+#include <optional>
+
+namespace llvm::cas::ondisk {
+
+/// Retrieves an overridden maximum mapping size for CAS files, if any,
+/// specified by LLVM_CAS_MAX_MAPPING_SIZE in the environment or set by
+/// `setMaxMappingSize()`. If the value from environment is unreadable, returns
+/// an error.
+Expected<std::optional<uint64_t>> getOverriddenMaxMappingSize();
+
+/// Set MaxMappingSize for ondisk CAS. This function is not thread-safe and
+/// should be set before creating any ondisk CAS and does not affect CAS already
+/// created. Set value 0 to use default size.
+void setMaxMappingSize(uint64_t Size);
+
+/// Thread-safe alternative to \c sys::fs::lockFile. This does not support all
+/// the platforms that \c sys::fs::lockFile does, so keep it in the CAS library
+/// for now.
+std::error_code lockFileThreadSafe(int FD, bool Exclusive = true);
+
+/// Thread-safe alternative to \c sys::fs::unlockFile. This does not support all
+/// the platforms that \c sys::fs::lockFile does, so keep it in the CAS library
+/// for now.
+std::error_code unlockFileThreadSafe(int FD);
+
+/// Thread-safe alternative to \c sys::fs::tryLockFile. This does not support
+/// all the platforms that \c sys::fs::lockFile does, so keep it in the CAS
+/// library for now.
+std::error_code tryLockFileThreadSafe(
+ int FD, std::chrono::milliseconds Timeout = std::chrono::milliseconds(0),
+ bool Exclusive = true);
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_LIB_CAS_ONDISKCOMMON_H
diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp
new file mode 100644
index 0000000..d804ca1
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskGraphDB.cpp
@@ -0,0 +1,1548 @@
+//===- OnDiskGraphDB.cpp ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// On-disk CAS nodes database, independent of a particular hashing algorithm.
+//
+// Here's a top-level description of the current layout (could expose or make
+// this configurable in the future).
+//
+// Files, each with a prefix set by \a FilePrefix:
+//
+// - db/<prefix>.index: a file for the "index" table, named by \a
+// IndexTableName and managed by \a HashMappedTrie. The contents are 8B
+// that are accessed atomically, describing the object kind and where/how
+// it's stored (including an optional file offset). See \a TrieRecord for
+// more details.
+// - db/<prefix>.data: a file for the "data" table, named by \a
+// DataPoolTableName and managed by \a DataStore. New objects within
+// TrieRecord::MaxEmbeddedSize are inserted here as \a
+// TrieRecord::StorageKind::DataPool.
+// - db/<prefix>.<offset>.data: a file storing an object outside the main
+// "data" table, named by its offset into the "index" table, with the
+// format of \a TrieRecord::StorageKind::Standalone.
+// - db/<prefix>.<offset>.leaf: a file storing a leaf node outside the
+// main "data" table, named by its offset into the "index" table, with
+// the format of \a TrieRecord::StorageKind::StandaloneLeaf.
+// - db/<prefix>.<offset>.leaf+0: a file storing a leaf object outside the
+// main "data" table, named by its offset into the "index" table, with
+// the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
+//
+// The "index", and "data" tables could be stored in a single file,
+// (using a root record that points at the two types of stores), but splitting
+// the files seems more convenient for now.
+//
+// ObjectID: this is a pointer to Trie record
+//
+// ObjectHandle: this is a pointer to Data record
+//
+// Eventually: consider creating a StringPool for strings instead of using
+// RecordDataStore table.
+// - Lookup by prefix tree
+// - Store by suffix tree
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+
+#if __has_include(<sys/mount.h>)
+#include <sys/mount.h> // statfs
+#endif
+
+#define DEBUG_TYPE "on-disk-cas"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+static constexpr StringLiteral IndexTableName = "llvm.cas.index";
+static constexpr StringLiteral DataPoolTableName = "llvm.cas.data";
+
+static constexpr StringLiteral IndexFile = "index";
+static constexpr StringLiteral DataPoolFile = "data";
+
+static constexpr StringLiteral FilePrefix = "v1.";
+static constexpr StringLiteral FileSuffixData = ".data";
+static constexpr StringLiteral FileSuffixLeaf = ".leaf";
+static constexpr StringLiteral FileSuffixLeaf0 = ".leaf+0";
+
+static Error createCorruptObjectError(ArrayRef<uint8_t> ID) {
+ return createStringError(llvm::errc::invalid_argument,
+ "corrupt object '" + toHex(ID) + "'");
+}
+
+namespace {
+
+/// Trie record data: 8B, atomic<uint64_t>
+/// - 1-byte: StorageKind
+/// - 7-bytes: DataStoreOffset (offset into referenced file)
+class TrieRecord {
+public:
+  enum class StorageKind : uint8_t {
+    /// Unknown object.
+    Unknown = 0,
+
+    /// vX.data: main pool, full DataStore record.
+    DataPool = 1,
+
+    /// vX.<TrieRecordOffset>.data: standalone, with a full DataStore record.
+    Standalone = 10,
+
+    /// vX.<TrieRecordOffset>.leaf: standalone, just the data. File contents
+    /// exactly the data content and file size matches the data size. No refs.
+    StandaloneLeaf = 11,
+
+    /// vX.<TrieRecordOffset>.leaf+0: standalone, just the data plus an
+    /// extra null character ('\0'). File size is 1 bigger than the data size.
+    /// No refs.
+    StandaloneLeaf0 = 12,
+  };
+
+  /// Map a standalone storage kind to the file suffix used for its on-disk
+  /// file. Must only be called with one of the Standalone* kinds.
+  static StringRef getStandaloneFileSuffix(StorageKind SK) {
+    switch (SK) {
+    default:
+      llvm_unreachable("Expected standalone storage kind");
+    case TrieRecord::StorageKind::Standalone:
+      return FileSuffixData;
+    case TrieRecord::StorageKind::StandaloneLeaf0:
+      return FileSuffixLeaf0;
+    case TrieRecord::StorageKind::StandaloneLeaf:
+      return FileSuffixLeaf;
+    }
+  }
+
+  enum Limits : int64_t {
+    // Saves files bigger than 64KB standalone instead of embedding them.
+    MaxEmbeddedSize = 64LL * 1024LL - 1,
+  };
+
+  struct Data {
+    StorageKind SK = StorageKind::Unknown;
+    FileOffset Offset;
+  };
+
+  /// Pack a record into a single 8B word: StorageKind in the top byte, the
+  /// 56-bit file offset in the low bytes. An Unknown record packs to 0.
+  static uint64_t pack(Data D) {
+    assert(D.Offset.get() < (int64_t)(1ULL << 56));
+    uint64_t Packed = uint64_t(D.SK) << 56 | D.Offset.get();
+    assert(D.SK != StorageKind::Unknown || Packed == 0);
+#ifndef NDEBUG
+    Data RoundTrip = unpack(Packed);
+    assert(D.SK == RoundTrip.SK);
+    assert(D.Offset.get() == RoundTrip.Offset.get());
+#endif
+    return Packed;
+  }
+
+  /// Inverse of \c pack; the all-zero word decodes to an Unknown record.
+  static Data unpack(uint64_t Packed) {
+    Data D;
+    if (!Packed)
+      return D;
+    D.SK = (StorageKind)(Packed >> 56);
+    D.Offset = FileOffset(Packed & (UINT64_MAX >> 8));
+    return D;
+  }
+
+  TrieRecord() : Storage(0) {}
+
+  Data load() const { return unpack(Storage); }
+  /// Atomically replace \p Existing with \p New. On failure \p Existing is
+  /// refreshed with the currently stored value. Returns true on success.
+  bool compare_exchange_strong(Data &Existing, Data New);
+
+private:
+  /// Single atomic word holding the packed (kind, offset) pair.
+  std::atomic<uint64_t> Storage;
+};
+
+/// DataStore record data: 4B + size? + refs? + data + 0
+/// - 4-bytes: Header
+/// - {0,4,8}-bytes: DataSize (may be packed in Header)
+/// - {0,4,8}-bytes: NumRefs (may be packed in Header)
+/// - NumRefs*{4,8}-bytes: Refs[] (end-ptr is 8-byte aligned)
+/// - <data>
+/// - 1-byte: 0-term
+struct DataRecordHandle {
+  /// NumRefs storage: 4B, 2B, 1B, or 0B (no refs). Or, 8B, for alignment
+  /// convenience to avoid computing padding later.
+  enum class NumRefsFlags : uint8_t {
+    Uses0B = 0U,
+    Uses1B = 1U,
+    Uses2B = 2U,
+    Uses4B = 3U,
+    Uses8B = 4U,
+    Max = Uses8B,
+  };
+
+  /// DataSize storage: 8B, 4B, 2B, or 1B.
+  enum class DataSizeFlags {
+    Uses1B = 0U,
+    Uses2B = 1U,
+    Uses4B = 2U,
+    Uses8B = 3U,
+    Max = Uses8B,
+  };
+
+  /// Kind of ref stored in Refs[]: InternalRef or InternalRef4B.
+  enum class RefKindFlags {
+    InternalRef = 0U,
+    InternalRef4B = 1U,
+    Max = InternalRef4B,
+  };
+
+  // Bit positions/widths of the three flag fields inside a LayoutFlags byte.
+  enum Counts : int {
+    NumRefsShift = 0,
+    NumRefsBits = 3,
+    DataSizeShift = NumRefsShift + NumRefsBits,
+    DataSizeBits = 2,
+    RefKindShift = DataSizeShift + DataSizeBits,
+    RefKindBits = 1,
+  };
+  static_assert(((UINT32_MAX << NumRefsBits) & (uint32_t)NumRefsFlags::Max) ==
+                    0,
+                "Not enough bits");
+  static_assert(((UINT32_MAX << DataSizeBits) & (uint32_t)DataSizeFlags::Max) ==
+                    0,
+                "Not enough bits");
+  static_assert(((UINT32_MAX << RefKindBits) & (uint32_t)RefKindFlags::Max) ==
+                    0,
+                "Not enough bits");
+
+  /// The three encoding decisions for a record, packed into one byte stored
+  /// in the top of the Header word.
+  struct LayoutFlags {
+    NumRefsFlags NumRefs;
+    DataSizeFlags DataSize;
+    RefKindFlags RefKind;
+
+    static uint64_t pack(LayoutFlags LF) {
+      unsigned Packed = ((unsigned)LF.NumRefs << NumRefsShift) |
+                        ((unsigned)LF.DataSize << DataSizeShift) |
+                        ((unsigned)LF.RefKind << RefKindShift);
+#ifndef NDEBUG
+      LayoutFlags RoundTrip = unpack(Packed);
+      assert(LF.NumRefs == RoundTrip.NumRefs);
+      assert(LF.DataSize == RoundTrip.DataSize);
+      assert(LF.RefKind == RoundTrip.RefKind);
+#endif
+      return Packed;
+    }
+    static LayoutFlags unpack(uint64_t Storage) {
+      assert(Storage <= UINT8_MAX && "Expect storage to fit in a byte");
+      LayoutFlags LF;
+      LF.NumRefs =
+          (NumRefsFlags)((Storage >> NumRefsShift) & ((1U << NumRefsBits) - 1));
+      LF.DataSize = (DataSizeFlags)((Storage >> DataSizeShift) &
+                                    ((1U << DataSizeBits) - 1));
+      LF.RefKind =
+          (RefKindFlags)((Storage >> RefKindShift) & ((1U << RefKindBits) - 1));
+      return LF;
+    }
+  };
+
+  /// Header layout:
+  /// - 1-byte: LayoutFlags
+  /// - 1-byte: 1B size field
+  /// - {0,2}-bytes: 2B size field
+  struct Header {
+    using PackTy = uint32_t;
+    PackTy Packed;
+
+    static constexpr unsigned LayoutFlagsShift =
+        (sizeof(PackTy) - 1) * CHAR_BIT;
+  };
+
+  /// Refs and data to be stored in a new record.
+  struct Input {
+    InternalRefArrayRef Refs;
+    ArrayRef<char> Data;
+  };
+
+  LayoutFlags getLayoutFlags() const {
+    return LayoutFlags::unpack(H->Packed >> Header::LayoutFlagsShift);
+  }
+
+  uint64_t getDataSize() const;
+  void skipDataSize(LayoutFlags LF, int64_t &RelOffset) const;
+  uint32_t getNumRefs() const;
+  void skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const;
+  int64_t getRefsRelOffset() const;
+  int64_t getDataRelOffset() const;
+
+  /// Total record size; the +1 accounts for the trailing 0-term byte.
+  static uint64_t getTotalSize(uint64_t DataRelOffset, uint64_t DataSize) {
+    return DataRelOffset + DataSize + 1;
+  }
+  uint64_t getTotalSize() const {
+    return getDataRelOffset() + getDataSize() + 1;
+  }
+
+  /// Computed encoding for one record: which size/refs fields are packed into
+  /// the header and the relative offsets where Refs[] and data will land.
+  struct Layout {
+    explicit Layout(const Input &I);
+
+    LayoutFlags Flags{};
+    uint64_t DataSize = 0;
+    uint32_t NumRefs = 0;
+    int64_t RefsRelOffset = 0;
+    int64_t DataRelOffset = 0;
+    uint64_t getTotalSize() const {
+      return DataRecordHandle::getTotalSize(DataRelOffset, DataSize);
+    }
+  };
+
+  InternalRefArrayRef getRefs() const {
+    assert(H && "Expected valid handle");
+    auto *BeginByte = reinterpret_cast<const char *>(H) + getRefsRelOffset();
+    size_t Size = getNumRefs();
+    if (!Size)
+      return InternalRefArrayRef();
+    if (getLayoutFlags().RefKind == RefKindFlags::InternalRef4B)
+      return ArrayRef(reinterpret_cast<const InternalRef4B *>(BeginByte), Size);
+    return ArrayRef(reinterpret_cast<const InternalRef *>(BeginByte), Size);
+  }
+
+  ArrayRef<char> getData() const {
+    assert(H && "Expected valid handle");
+    return ArrayRef(reinterpret_cast<const char *>(H) + getDataRelOffset(),
+                    getDataSize());
+  }
+
+  /// Allocation + in-place construction of a record for \p I; the WithError
+  /// variant propagates allocation failure, \c construct writes into caller
+  /// memory that is already large enough.
+  static DataRecordHandle create(function_ref<char *(size_t Size)> Alloc,
+                                 const Input &I);
+  static Expected<DataRecordHandle>
+  createWithError(function_ref<Expected<char *>(size_t Size)> Alloc,
+                  const Input &I);
+  static DataRecordHandle construct(char *Mem, const Input &I);
+
+  static DataRecordHandle get(const char *Mem) {
+    return DataRecordHandle(
+        *reinterpret_cast<const DataRecordHandle::Header *>(Mem));
+  }
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+
+  DataRecordHandle() = default;
+  explicit DataRecordHandle(const Header &H) : H(&H) {}
+
+private:
+  static DataRecordHandle constructImpl(char *Mem, const Input &I,
+                                        const Layout &L);
+  const Header *H = nullptr;
+};
+
+/// Owns the mapped-in contents of one standalone object file, together with
+/// the storage kind that describes how the bytes are formatted on disk.
+class StandaloneDataInMemory {
+public:
+  OnDiskContent getContent() const;
+
+  /// FIXME: Should be mapped_file_region instead of MemoryBuffer to drop a
+  /// layer of indirection.
+  std::unique_ptr<MemoryBuffer> Region;
+  TrieRecord::StorageKind SK;
+  StandaloneDataInMemory(std::unique_ptr<MemoryBuffer> Region,
+                         TrieRecord::StorageKind SK)
+      : Region(std::move(Region)), SK(SK) {
+#ifndef NDEBUG
+    // Only standalone storage kinds may be held here; the pool kinds are
+    // served directly from the data table.
+    bool IsStandalone = false;
+    switch (SK) {
+    case TrieRecord::StorageKind::Standalone:
+    case TrieRecord::StorageKind::StandaloneLeaf:
+    case TrieRecord::StorageKind::StandaloneLeaf0:
+      IsStandalone = true;
+      break;
+    default:
+      break;
+    }
+    assert(IsStandalone);
+#endif
+  }
+};
+
+/// Container for "big" objects mapped in separately.
+///
+/// Sharded by the first byte of the object hash to reduce mutex contention.
+template <size_t NumShards> class StandaloneDataMap {
+  static_assert(isPowerOf2_64(NumShards), "Expected power of 2");
+
+public:
+  /// Insert a mapped buffer for \p Hash; if an entry already exists the
+  /// existing one is kept (and returned) and \p Buffer is dropped.
+  const StandaloneDataInMemory &insert(ArrayRef<uint8_t> Hash,
+                                       TrieRecord::StorageKind SK,
+                                       std::unique_ptr<MemoryBuffer> Buffer);
+
+  const StandaloneDataInMemory *lookup(ArrayRef<uint8_t> Hash) const;
+  bool count(ArrayRef<uint8_t> Hash) const { return bool(lookup(Hash)); }
+
+private:
+  struct Shard {
+    /// Needs to store a std::unique_ptr for a stable address identity.
+    DenseMap<const uint8_t *, std::unique_ptr<StandaloneDataInMemory>> Map;
+    mutable std::mutex Mutex;
+  };
+  Shard &getShard(ArrayRef<uint8_t> Hash) {
+    return const_cast<Shard &>(
+        const_cast<const StandaloneDataMap *>(this)->getShard(Hash));
+  }
+  const Shard &getShard(ArrayRef<uint8_t> Hash) const {
+    static_assert(NumShards <= 256, "Expected only 8 bits of shard");
+    return Shards[Hash[0] % NumShards];
+  }
+
+  Shard Shards[NumShards];
+};
+
+using StandaloneDataMapTy = StandaloneDataMap<16>;
+
+/// Tagged handle to loaded object contents: either a FileOffset into the data
+/// pool (raw value has LSB clear) or a pointer to a StandaloneDataInMemory
+/// (raw value tagged with LSB set).
+struct InternalHandle {
+  FileOffset getAsFileOffset() const { return *DataOffset; }
+
+  uint64_t getRawData() const {
+    if (DataOffset) {
+      uint64_t Raw = DataOffset->get();
+      // Offsets must be even so the low bit is free for the pointer tag.
+      assert(!(Raw & 0x1));
+      return Raw;
+    }
+    uint64_t Raw = reinterpret_cast<uintptr_t>(SDIM);
+    assert(!(Raw & 0x1));
+    return Raw | 1;
+  }
+
+  explicit InternalHandle(FileOffset DataOffset) : DataOffset(DataOffset) {}
+  explicit InternalHandle(uint64_t DataOffset) : DataOffset(DataOffset) {}
+  explicit InternalHandle(const StandaloneDataInMemory &SDIM) : SDIM(&SDIM) {}
+  std::optional<FileOffset> DataOffset;
+  const StandaloneDataInMemory *SDIM = nullptr;
+};
+
+/// Accumulates a ref list using compact 4B refs for as long as every ref
+/// fits; on the first ref that cannot shrink, switches (once, permanently)
+/// to full 8B refs, re-widening everything collected so far.
+class InternalRefVector {
+public:
+  void push_back(InternalRef Ref) {
+    if (NeedsFull)
+      return FullRefs.push_back(Ref);
+    if (std::optional<InternalRef4B> Small = InternalRef4B::tryToShrink(Ref))
+      return SmallRefs.push_back(*Small);
+    NeedsFull = true;
+    assert(FullRefs.empty());
+    FullRefs.reserve(SmallRefs.size() + 1);
+    for (InternalRef4B Small : SmallRefs)
+      FullRefs.push_back(Small);
+    FullRefs.push_back(Ref);
+    SmallRefs.clear();
+  }
+
+  operator InternalRefArrayRef() const {
+    // Exactly one of the two vectors is in use at any time.
+    assert(SmallRefs.empty() || FullRefs.empty());
+    return NeedsFull ? InternalRefArrayRef(FullRefs)
+                     : InternalRefArrayRef(SmallRefs);
+  }
+
+private:
+  bool NeedsFull = false;
+  SmallVector<InternalRef4B> SmallRefs;
+  SmallVector<InternalRef> FullRefs;
+};
+
+} // namespace
+
+/// Proxy for any on-disk object or raw data.
+struct ondisk::OnDiskContent {
+  /// Set when the content is a structured DataStore record (refs + data).
+  std::optional<DataRecordHandle> Record;
+  /// Set when the content is raw bytes (standalone leaf files).
+  std::optional<ArrayRef<char>> Bytes;
+};
+
+/// Compute the layout for \p I, request exactly that many bytes from
+/// \p Alloc, and construct the record in place; allocation failure is
+/// propagated to the caller.
+Expected<DataRecordHandle> DataRecordHandle::createWithError(
+    function_ref<Expected<char *>(size_t Size)> Alloc, const Input &I) {
+  Layout L(I);
+  Expected<char *> Mem = Alloc(L.getTotalSize());
+  if (!Mem)
+    return Mem.takeError();
+  return constructImpl(*Mem, I, L);
+}
+
+/// Compute the layout for \p I, allocate storage via \p Alloc (assumed
+/// infallible), and construct the record in place.
+DataRecordHandle
+DataRecordHandle::create(function_ref<char *(size_t Size)> Alloc,
+                         const Input &I) {
+  Layout L(I);
+  return constructImpl(Alloc(L.getTotalSize()), I, L);
+}
+
+/// Proxy for an on-disk index record.
+struct OnDiskGraphDB::IndexProxy {
+  FileOffset Offset;
+  ArrayRef<uint8_t> Hash;
+  TrieRecord &Ref;
+};
+
+template <size_t N>
+const StandaloneDataInMemory &
+StandaloneDataMap<N>::insert(ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
+                             std::unique_ptr<MemoryBuffer> Buffer) {
+  auto &S = getShard(Hash);
+  std::lock_guard<std::mutex> Lock(S.Mutex);
+  auto &V = S.Map[Hash.data()];
+  // First insertion wins; a racing insert for the same hash drops its buffer.
+  if (!V)
+    V = std::make_unique<StandaloneDataInMemory>(std::move(Buffer), SK);
+  return *V;
+}
+
+template <size_t N>
+const StandaloneDataInMemory *
+StandaloneDataMap<N>::lookup(ArrayRef<uint8_t> Hash) const {
+  auto &S = getShard(Hash);
+  std::lock_guard<std::mutex> Lock(S.Mutex);
+  auto I = S.Map.find(Hash.data());
+  if (I == S.Map.end())
+    return nullptr;
+  return &*I->second;
+}
+
+/// Copy of \a sys::fs::TempFile that skips RemoveOnSignal, which is too
+/// expensive to register/unregister at this rate.
+///
+/// FIXME: Add a TempFileManager that maintains a thread-safe list of open temp
+/// files and has a signal handler registered that removes them all.
+class OnDiskGraphDB::TempFile {
+  bool Done = false;
+  TempFile(StringRef Name, int FD) : TmpName(std::string(Name)), FD(FD) {}
+
+public:
+  /// This creates a temporary file with createUniqueFile.
+  static Expected<TempFile> create(const Twine &Model);
+  TempFile(TempFile &&Other) { *this = std::move(Other); }
+  TempFile &operator=(TempFile &&Other) {
+    TmpName = std::move(Other.TmpName);
+    FD = Other.FD;
+    // Disarm the moved-from object so its destructor is a no-op.
+    Other.Done = true;
+    Other.FD = -1;
+    return *this;
+  }
+
+  // Name of the temporary file.
+  std::string TmpName;
+
+  // The open file descriptor.
+  int FD = -1;
+
+  // Keep this with the given name.
+  Error keep(const Twine &Name);
+  Error discard();
+
+  // This checks that keep or delete was called.
+  ~TempFile() { consumeError(discard()); }
+};
+
+/// A TempFile whose contents are accessed through a writable memory mapping;
+/// the mapping must be released (unmap) before the file can be kept/discarded.
+class OnDiskGraphDB::MappedTempFile {
+public:
+  char *data() const { return Map.data(); }
+  size_t size() const { return Map.size(); }
+
+  Error discard() {
+    assert(Map && "Map already destroyed");
+    Map.unmap();
+    return Temp.discard();
+  }
+
+  Error keep(const Twine &Name) {
+    assert(Map && "Map already destroyed");
+    Map.unmap();
+    return Temp.keep(Name);
+  }
+
+  MappedTempFile(TempFile Temp, sys::fs::mapped_file_region Map)
+      : Temp(std::move(Temp)), Map(std::move(Map)) {}
+
+private:
+  TempFile Temp;
+  sys::fs::mapped_file_region Map;
+};
+
+/// Close (if open) and remove the temporary file. Safe to call multiple
+/// times; after the first call the object holds no resources.
+Error OnDiskGraphDB::TempFile::discard() {
+  Done = true;
+  // Close first so removal is not blocked by an open handle.
+  if (FD != -1) {
+    sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+    if (std::error_code EC = sys::fs::closeFile(File))
+      return errorCodeToError(EC);
+  }
+  FD = -1;
+
+  // Always try to close and remove.
+  if (!TmpName.empty())
+    if (std::error_code EC = sys::fs::remove(TmpName))
+      return errorCodeToError(EC);
+  TmpName = "";
+
+  return Error::success();
+}
+
+Error OnDiskGraphDB::TempFile::keep(const Twine &Name) {
+  assert(!Done);
+  Done = true;
+  // Always try to close and rename.
+  std::error_code RenameEC = sys::fs::rename(TmpName, Name);
+
+  if (!RenameEC)
+    TmpName = "";
+
+  // Close even if the rename failed, so no descriptor leaks; the rename
+  // error (if any) is reported last.
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  if (std::error_code EC = sys::fs::closeFile(File))
+    return errorCodeToError(EC);
+  FD = -1;
+
+  return errorCodeToError(RenameEC);
+}
+
+Expected<OnDiskGraphDB::TempFile>
+OnDiskGraphDB::TempFile::create(const Twine &Model) {
+  int FD;
+  SmallString<128> ResultPath;
+  if (std::error_code EC = sys::fs::createUniqueFile(Model, FD, ResultPath))
+    return errorCodeToError(EC);
+
+  TempFile Ret(ResultPath, FD);
+  return std::move(Ret);
+}
+
+bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) {
+  uint64_t ExistingPacked = pack(Existing);
+  uint64_t NewPacked = pack(New);
+  if (Storage.compare_exchange_strong(ExistingPacked, NewPacked))
+    return true;
+  // CAS failed: report the value actually stored back to the caller.
+  Existing = unpack(ExistingPacked);
+  return false;
+}
+
+DataRecordHandle DataRecordHandle::construct(char *Mem, const Input &I) {
+  return constructImpl(Mem, I, Layout(I));
+}
+
+/// Serialize \p I into \p Mem following the precomputed layout \p L, writing
+/// the size fields either into the packed header word or as little-endian
+/// out-of-header fields, then the refs array, data, and trailing 0-term.
+DataRecordHandle DataRecordHandle::constructImpl(char *Mem, const Input &I,
+                                                 const Layout &L) {
+  char *Next = Mem + sizeof(Header);
+
+  // Fill in Packed and set other data, then come back to construct the header.
+  Header::PackTy Packed = 0;
+  Packed |= LayoutFlags::pack(L.Flags) << Header::LayoutFlagsShift;
+
+  // Construct DataSize.
+  switch (L.Flags.DataSize) {
+  case DataSizeFlags::Uses1B:
+    assert(I.Data.size() <= UINT8_MAX);
+    Packed |= (Header::PackTy)I.Data.size()
+              << ((sizeof(Packed) - 2) * CHAR_BIT);
+    break;
+  case DataSizeFlags::Uses2B:
+    assert(I.Data.size() <= UINT16_MAX);
+    Packed |= (Header::PackTy)I.Data.size()
+              << ((sizeof(Packed) - 4) * CHAR_BIT);
+    break;
+  case DataSizeFlags::Uses4B:
+    support::endian::write32le(Next, I.Data.size());
+    Next += 4;
+    break;
+  case DataSizeFlags::Uses8B:
+    support::endian::write64le(Next, I.Data.size());
+    Next += 8;
+    break;
+  }
+
+  // Construct NumRefs.
+  //
+  // NOTE: May be writing NumRefs even if there are zero refs in order to fix
+  // alignment.
+  switch (L.Flags.NumRefs) {
+  case NumRefsFlags::Uses0B:
+    break;
+  case NumRefsFlags::Uses1B:
+    assert(I.Refs.size() <= UINT8_MAX);
+    Packed |= (Header::PackTy)I.Refs.size()
+              << ((sizeof(Packed) - 2) * CHAR_BIT);
+    break;
+  case NumRefsFlags::Uses2B:
+    assert(I.Refs.size() <= UINT16_MAX);
+    Packed |= (Header::PackTy)I.Refs.size()
+              << ((sizeof(Packed) - 4) * CHAR_BIT);
+    break;
+  case NumRefsFlags::Uses4B:
+    support::endian::write32le(Next, I.Refs.size());
+    Next += 4;
+    break;
+  case NumRefsFlags::Uses8B:
+    support::endian::write64le(Next, I.Refs.size());
+    Next += 8;
+    break;
+  }
+
+  // Construct Refs[].
+  if (!I.Refs.empty()) {
+    assert((L.Flags.RefKind == RefKindFlags::InternalRef4B) == I.Refs.is4B());
+    ArrayRef<uint8_t> RefsBuffer = I.Refs.getBuffer();
+    llvm::copy(RefsBuffer, Next);
+    Next += RefsBuffer.size();
+  }
+
+  // Construct Data and the trailing null.
+  assert(isAddrAligned(Align(8), Next));
+  llvm::copy(I.Data, Next);
+  Next[I.Data.size()] = 0;
+
+  // Construct the header itself and return.
+  Header *H = new (Mem) Header{Packed};
+  DataRecordHandle Record(*H);
+  assert(Record.getData() == I.Data);
+  assert(Record.getNumRefs() == I.Refs.size());
+  assert(Record.getRefs() == I.Refs);
+  assert(Record.getLayoutFlags().DataSize == L.Flags.DataSize);
+  assert(Record.getLayoutFlags().NumRefs == L.Flags.NumRefs);
+  assert(Record.getLayoutFlags().RefKind == L.Flags.RefKind);
+  return Record;
+}
+
+/// Choose the smallest encodings for DataSize/NumRefs (the header word has a
+/// 1B and a 2B slot that DataSize and NumRefs compete for), then grow fields
+/// by 4B as needed so Refs[] (8B refs) and data end up 8B-aligned.
+DataRecordHandle::Layout::Layout(const Input &I) {
+  // Start initial relative offsets right after the Header.
+  uint64_t RelOffset = sizeof(Header);
+
+  // Initialize the easy stuff.
+  DataSize = I.Data.size();
+  NumRefs = I.Refs.size();
+
+  // Check refs size.
+  Flags.RefKind =
+      I.Refs.is4B() ? RefKindFlags::InternalRef4B : RefKindFlags::InternalRef;
+
+  // Find the smallest slot available for DataSize.
+  bool Has1B = true;
+  bool Has2B = true;
+  if (DataSize <= UINT8_MAX && Has1B) {
+    Flags.DataSize = DataSizeFlags::Uses1B;
+    Has1B = false;
+  } else if (DataSize <= UINT16_MAX && Has2B) {
+    Flags.DataSize = DataSizeFlags::Uses2B;
+    Has2B = false;
+  } else if (DataSize <= UINT32_MAX) {
+    Flags.DataSize = DataSizeFlags::Uses4B;
+    RelOffset += 4;
+  } else {
+    Flags.DataSize = DataSizeFlags::Uses8B;
+    RelOffset += 8;
+  }
+
+  // Find the smallest slot available for NumRefs. Never sets NumRefs8B here.
+  if (!NumRefs) {
+    Flags.NumRefs = NumRefsFlags::Uses0B;
+  } else if (NumRefs <= UINT8_MAX && Has1B) {
+    Flags.NumRefs = NumRefsFlags::Uses1B;
+    Has1B = false;
+  } else if (NumRefs <= UINT16_MAX && Has2B) {
+    Flags.NumRefs = NumRefsFlags::Uses2B;
+    Has2B = false;
+  } else {
+    Flags.NumRefs = NumRefsFlags::Uses4B;
+    RelOffset += 4;
+  }
+
+  // Helper to "upgrade" either DataSize or NumRefs by 4B to avoid complicated
+  // padding rules when reading and writing. This also bumps RelOffset.
+  //
+  // The value for NumRefs is strictly limited to UINT32_MAX, but it can be
+  // stored as 8B. This means we can *always* find a size to grow.
+  //
+  // NOTE: Only call this once.
+  auto GrowSizeFieldsBy4B = [&]() {
+    assert(isAligned(Align(4), RelOffset));
+    RelOffset += 4;
+
+    assert(Flags.NumRefs != NumRefsFlags::Uses8B &&
+           "Expected to be able to grow NumRefs8B");
+
+    // First try to grow DataSize. NumRefs will not (yet) be 8B, and if
+    // DataSize is upgraded to 8B it'll already be aligned.
+    //
+    // Failing that, grow NumRefs.
+    if (Flags.DataSize < DataSizeFlags::Uses4B)
+      Flags.DataSize = DataSizeFlags::Uses4B; // DataSize: Packed => 4B.
+    else if (Flags.DataSize < DataSizeFlags::Uses8B)
+      Flags.DataSize = DataSizeFlags::Uses8B; // DataSize: 4B => 8B.
+    else if (Flags.NumRefs < NumRefsFlags::Uses4B)
+      Flags.NumRefs = NumRefsFlags::Uses4B; // NumRefs: Packed => 4B.
+    else
+      Flags.NumRefs = NumRefsFlags::Uses8B; // NumRefs: 4B => 8B.
+  };
+
+  assert(isAligned(Align(4), RelOffset));
+  if (Flags.RefKind == RefKindFlags::InternalRef) {
+    // List of 8B refs should be 8B-aligned. Grow one of the sizes to get this
+    // without padding.
+    if (!isAligned(Align(8), RelOffset))
+      GrowSizeFieldsBy4B();
+
+    assert(isAligned(Align(8), RelOffset));
+    RefsRelOffset = RelOffset;
+    RelOffset += 8 * NumRefs;
+  } else {
+    // The array of 4B refs doesn't need 8B alignment, but the data will need
+    // to be 8B-aligned. Detect this now, and, if necessary, shift everything
+    // by 4B by growing one of the sizes.
+    // If we remove the need for 8B-alignment for data there is <1% savings in
+    // disk storage for a clang build using MCCAS but the 8B-alignment may be
+    // useful in the future so keep it for now.
+    uint64_t RefListSize = 4 * NumRefs;
+    if (!isAligned(Align(8), RelOffset + RefListSize))
+      GrowSizeFieldsBy4B();
+    RefsRelOffset = RelOffset;
+    RelOffset += RefListSize;
+  }
+
+  assert(isAligned(Align(8), RelOffset));
+  DataRelOffset = RelOffset;
+}
+
+/// Decode DataSize: small sizes live inside the packed header word; 4B/8B
+/// sizes are stored little-endian immediately after the header.
+uint64_t DataRecordHandle::getDataSize() const {
+  int64_t RelOffset = sizeof(Header);
+  auto *DataSizePtr = reinterpret_cast<const char *>(H) + RelOffset;
+  switch (getLayoutFlags().DataSize) {
+  case DataSizeFlags::Uses1B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
+  case DataSizeFlags::Uses2B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
+           UINT16_MAX;
+  case DataSizeFlags::Uses4B:
+    return support::endian::read32le(DataSizePtr);
+  case DataSizeFlags::Uses8B:
+    return support::endian::read64le(DataSizePtr);
+  }
+  // Guard against falling off the end if the on-disk flag byte is corrupt;
+  // also silences -Wreturn-type on compilers that don't track enum coverage.
+  llvm_unreachable("Unknown DataSizeFlags enum");
+}
+
+/// Advance \p RelOffset past the out-of-header DataSize field (0, 4, or 8
+/// bytes depending on the encoding; packed encodings occupy no extra bytes).
+void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const {
+  if (LF.DataSize >= DataSizeFlags::Uses4B)
+    RelOffset += 4;
+  if (LF.DataSize >= DataSizeFlags::Uses8B)
+    RelOffset += 4;
+}
+
+/// Decode NumRefs: small counts live inside the packed header word; 4B/8B
+/// counts are stored little-endian after the (possibly absent) out-of-header
+/// DataSize field.
+uint32_t DataRecordHandle::getNumRefs() const {
+  LayoutFlags LF = getLayoutFlags();
+  int64_t RelOffset = sizeof(Header);
+  skipDataSize(LF, RelOffset);
+  auto *NumRefsPtr = reinterpret_cast<const char *>(H) + RelOffset;
+  switch (LF.NumRefs) {
+  case NumRefsFlags::Uses0B:
+    return 0;
+  case NumRefsFlags::Uses1B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
+  case NumRefsFlags::Uses2B:
+    return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
+           UINT16_MAX;
+  case NumRefsFlags::Uses4B:
+    return support::endian::read32le(NumRefsPtr);
+  case NumRefsFlags::Uses8B:
+    return support::endian::read64le(NumRefsPtr);
+  }
+  // Guard against falling off the end if the on-disk flag byte is corrupt;
+  // also silences -Wreturn-type on compilers that don't track enum coverage.
+  llvm_unreachable("Unknown NumRefsFlags enum");
+}
+
+/// Advance \p RelOffset past the out-of-header NumRefs field (0, 4, or 8
+/// bytes depending on the encoding).
+void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const {
+  if (LF.NumRefs >= NumRefsFlags::Uses4B)
+    RelOffset += 4;
+  if (LF.NumRefs >= NumRefsFlags::Uses8B)
+    RelOffset += 4;
+}
+
+/// Refs[] starts right after the header and any out-of-header size fields.
+int64_t DataRecordHandle::getRefsRelOffset() const {
+  LayoutFlags LF = getLayoutFlags();
+  int64_t RelOffset = sizeof(Header);
+  skipDataSize(LF, RelOffset);
+  skipNumRefs(LF, RelOffset);
+  return RelOffset;
+}
+
+/// Data starts after Refs[]; each ref occupies 4 or 8 bytes per RefKind.
+int64_t DataRecordHandle::getDataRelOffset() const {
+  LayoutFlags LF = getLayoutFlags();
+  int64_t RelOffset = sizeof(Header);
+  skipDataSize(LF, RelOffset);
+  skipNumRefs(LF, RelOffset);
+  uint32_t RefSize = LF.RefKind == RefKindFlags::InternalRef4B ? 4 : 8;
+  RelOffset += RefSize * getNumRefs();
+  return RelOffset;
+}
+
+/// Dump a human-readable summary of the database: each index entry's storage
+/// kind and offset, followed by the data-pool records sorted by offset.
+void OnDiskGraphDB::print(raw_ostream &OS) const {
+  OS << "on-disk-root-path: " << RootPath << "\n";
+
+  struct PoolInfo {
+    int64_t Offset;
+  };
+  SmallVector<PoolInfo> Pool;
+
+  OS << "\n";
+  OS << "index:\n";
+  Index.print(OS, [&](ArrayRef<char> Data) {
+    assert(Data.size() == sizeof(TrieRecord));
+    assert(isAligned(Align::Of<TrieRecord>(), Data.size()));
+    auto *R = reinterpret_cast<const TrieRecord *>(Data.data());
+    TrieRecord::Data D = R->load();
+    OS << " SK=";
+    switch (D.SK) {
+    case TrieRecord::StorageKind::Unknown:
+      OS << "unknown          ";
+      break;
+    case TrieRecord::StorageKind::DataPool:
+      OS << "datapool         ";
+      // Remember pool entries so they can be printed below.
+      Pool.push_back({D.Offset.get()});
+      break;
+    case TrieRecord::StorageKind::Standalone:
+      OS << "standalone-data  ";
+      break;
+    case TrieRecord::StorageKind::StandaloneLeaf:
+      OS << "standalone-leaf  ";
+      break;
+    case TrieRecord::StorageKind::StandaloneLeaf0:
+      OS << "standalone-leaf+0";
+      break;
+    }
+    OS << " Offset=" << (void *)D.Offset.get();
+  });
+  if (Pool.empty())
+    return;
+
+  OS << "\n";
+  OS << "pool:\n";
+  llvm::sort(
+      Pool, [](PoolInfo LHS, PoolInfo RHS) { return LHS.Offset < RHS.Offset; });
+  for (PoolInfo PI : Pool) {
+    OS << "- addr=" << (void *)PI.Offset << " ";
+    DataRecordHandle D =
+        DataRecordHandle::get(DataPool.beginData(FileOffset(PI.Offset)));
+    OS << "record refs=" << D.getNumRefs() << " data=" << D.getDataSize()
+       << " size=" << D.getTotalSize()
+       << " end=" << (void *)(PI.Offset + D.getTotalSize()) << "\n";
+  }
+}
+
+/// Find-or-create the index entry for \p Hash; a fresh entry's TrieRecord is
+/// default-constructed (StorageKind::Unknown) in the trie's storage.
+OnDiskGraphDB::IndexProxy OnDiskGraphDB::indexHash(ArrayRef<uint8_t> Hash) {
+  OnDiskHashMappedTrie::pointer P = Index.insertLazy(
+      Hash, [](FileOffset TentativeOffset,
+               OnDiskHashMappedTrie::ValueProxy TentativeValue) {
+        assert(TentativeValue.Data.size() == sizeof(TrieRecord));
+        assert(
+            isAddrAligned(Align::Of<TrieRecord>(), TentativeValue.Data.data()));
+        new (TentativeValue.Data.data()) TrieRecord();
+      });
+  assert(P && "Expected insertion");
+  return getIndexProxyFromPointer(P);
+}
+
+/// Reinterpret a trie pointer's payload as a TrieRecord and bundle it with
+/// its offset and hash.
+OnDiskGraphDB::IndexProxy OnDiskGraphDB::getIndexProxyFromPointer(
+    OnDiskHashMappedTrie::const_pointer P) const {
+  assert(P);
+  assert(P.getOffset());
+  return IndexProxy{P.getOffset(), P->Hash,
+                    *const_cast<TrieRecord *>(
+                        reinterpret_cast<const TrieRecord *>(P->Data.data()))};
+}
+
+/// Get (creating if necessary) the external ObjectID for \p Hash.
+ObjectID OnDiskGraphDB::getReference(ArrayRef<uint8_t> Hash) {
+  IndexProxy I = indexHash(Hash);
+  return getExternalReference(I);
+}
+
+ObjectID OnDiskGraphDB::getExternalReference(const IndexProxy &I) {
+  return getExternalReference(makeInternalRef(I.Offset));
+}
+
+/// Return an ObjectID for \p Digest only if the object is known, either in
+/// the primary index or (falling back) in the upstream database; returns
+/// std::nullopt otherwise without creating an index entry for a miss.
+std::optional<ObjectID>
+OnDiskGraphDB::getExistingReference(ArrayRef<uint8_t> Digest) {
+  auto tryUpstream =
+      [&](std::optional<IndexProxy> I) -> std::optional<ObjectID> {
+    if (!UpstreamDB)
+      return std::nullopt;
+    std::optional<ObjectID> UpstreamID =
+        UpstreamDB->getExistingReference(Digest);
+    if (!UpstreamID)
+      return std::nullopt;
+    // Upstream knows it: make sure a local index entry exists to refer to.
+    if (!I)
+      I.emplace(indexHash(Digest));
+    return getExternalReference(*I);
+  };
+
+  OnDiskHashMappedTrie::const_pointer P = Index.find(Digest);
+  if (!P)
+    return tryUpstream(std::nullopt);
+  IndexProxy I = getIndexProxyFromPointer(P);
+  TrieRecord::Data Obj = I.Ref.load();
+  if (Obj.SK == TrieRecord::StorageKind::Unknown)
+    return tryUpstream(I);
+  return getExternalReference(makeInternalRef(I.Offset));
+}
+
+/// Resolve an internal ref (index file offset) back to its index entry;
+/// aborts the process if the offset does not decode to a valid entry.
+OnDiskGraphDB::IndexProxy
+OnDiskGraphDB::getIndexProxyFromRef(InternalRef Ref) const {
+  OnDiskHashMappedTrie::const_pointer P =
+      Index.recoverFromFileOffset(Ref.getFileOffset());
+  if (LLVM_UNLIKELY(!P))
+    report_fatal_error("OnDiskCAS: corrupt internal reference");
+  return getIndexProxyFromPointer(P);
+}
+
+ArrayRef<uint8_t> OnDiskGraphDB::getDigest(InternalRef Ref) const {
+  IndexProxy I = getIndexProxyFromRef(Ref);
+  return I.Hash;
+}
+
+ArrayRef<uint8_t> OnDiskGraphDB::getDigest(const IndexProxy &I) const {
+  return I.Hash;
+}
+
+/// Return the object's data bytes, from either the raw-bytes view (leaf
+/// files) or the structured record.
+ArrayRef<char> OnDiskGraphDB::getObjectData(ObjectHandle Node) const {
+  OnDiskContent Content = getContentFromHandle(Node);
+  if (Content.Bytes)
+    return *Content.Bytes;
+  assert(Content.Record && "Expected record or bytes");
+  return Content.Record->getData();
+}
+
+/// Return the object's refs; leaf contents (no record) have no refs.
+InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
+  if (std::optional<DataRecordHandle> Record =
+          getContentFromHandle(Node).Record)
+    return Record->getRefs();
+  return std::nullopt;
+}
+
+/// Materialize a handle for \p ExternalRef: data-pool objects become offset
+/// handles directly; standalone objects are mapped in from their file and
+/// cached in StandaloneData. Unknown objects fall back to the upstream DB
+/// (or std::nullopt when there is none).
+Expected<std::optional<ObjectHandle>>
+OnDiskGraphDB::load(ObjectID ExternalRef) {
+  InternalRef Ref = getInternalRef(ExternalRef);
+  IndexProxy I = getIndexProxyFromRef(Ref);
+  TrieRecord::Data Object = I.Ref.load();
+
+  if (Object.SK == TrieRecord::StorageKind::Unknown) {
+    if (!UpstreamDB)
+      return std::nullopt;
+    return faultInFromUpstream(ExternalRef);
+  }
+
+  auto toObjectHandle = [](InternalHandle H) -> ObjectHandle {
+    return ObjectHandle::fromOpaqueData(H.getRawData());
+  };
+
+  if (Object.SK == TrieRecord::StorageKind::DataPool)
+    return toObjectHandle(InternalHandle(Object.Offset));
+
+  // Only TrieRecord::StorageKind::Standalone (and variants) need to be
+  // explicitly loaded.
+  //
+  // There's corruption if standalone objects have offsets, or if we get here
+  // for something that isn't standalone.
+  if (Object.Offset)
+    return createCorruptObjectError(getDigest(I));
+  switch (Object.SK) {
+  case TrieRecord::StorageKind::Unknown:
+  case TrieRecord::StorageKind::DataPool:
+    llvm_unreachable("unexpected storage kind");
+  case TrieRecord::StorageKind::Standalone:
+  case TrieRecord::StorageKind::StandaloneLeaf0:
+  case TrieRecord::StorageKind::StandaloneLeaf:
+    break;
+  }
+
+  // Load it from disk.
+  //
+  // Note: Creation logic guarantees that data that needs null-termination is
+  // suitably 0-padded. Requiring null-termination here would be too expensive
+  // for extremely large objects that happen to be page-aligned.
+  SmallString<256> Path;
+  getStandalonePath(TrieRecord::getStandaloneFileSuffix(Object.SK), I, Path);
+  ErrorOr<std::unique_ptr<MemoryBuffer>> OwnedBuffer = MemoryBuffer::getFile(
+      Path, /*IsText=*/false, /*RequiresNullTerminator=*/false);
+  if (!OwnedBuffer)
+    return createCorruptObjectError(getDigest(I));
+
+  return toObjectHandle(
+      InternalHandle(static_cast<StandaloneDataMapTy *>(StandaloneData)
+                         ->insert(I.Hash, Object.SK, std::move(*OwnedBuffer))));
+}
+
+// Return whether the object for \p Ref is materialized in the primary
+// database, faulting it in from the upstream database when it only exists
+// there. Returns an error if the fault-in fails.
+Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) {
+  switch (getObjectPresence(Ref, /*CheckUpstream=*/true)) {
+  case ObjectPresence::Missing:
+    return false;
+  case ObjectPresence::InPrimaryDB:
+    return true;
+  case ObjectPresence::OnlyInUpstreamDB:
+    if (auto FaultInResult = faultInFromUpstream(Ref); !FaultInResult)
+      return FaultInResult.takeError();
+    return true;
+  }
+  // All enumerators are handled above; keep -Wreturn-type quiet and trap
+  // corrupt values in builds with assertions.
+  llvm_unreachable("Unknown ObjectPresence value");
+}
+
+// Classify where the object for \p ExternalRef is available: in the primary
+// store, only in the upstream store (consulted when \p CheckUpstream is set
+// and an upstream database exists), or nowhere.
+OnDiskGraphDB::ObjectPresence
+OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef,
+                                 bool CheckUpstream) const {
+  InternalRef Ref = getInternalRef(ExternalRef);
+  IndexProxy I = getIndexProxyFromRef(Ref);
+  TrieRecord::Data Object = I.Ref.load();
+  if (Object.SK != TrieRecord::StorageKind::Unknown)
+    return ObjectPresence::InPrimaryDB;
+  if (!CheckUpstream || !UpstreamDB)
+    return ObjectPresence::Missing;
+  std::optional<ObjectID> UpstreamID =
+      UpstreamDB->getExistingReference(getDigest(I));
+  return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB
+                                : ObjectPresence::Missing;
+}
+
+// Create an internal reference from a trie-record offset in the index.
+InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) {
+  return InternalRef::getFromOffset(IndexOffset);
+}
+
+// Compose the path of a standalone object file:
+// <RootPath>/<FilePrefix><index-offset><Suffix>.
+void OnDiskGraphDB::getStandalonePath(StringRef Suffix, const IndexProxy &I,
+                                      SmallVectorImpl<char> &Path) const {
+  Path.assign(RootPath.begin(), RootPath.end());
+  sys::path::append(Path, FilePrefix + Twine(I.Offset.get()) + Suffix);
+}
+
+// Decode an object handle back into its on-disk content.
+OnDiskContent OnDiskGraphDB::getContentFromHandle(ObjectHandle OH) const {
+  // The low bit of the opaque data tags the handle kind: set for a pointer to
+  // a StandaloneDataInMemory (with the tag bit masked off), clear for a
+  // DataPool file offset.
+  auto getInternalHandle = [](ObjectHandle Handle) -> InternalHandle {
+    uint64_t Data = Handle.getOpaqueData();
+    if (Data & 1)
+      return InternalHandle(*reinterpret_cast<const StandaloneDataInMemory *>(
+          Data & (-1ULL << 1)));
+    return InternalHandle(Data);
+  };
+
+  InternalHandle Handle = getInternalHandle(OH);
+  if (Handle.SDIM)
+    return Handle.SDIM->getContent();
+
+  auto DataHandle =
+      DataRecordHandle::get(DataPool.beginData(Handle.getAsFileOffset()));
+  assert(DataHandle.getData().end()[0] == 0 && "Null termination");
+  return OnDiskContent{DataHandle, std::nullopt};
+}
+
+// Decode the content of a mapped standalone file. Leaf variants expose the
+// raw buffer directly ("Leaf0" files carry one extra trailing null byte that
+// is dropped here); the plain standalone variant wraps a full data record.
+OnDiskContent StandaloneDataInMemory::getContent() const {
+  bool Leaf0 = false;
+  bool Leaf = false;
+  switch (SK) {
+  default:
+    llvm_unreachable("Storage kind must be standalone");
+  case TrieRecord::StorageKind::Standalone:
+    break;
+  case TrieRecord::StorageKind::StandaloneLeaf0:
+    Leaf = Leaf0 = true;
+    break;
+  case TrieRecord::StorageKind::StandaloneLeaf:
+    Leaf = true;
+    break;
+  }
+
+  if (Leaf) {
+    assert(Region->getBuffer().drop_back(Leaf0).end()[0] == 0 &&
+           "Standalone node data missing null termination");
+    return OnDiskContent{
+        std::nullopt,
+        arrayRefFromStringRef<char>(Region->getBuffer().drop_back(Leaf0))};
+  }
+
+  DataRecordHandle Record = DataRecordHandle::get(Region->getBuffer().data());
+  assert(Record.getData().end()[0] == 0 &&
+         "Standalone object record missing null termination for data");
+  return OnDiskContent{Record, std::nullopt};
+}
+
+// Create a unique temporary file next to \p FinalPath and map it read/write
+// at \p Size bytes. The caller later moves it into place with keep(), or
+// discards it (e.g. after losing a store race).
+Expected<OnDiskGraphDB::MappedTempFile>
+OnDiskGraphDB::createTempFile(StringRef FinalPath, uint64_t Size) {
+  assert(Size && "Unexpected request for an empty temp file");
+  Expected<TempFile> File = TempFile::create(FinalPath + ".%%%%%%");
+  if (!File)
+    return File.takeError();
+
+  if (auto EC = sys::fs::resize_file_before_mapping_readwrite(File->FD, Size))
+    return createFileError(File->TmpName, EC);
+
+  std::error_code EC;
+  sys::fs::mapped_file_region Map(sys::fs::convertFDToNativeFile(File->FD),
+                                  sys::fs::mapped_file_region::readwrite, Size,
+                                  0, EC);
+  if (EC)
+    return createFileError(File->TmpName, EC);
+  return MappedTempFile(std::move(*File), std::move(Map));
+}
+
+// Return the (cached) page-size estimate, used to detect page-aligned leaf
+// data that needs the "Leaf0" standalone format.
+static size_t getPageSize() {
+  // Cache as size_t to avoid narrowing the estimate through int.
+  static size_t PageSize = sys::Process::getPageSizeEstimate();
+  return PageSize;
+}
+
+// Store \p Data as a standalone file on disk; used for leaf objects too large
+// to embed in the data pool. Page-aligned sizes use the "Leaf0" variant,
+// which appends an explicit null-terminator byte to the file.
+Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data) {
+  assert(Data.size() > TrieRecord::MaxEmbeddedSize &&
+         "Expected a bigger file for external content...");
+
+  bool Leaf0 = isAligned(Align(getPageSize()), Data.size());
+  TrieRecord::StorageKind SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
+                                     : TrieRecord::StorageKind::StandaloneLeaf;
+
+  SmallString<256> Path;
+  int64_t FileSize = Data.size() + Leaf0;
+  getStandalonePath(TrieRecord::getStandaloneFileSuffix(SK), I, Path);
+
+  // Write the file. Don't reuse this mapped_file_region, which is read/write.
+  // Let load() pull up one that's read-only.
+  Expected<MappedTempFile> File = createTempFile(Path, FileSize);
+  if (!File)
+    return File.takeError();
+  assert(File->size() == (uint64_t)FileSize);
+  llvm::copy(Data, File->data());
+  if (Leaf0)
+    File->data()[Data.size()] = 0;
+  assert(File->data()[Data.size()] == 0);
+  if (Error E = File->keep(Path))
+    return E;
+
+  // Store the object reference. Competing stores race on this
+  // compare-exchange; the loser's record is simply not installed.
+  TrieRecord::Data Existing;
+  {
+    TrieRecord::Data Leaf{SK, FileOffset()};
+    if (I.Ref.compare_exchange_strong(Existing, Leaf)) {
+      recordStandaloneSizeIncrease(FileSize);
+      return Error::success();
+    }
+  }
+
+  // If there was a race, confirm that the new value has valid storage.
+  if (Existing.SK == TrieRecord::StorageKind::Unknown)
+    return createCorruptObjectError(getDigest(I));
+
+  return Error::success();
+}
+
+// Store an object for \p ID with the given references and data. Records that
+// fit in TrieRecord::MaxEmbeddedSize bytes go into the data pool; bigger ones
+// are written to standalone files. Concurrent stores of the same object are
+// resolved with a compare-exchange on the index record; the losing copy is
+// discarded.
+Error OnDiskGraphDB::store(ObjectID ID, ArrayRef<ObjectID> Refs,
+                           ArrayRef<char> Data) {
+  IndexProxy I = getIndexProxyFromRef(getInternalRef(ID));
+
+  // Early return in case the node exists.
+  {
+    TrieRecord::Data Existing = I.Ref.load();
+    if (Existing.SK != TrieRecord::StorageKind::Unknown)
+      return Error::success();
+  }
+
+  // Big leaf nodes.
+  if (Refs.empty() && Data.size() > TrieRecord::MaxEmbeddedSize)
+    return createStandaloneLeaf(I, Data);
+
+  // TODO: Check whether it's worth checking the index for an already existing
+  // object (like storeTreeImpl() does) before building up the
+  // InternalRefVector.
+  InternalRefVector InternalRefs;
+  for (ObjectID Ref : Refs)
+    InternalRefs.push_back(getInternalRef(Ref));
+
+  // Create the object.
+
+  DataRecordHandle::Input Input{InternalRefs, Data};
+
+  // Compute the storage kind, allocate it, and create the record.
+  TrieRecord::StorageKind SK = TrieRecord::StorageKind::Unknown;
+  FileOffset PoolOffset;
+  SmallString<256> Path;
+  std::optional<MappedTempFile> File;
+  std::optional<uint64_t> FileSize;
+  // Allocator callback invoked by createWithError() once the record size is
+  // known; picks pool vs. standalone storage based on that size.
+  auto Alloc = [&](size_t Size) -> Expected<char *> {
+    if (Size <= TrieRecord::MaxEmbeddedSize) {
+      SK = TrieRecord::StorageKind::DataPool;
+      OnDiskDataAllocator::pointer P = DataPool.allocate(Size);
+      PoolOffset = P.getOffset();
+      LLVM_DEBUG({
+        dbgs() << "pool-alloc addr=" << (void *)PoolOffset.get()
+               << " size=" << Size
+               << " end=" << (void *)(PoolOffset.get() + Size) << "\n";
+      });
+      return P->data();
+    }
+
+    SK = TrieRecord::StorageKind::Standalone;
+    getStandalonePath(TrieRecord::getStandaloneFileSuffix(SK), I, Path);
+    if (Error E = createTempFile(Path, Size).moveInto(File))
+      return std::move(E);
+    assert(File->size() == Size);
+    FileSize = Size;
+    return File->data();
+  };
+  DataRecordHandle Record;
+  if (Error E =
+          DataRecordHandle::createWithError(Alloc, Input).moveInto(Record))
+    return E;
+  assert(Record.getData().end()[0] == 0 && "Expected null-termination");
+  assert(Record.getData() == Input.Data && "Expected initialization");
+  assert(SK != TrieRecord::StorageKind::Unknown);
+  assert(bool(File) != bool(PoolOffset) &&
+         "Expected either a mapped file or a pooled offset");
+
+  // Check for a race before calling MappedTempFile::keep().
+  //
+  // Then decide what to do with the file. Better to discard than overwrite if
+  // another thread/process has already added this.
+  TrieRecord::Data Existing = I.Ref.load();
+  {
+    TrieRecord::Data NewObject{SK, PoolOffset};
+    if (File) {
+      if (Existing.SK == TrieRecord::StorageKind::Unknown) {
+        // Keep the file!
+        if (Error E = File->keep(Path))
+          return E;
+      } else {
+        File.reset();
+      }
+    }
+
+    // If we didn't already see a racing/existing write, then try storing the
+    // new object. If that races, confirm that the new value has valid storage.
+    //
+    // TODO: Find a way to reuse the storage from the new-but-abandoned record
+    // handle.
+    if (Existing.SK == TrieRecord::StorageKind::Unknown) {
+      if (I.Ref.compare_exchange_strong(Existing, NewObject)) {
+        if (FileSize)
+          recordStandaloneSizeIncrease(*FileSize);
+        return Error::success();
+      }
+    }
+  }
+
+  if (Existing.SK == TrieRecord::StorageKind::Unknown)
+    return createCorruptObjectError(getDigest(I));
+
+  // Load existing object.
+  return Error::success();
+}
+
+// Account for bytes written to standalone files. The counter lives in the
+// data pool's user header (see getStandaloneStorageSize()).
+void OnDiskGraphDB::recordStandaloneSizeIncrease(size_t SizeIncrease) {
+  getStandaloneStorageSize().fetch_add(SizeIncrease, std::memory_order_relaxed);
+}
+
+// The standalone-storage size counter is stored in the data pool's mapped
+// user header; reinterpret it as an atomic after checking size and alignment.
+std::atomic<uint64_t> &OnDiskGraphDB::getStandaloneStorageSize() {
+  MutableArrayRef<uint8_t> UserHeader = DataPool.getUserHeader();
+  assert(UserHeader.size() == sizeof(std::atomic<uint64_t>));
+  assert(isAddrAligned(Align(8), UserHeader.data()));
+  return *reinterpret_cast<std::atomic<uint64_t> *>(UserHeader.data());
+}
+
+// Read-only accessor for the standalone-storage counter.
+uint64_t OnDiskGraphDB::getStandaloneStorageSize() const {
+  return const_cast<OnDiskGraphDB *>(this)->getStandaloneStorageSize().load(
+      std::memory_order_relaxed);
+}
+
+// Total storage: index file + data pool + standalone files.
+size_t OnDiskGraphDB::getStorageSize() const {
+  return Index.size() + DataPool.size() + getStandaloneStorageSize();
+}
+
+// Whether the database at \p P should use smaller mapped-file capacities.
+// Returns true only on Apple platforms when the path resides on tmpfs.
+static bool useSmallMappedFiles(const Twine &P) {
+  // macOS tmpfs does not support sparse tails.
+#if defined(__APPLE__) && __has_include(<sys/mount.h>)
+  SmallString<128> PathStorage;
+  StringRef Path = P.toNullTerminatedStringRef(PathStorage);
+  struct statfs StatFS;
+  if (statfs(Path.data(), &StatFS) != 0)
+    return false;
+
+  if (strcmp(StatFS.f_fstypename, "tmpfs") == 0)
+    return true;
+#endif
+
+  return false;
+}
+
+// Open (creating directories and files as needed) an on-disk graph database
+// at \p AbsPath. The index and data pool are separate mapped files; maximum
+// mapping sizes are reduced on filesystems flagged by useSmallMappedFiles()
+// and can be overridden globally (see getOverriddenMaxMappingSize()).
+Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open(
+    StringRef AbsPath, StringRef HashName, unsigned HashByteSize,
+    std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) {
+  if (std::error_code EC = sys::fs::create_directories(AbsPath))
+    return createFileError(AbsPath, EC);
+
+  const StringRef Slash = sys::path::get_separator();
+  constexpr uint64_t MB = 1024ull * 1024ull;
+  constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+
+  uint64_t MaxIndexSize = 8 * GB;
+  uint64_t MaxDataPoolSize = 16 * GB;
+
+  if (useSmallMappedFiles(AbsPath)) {
+    MaxIndexSize = 1 * GB;
+    MaxDataPoolSize = 2 * GB;
+  }
+
+  auto CustomSize = getOverriddenMaxMappingSize();
+  if (!CustomSize)
+    return CustomSize.takeError();
+  if (*CustomSize)
+    MaxIndexSize = MaxDataPoolSize = **CustomSize;
+
+  std::optional<OnDiskHashMappedTrie> Index;
+  if (Error E =
+          OnDiskHashMappedTrie::create(
+              AbsPath + Slash + FilePrefix + IndexFile,
+              IndexTableName + "[" + HashName + "]", HashByteSize * CHAR_BIT,
+              /*DataSize=*/sizeof(TrieRecord), MaxIndexSize, /*MinFileSize=*/MB)
+              .moveInto(Index))
+    return std::move(E);
+
+  // The user header stores the standalone-storage size counter; initialize it
+  // to zero for new files.
+  uint32_t UserHeaderSize = sizeof(std::atomic<uint64_t>);
+  std::optional<OnDiskDataAllocator> DataPool;
+  StringRef PolicyName =
+      Policy == FaultInPolicy::SingleNode ? "single" : "full";
+  if (Error E = OnDiskDataAllocator::create(
+                    AbsPath + Slash + FilePrefix + DataPoolFile,
+                    DataPoolTableName + "[" + HashName + "]" + PolicyName,
+                    MaxDataPoolSize, /*MinFileSize=*/MB, UserHeaderSize,
+                    [](void *UserHeaderPtr) {
+                      new (UserHeaderPtr) std::atomic<uint64_t>(0);
+                    })
+                    .moveInto(DataPool))
+    return std::move(E);
+  if (DataPool->getUserHeader().size() != UserHeaderSize)
+    return createStringError(llvm::errc::argument_out_of_domain,
+                             "unexpected user header in '" + AbsPath + Slash +
+                                 FilePrefix + DataPoolFile + "'");
+
+  return std::unique_ptr<OnDiskGraphDB>(
+      new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool),
+                        std::move(UpstreamDB), Policy));
+}
+
+// Constructed via open(), which creates the mapped index and data pool.
+OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskHashMappedTrie Index,
+                             OnDiskDataAllocator DataPool,
+                             std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+                             FaultInPolicy Policy)
+    : Index(std::move(Index)), DataPool(std::move(DataPool)),
+      RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)),
+      FIPolicy(Policy) {
+  /// Lifetime for "big" objects not in DataPool.
+  ///
+  /// NOTE: Could use ThreadSafeHashMappedTrie here. For now, doing something
+  /// simpler on the assumption there won't be much contention since most data
+  /// is not big. If there is contention, and we've already fixed ObjectProxy
+  /// object handles to be cheap enough to use consistently, the fix might be
+  /// to use better use of them rather than optimizing this map.
+  ///
+  /// FIXME: Figure out the right number of shards, if any.
+  StandaloneData = new StandaloneDataMapTy();
+}
+
+// StandaloneData is held as an opaque pointer; delete it through its real
+// type.
+OnDiskGraphDB::~OnDiskGraphDB() {
+  delete static_cast<StandaloneDataMapTy *>(StandaloneData);
+}
+
+Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
+                                    ObjectHandle UpstreamNode) {
+  // Copies the full CAS tree from upstream. Uses depth-first copying to protect
+  // against the process dying during importing and leaving the database with an
+  // incomplete tree. Note that if the upstream has missing nodes then the tree
+  // will be copied with missing nodes as well, it won't be considered an error.
+
+  struct UpstreamCursor {
+    ObjectHandle Node;
+    size_t RefsCount;
+    object_refs_iterator RefI;
+    object_refs_iterator RefE;
+  };
+  /// Keeps track of the state of visitation for current node and all of its
+  /// parents.
+  SmallVector<UpstreamCursor, 16> CursorStack;
+  /// Keeps track of the currently visited nodes as they are imported into
+  /// primary database, from current node and its parents. When a node is
+  /// entered for visitation it appends its own ID, then appends referenced IDs
+  /// as they get imported. When a node is fully imported it removes the
+  /// referenced IDs from the bottom of the stack which leaves its own ID at the
+  /// bottom, adding to the list of referenced IDs for the parent node.
+  SmallVector<ObjectID, 128> PrimaryNodesStack;
+
+  auto enqueueNode = [&](ObjectID PrimaryID, std::optional<ObjectHandle> Node) {
+    PrimaryNodesStack.push_back(PrimaryID);
+    if (!Node)
+      return;
+    auto Refs = UpstreamDB->getObjectRefs(*Node);
+    CursorStack.push_back({*Node,
+                           (size_t)std::distance(Refs.begin(), Refs.end()),
+                           Refs.begin(), Refs.end()});
+  };
+
+  enqueueNode(PrimaryID, UpstreamNode);
+
+  // Iterative post-order traversal: children are imported before their parent
+  // is stored.
+  while (!CursorStack.empty()) {
+    UpstreamCursor &Cur = CursorStack.back();
+    if (Cur.RefI == Cur.RefE) {
+      // Copy the node data into the primary store.
+      // FIXME: Use hard-link or cloning if the file-system supports it and data
+      // is stored into a separate file.
+
+      // The bottom of \p PrimaryNodesStack contains the primary ID for the
+      // current node plus the list of imported referenced IDs.
+      assert(PrimaryNodesStack.size() >= Cur.RefsCount + 1);
+      ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1);
+      auto PrimaryRefs = ArrayRef(PrimaryNodesStack)
+                             .slice(PrimaryNodesStack.size() - Cur.RefsCount);
+      auto Data = UpstreamDB->getObjectData(Cur.Node);
+      if (Error E = store(PrimaryID, PrimaryRefs, Data))
+        return E;
+      // Remove the current node and its IDs from the stack.
+      PrimaryNodesStack.truncate(PrimaryNodesStack.size() - Cur.RefsCount);
+      CursorStack.pop_back();
+      continue;
+    }
+
+    ObjectID UpstreamID = *(Cur.RefI++);
+    ObjectID PrimaryID = getReference(UpstreamDB->getDigest(UpstreamID));
+    if (containsObject(PrimaryID, /*CheckUpstream=*/false)) {
+      // This \p ObjectID already exists in the primary. Either it was imported
+      // via \p importFullTree or the client created it, in which case the
+      // client takes responsibility for how it was formed.
+      enqueueNode(PrimaryID, std::nullopt);
+      continue;
+    }
+    Expected<std::optional<ObjectHandle>> UpstreamNode =
+        UpstreamDB->load(UpstreamID);
+    if (!UpstreamNode)
+      return UpstreamNode.takeError();
+    enqueueNode(PrimaryID, *UpstreamNode);
+  }
+
+  assert(PrimaryNodesStack.size() == 1);
+  assert(PrimaryNodesStack.front() == PrimaryID);
+  return Error::success();
+}
+
+// Import one node's data and reference list from upstream; referenced nodes
+// are registered by digest but their data is not copied.
+Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
+                                      ObjectHandle UpstreamNode) {
+  // Copies only a single node, it doesn't copy the referenced nodes.
+
+  // Copy the node data into the primary store.
+  // FIXME: Use hard-link or cloning if the file-system supports it and data is
+  // stored into a separate file.
+
+  auto Data = UpstreamDB->getObjectData(UpstreamNode);
+  auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode);
+  SmallVector<ObjectID, 64> Refs;
+  Refs.reserve(std::distance(UpstreamRefs.begin(), UpstreamRefs.end()));
+  for (ObjectID UpstreamRef : UpstreamRefs)
+    Refs.push_back(getReference(UpstreamDB->getDigest(UpstreamRef)));
+
+  return store(PrimaryID, Refs, Data);
+}
+
+// Import the object for \p PrimaryID from the upstream database — either a
+// single node or the full tree, depending on the fault-in policy — and then
+// load it from the primary store.
+Expected<std::optional<ObjectHandle>>
+OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) {
+  assert(UpstreamDB);
+
+  ObjectID UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID));
+  Expected<std::optional<ObjectHandle>> UpstreamNode =
+      UpstreamDB->load(UpstreamID);
+  if (!UpstreamNode)
+    return UpstreamNode.takeError();
+  if (!*UpstreamNode)
+    return std::nullopt;
+
+  if (Error E = FIPolicy == FaultInPolicy::SingleNode
+                    ? importSingleNode(PrimaryID, **UpstreamNode)
+                    : importFullTree(PrimaryID, **UpstreamNode))
+    return std::move(E);
+  return load(PrimaryID);
+}
diff --git a/llvm/lib/CAS/OnDiskHashMappedTrie.cpp b/llvm/lib/CAS/OnDiskHashMappedTrie.cpp
new file mode 100644
index 0000000..1fd5dfe
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskHashMappedTrie.cpp
@@ -0,0 +1,1352 @@
+//===- OnDiskHashMappedTrie.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+#include "llvm/ADT/TrieHashIndexGenerator.h"
+#include "llvm/CAS/MappedFileRegionBumpPtr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+static_assert(sizeof(size_t) == sizeof(uint64_t), "64-bit only");
+static_assert(sizeof(std::atomic<int64_t>) == sizeof(uint64_t),
+ "Requires lock-free 64-bit atomics");
+
+//===----------------------------------------------------------------------===//
+// Generic database data structures.
+//===----------------------------------------------------------------------===//
+namespace {
+using MappedFileRegion = MappedFileRegionBumpPtr::RegionT;
+
+/// Generic handle for a table.
+///
+/// Probably we want some table kinds for pointing at multiple tables.
+/// - Probably a tree or trie type makes sense.
+/// - Or a deque. Linear search is okay as long as there aren't many tables in
+/// a file.
+///
+/// Generic table header layout:
+/// - 2-bytes: TableKind
+/// - 2-bytes: TableNameSize
+/// - 4-bytes: TableNameRelOffset (relative to header)
+class TableHandle {
+public:
+  enum class TableKind : uint16_t {
+    HashMappedTrie = 1,
+    DataAllocator = 2,
+  };
+  struct Header {
+    TableKind Kind;
+    uint16_t NameSize;
+    int32_t NameRelOffset; // Relative to Header.
+  };
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  // Compile-time check that T::Header begins with the generic table header,
+  // so a T::Header pointer can safely alias a TableHandle::Header pointer.
+  template <class T> static void check() {
+    static_assert(
+        std::is_same<decltype(T::Header::GenericHeader), Header>::value,
+        "T::GenericHeader should be of type TableHandle::Header");
+    static_assert(offsetof(typename T::Header, GenericHeader) == 0,
+                  "T::GenericHeader must be the head of T::Header");
+  }
+  // LLVM-style casting over the on-disk table-kind tag.
+  template <class T> bool is() const { return T::Kind == H->Kind; }
+  template <class T> T dyn_cast() const {
+    check<T>();
+    if (is<T>())
+      return T(*Region, *reinterpret_cast<typename T::Header *>(H));
+    return T();
+  }
+  template <class T> T cast() const {
+    assert(is<T>());
+    return dyn_cast<T>();
+  }
+
+  // The table name is stored out-of-line, at a relative offset from the
+  // header.
+  StringRef getName() const {
+    auto *Begin = reinterpret_cast<const char *>(H) + H->NameRelOffset;
+    return StringRef(Begin, H->NameSize);
+  }
+
+  TableHandle() = default;
+  TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {}
+  TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : TableHandle(Region,
+                    *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+/// Encapsulate a database file, which:
+/// - Sets/checks magic.
+/// - Sets/checks version.
+/// - Points at an arbitrary root table (can be changed later using a lock-free
+/// algorithm).
+/// - Sets up a BumpPtr for allocation.
+///
+/// Top-level layout:
+/// - 8-bytes: Magic
+/// - 8-bytes: Version
+/// - 8-bytes: RootTable (16-bits: Kind; 48-bits: Offset)
+/// - 8-bytes: BumpPtr
+class DatabaseFile {
+public:
+  static constexpr uint64_t getMagic() { return 0x00FFDA7ABA53FF00ULL; }
+  static constexpr uint64_t getVersion() { return 1ULL; }
+  struct Header {
+    uint64_t Magic;
+    uint64_t Version;
+    std::atomic<int64_t> RootTableOffset;
+    std::atomic<int64_t> BumpPtr;
+  };
+
+  const Header &getHeader() { return *H; }
+  MappedFileRegionBumpPtr &getAlloc() { return Alloc; }
+  MappedFileRegion &getRegion() { return Alloc.getRegion(); }
+
+  /// Add a table.
+  ///
+  /// TODO: Allow lazy construction via getOrCreate()-style API.
+  void addTable(TableHandle Table);
+
+  /// Find a table. May return null.
+  std::optional<TableHandle> findTable(StringRef Name);
+
+  static Expected<DatabaseFile>
+  create(const Twine &Path, uint64_t Capacity,
+         function_ref<Error(DatabaseFile &)> NewDBConstructor);
+
+  size_t size() const { return Alloc.size(); }
+
+private:
+  // Validate the mapped region before wrapping it in a DatabaseFile.
+  static Expected<DatabaseFile>
+  get(std::unique_ptr<MappedFileRegionBumpPtr> Alloc) {
+    if (Error E = validate(Alloc->getRegion()))
+      return std::move(E);
+    return DatabaseFile(std::move(Alloc));
+  }
+
+  static Error validate(MappedFileRegion &Region);
+
+  // Borrowing constructor: the header lives at offset 0 of the region.
+  DatabaseFile(MappedFileRegionBumpPtr &Alloc)
+      : H(reinterpret_cast<Header *>(Alloc.data())), Alloc(Alloc) {}
+  // Owning constructor: keeps the allocator alive alongside the handle.
+  DatabaseFile(std::unique_ptr<MappedFileRegionBumpPtr> Alloc)
+      : DatabaseFile(*Alloc) {
+    OwnedAlloc = std::move(Alloc);
+  }
+
+  Header *H = nullptr;
+  MappedFileRegionBumpPtr &Alloc;
+  std::unique_ptr<MappedFileRegionBumpPtr> OwnedAlloc;
+};
+
+} // end anonymous namespace
+
+// Open or create the database file at \p Path. \p NewDBConstructor runs only
+// when the underlying file is freshly created, after the generic header and
+// bump-pointer have been initialized.
+Expected<DatabaseFile>
+DatabaseFile::create(const Twine &Path, uint64_t Capacity,
+                     function_ref<Error(DatabaseFile &)> NewDBConstructor) {
+  // Constructor for if the file doesn't exist.
+  auto NewFileConstructor = [&](MappedFileRegionBumpPtr &Alloc) -> Error {
+    assert(Alloc.capacity() >= sizeof(Header));
+    (void)new (Alloc.data()) Header{getMagic(), getVersion(), {0}, {0}};
+    Alloc.initializeBumpPtr(offsetof(Header, BumpPtr));
+    DatabaseFile DB(Alloc);
+    return NewDBConstructor(DB);
+  };
+
+  // Get or create the file.
+  MappedFileRegionBumpPtr Alloc;
+  if (Error E = MappedFileRegionBumpPtr::create(Path, Capacity,
+                                                offsetof(Header, BumpPtr),
+                                                NewFileConstructor)
+                    .moveInto(Alloc))
+    return std::move(E);
+
+  return DatabaseFile::get(
+      std::make_unique<MappedFileRegionBumpPtr>(std::move(Alloc)));
+}
+
+// Install \p Table as the root table of the file. Only one root table is
+// currently supported; a different existing root is a fatal error.
+void DatabaseFile::addTable(TableHandle Table) {
+  assert(Table);
+  assert(&Table.getRegion() == &getRegion());
+  int64_t ExistingRootOffset = 0;
+  const int64_t NewOffset =
+      reinterpret_cast<const char *>(&Table.getHeader()) - getRegion().data();
+  // Race to publish our offset as the root; on failure ExistingRootOffset
+  // holds the winning value.
+  if (H->RootTableOffset.compare_exchange_strong(ExistingRootOffset, NewOffset))
+    return;
+
+  // Silently ignore attempts to set the root to itself.
+  if (ExistingRootOffset == NewOffset)
+    return;
+
+  // FIXME: Fix the API so that having the same name is not an error. Instead,
+  // the colliding table should just be used as-is and the client can decide
+  // what to do with the new one.
+  //
+  // TODO: Add support for creating a chain or tree of tables (more than one at
+  // all!) to avoid this error.
+  TableHandle Root(getRegion(), ExistingRootOffset);
+  if (Root.getName() == Table.getName())
+    report_fatal_error(
+        createStringError(make_error_code(std::errc::not_supported),
+                          "table name collision '" + Table.getName() + "'"));
+  else
+    report_fatal_error(
+        createStringError(make_error_code(std::errc::not_supported),
+                          "cannot add new table '" + Table.getName() +
+                              "'"
+                              " to existing root '" +
+                              Root.getName() + "'"));
+}
+
+// Look up a table by name. Only the single root table can currently match.
+std::optional<TableHandle> DatabaseFile::findTable(StringRef Name) {
+  int64_t RootTableOffset = H->RootTableOffset.load();
+  if (!RootTableOffset)
+    return std::nullopt;
+
+  TableHandle Root(getRegion(), RootTableOffset);
+  if (Root.getName() == Name)
+    return Root;
+
+  // TODO: Once multiple tables are supported, need to walk to find them.
+  return std::nullopt;
+}
+
+// Sanity-check a mapped region before use: size, magic, version, and that the
+// bump-pointer points past the header.
+Error DatabaseFile::validate(MappedFileRegion &Region) {
+  if (Region.size() < sizeof(Header))
+    return createStringError(std::errc::invalid_argument,
+                             "database: missing header");
+
+  // Check the magic and version.
+  auto *H = reinterpret_cast<Header *>(Region.data());
+  if (H->Magic != getMagic())
+    return createStringError(std::errc::invalid_argument,
+                             "database: bad magic");
+  if (H->Version != getVersion())
+    return createStringError(std::errc::invalid_argument,
+                             "database: wrong version");
+
+  // Check the bump-ptr, which should point past the header.
+  if (H->BumpPtr.load() < (int64_t)sizeof(Header))
+    return createStringError(std::errc::invalid_argument,
+                             "database: corrupt bump-ptr");
+
+  return Error::success();
+}
+
+//===----------------------------------------------------------------------===//
+// HashMappedTrie data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class SubtrieHandle;
+/// Value of a subtrie slot, stored on disk as a signed 64-bit offset.
+///
+/// Encoding:
+/// - zero: empty slot
+/// - positive: offset of a data record
+/// - negative: negated offset of a child subtrie
+class SubtrieSlotValue {
+public:
+  explicit operator bool() const { return !isEmpty(); }
+  bool isEmpty() const { return !Offset; }
+  bool isData() const { return Offset > 0; }
+  bool isSubtrie() const { return Offset < 0; }
+  int64_t asData() const {
+    assert(isData());
+    return Offset;
+  }
+  int64_t asSubtrie() const {
+    assert(isSubtrie());
+    return -Offset;
+  }
+
+  FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); }
+
+  FileOffset asDataFileOffset() const { return FileOffset(asData()); }
+
+  int64_t getRawOffset() const { return Offset; }
+
+  static SubtrieSlotValue getDataOffset(int64_t Offset) {
+    return SubtrieSlotValue(Offset);
+  }
+
+  static SubtrieSlotValue getSubtrieOffset(int64_t Offset) {
+    return SubtrieSlotValue(-Offset);
+  }
+
+  static SubtrieSlotValue getDataOffset(FileOffset Offset) {
+    return getDataOffset(Offset.get());
+  }
+
+  static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) {
+    // Delegate to the subtrie (negative) encoding; this previously forwarded
+    // to getDataOffset(), producing a positive "data" encoding that would
+    // trip the isSubtrie() assertion in callers like
+    // SubtrieHandle::getFromFileOffset().
+    return getSubtrieOffset(Offset.get());
+  }
+
+  static SubtrieSlotValue getFromSlot(std::atomic<int64_t> &Slot) {
+    return SubtrieSlotValue(Slot.load());
+  }
+
+  SubtrieSlotValue() = default;
+
+private:
+  friend class SubtrieHandle;
+  explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {}
+  int64_t Offset = 0;
+};
+
+class HashMappedTrieHandle;
+
+/// Subtrie layout:
+/// - 2-bytes: StartBit
+/// - 1-bytes: NumBits=lg(num-slots)
+/// - 1-bytes: NumUnusedBits=lg(num-slots-unused)
+/// - 4-bytes: 0-pad
+/// - <slots>
+class SubtrieHandle {
+public:
+  struct Header {
+    /// The bit this subtrie starts on.
+    uint16_t StartBit;
+
+    /// The number of bits this subtrie handles. It has 2^NumBits slots.
+    uint8_t NumBits;
+
+    /// The number of extra bits this allocation *could* handle, due to
+    /// over-allocation. It has 2^NumUnusedBits unused slots.
+    uint8_t NumUnusedBits;
+
+    /// 0-pad to 8B.
+    uint32_t ZeroPad4B;
+  };
+
+  /// Slot storage:
+  /// - zero: Empty
+  /// - positive: RecordOffset
+  /// - negative: SubtrieOffset
+  using SlotT = std::atomic<int64_t>;
+
+  static int64_t getSlotsSize(uint32_t NumBits) {
+    return sizeof(int64_t) * (1u << NumBits);
+  }
+
+  static int64_t getSize(uint32_t NumBits) {
+    return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits);
+  }
+
+  int64_t getSize() const { return getSize(H->NumBits); }
+
+  SubtrieSlotValue load(size_t I) const {
+    return SubtrieSlotValue(Slots[I].load());
+  }
+  void store(size_t I, SubtrieSlotValue V) {
+    return Slots[I].store(V.getRawOffset());
+  }
+
+  void printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const;
+  void print(raw_ostream &OS, HashMappedTrieHandle Trie,
+             SmallVectorImpl<int64_t> &Records,
+             std::optional<std::string> Prefix = std::nullopt) const;
+
+  /// Try to replace slot \p I's value \p Expected with \p New. Returns true
+  /// on success; on failure \p Expected is updated to the slot's current
+  /// value.
+  bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected,
+                               SubtrieSlotValue New) {
+    return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset);
+  }
+
+  /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with
+  /// \p NumSubtrieBits.
+  ///
+  /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If
+  /// it's already valid, it should be used instead of allocating a new one.
+  /// If a new subtrie is created that isn't used because of a lost race, it
+  /// is returned as an out parameter to be passed back in the future.
+  ///
+  /// Returns the subtrie that now lives at \p I.
+  SubtrieHandle sink(size_t I, SubtrieSlotValue V,
+                     MappedFileRegionBumpPtr &Alloc, size_t NumSubtrieBits,
+                     SubtrieHandle &UnusedSubtrie, size_t NewI);
+
+  /// Only safe if the subtrie is empty.
+  void reinitialize(uint32_t StartBit, uint32_t NumBits);
+
+  SubtrieSlotValue getOffset() const {
+    return SubtrieSlotValue::getSubtrieOffset(
+        reinterpret_cast<const char *>(H) - Region->data());
+  }
+
+  FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); }
+
+  explicit operator bool() const { return H; }
+
+  Header &getHeader() const { return *H; }
+  uint32_t getStartBit() const { return H->StartBit; }
+  uint32_t getNumBits() const { return H->NumBits; }
+  uint32_t getNumUnusedBits() const { return H->NumUnusedBits; }
+
+  static SubtrieHandle create(MappedFileRegionBumpPtr &Alloc, uint32_t StartBit,
+                              uint32_t NumBits, uint32_t NumUnusedBits = 0);
+
+  static SubtrieHandle getFromFileOffset(MappedFileRegion &Region,
+                                         FileOffset Offset) {
+    return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset));
+  }
+
+  SubtrieHandle() = default;
+  SubtrieHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H), Slots(getSlots(H)) {}
+  SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset)
+      : SubtrieHandle(Region, *reinterpret_cast<Header *>(
+                                  Region.data() + Offset.asSubtrie())) {}
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+  MutableArrayRef<SlotT> Slots;
+
+  // Slots are laid out immediately after the header.
+  static MutableArrayRef<SlotT> getSlots(Header &H) {
+    return MutableArrayRef(reinterpret_cast<SlotT *>(&H + 1), 1u << H.NumBits);
+  }
+};
+
+/// Handle for a HashMappedTrie table.
+///
+/// HashMappedTrie table layout:
+/// - [8-bytes: Generic table header]
+/// - 1-byte: NumSubtrieBits
+/// - 1-byte: Flags (not used yet)
+/// - 2-bytes: NumHashBits
+/// - 4-bytes: RecordDataSize (in bytes)
+/// - 8-bytes: RootTrieOffset
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - <name> '\0'
+///
+/// Record layout:
+/// - <data>
+/// - <hash>
+class HashMappedTrieHandle {
+public:
+  static constexpr TableHandle::TableKind Kind =
+      TableHandle::TableKind::HashMappedTrie;
+
+  /// On-disk header; fields mirror the layout documented above.
+  struct Header {
+    TableHandle::Header GenericHeader;
+    uint8_t NumSubtrieBits;
+    uint8_t Flags; // None used yet.
+    uint16_t NumHashBits;
+    uint32_t RecordDataSize;
+    // Atomic: updated via compare-exchange by concurrent users of the file.
+    std::atomic<int64_t> RootTrieOffset;
+    std::atomic<int64_t> AllocatorOffset;
+  };
+
+  /// Implicit conversion to the generic table handle.
+  operator TableHandle() const {
+    if (!H)
+      return TableHandle();
+    return TableHandle(*Region, H->GenericHeader);
+  }
+
+  /// A record (data + trailing hash) plus its offset in the file.
+  struct RecordData {
+    OnDiskHashMappedTrie::ValueProxy Proxy;
+    SubtrieSlotValue Offset;
+    FileOffset getFileOffset() const { return Offset.asDataFileOffset(); }
+  };
+
+  enum Limits : size_t {
+    /// Seems like 65528 hash bits ought to be enough.
+    MaxNumHashBytes = UINT16_MAX >> 3,
+    MaxNumHashBits = MaxNumHashBytes << 3,
+
+    /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit
+    /// index. This many slots is suspicously large anyway.
+    MaxNumRootBits = 16,
+
+    /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously
+    /// large for subtries.
+    MaxNumSubtrieBits = 10,
+  };
+
+  static constexpr size_t getNumHashBytes(size_t NumHashBits) {
+    assert(NumHashBits % 8 == 0);
+    return NumHashBits / 8;
+  }
+  static constexpr size_t getRecordSize(size_t RecordDataSize,
+                                        size_t NumHashBits) {
+    return RecordDataSize + getNumHashBytes(NumHashBits);
+  }
+
+  RecordData getRecord(SubtrieSlotValue Offset);
+  RecordData createRecord(MappedFileRegionBumpPtr &Alloc,
+                          ArrayRef<uint8_t> Hash);
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+  SubtrieHandle getRoot() const;
+  SubtrieHandle getOrCreateRoot(MappedFileRegionBumpPtr &Alloc);
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  size_t getFlags() const { return H->Flags; }
+  uint64_t getNumSubtrieBits() const { return H->NumSubtrieBits; }
+  uint64_t getNumHashBits() const { return H->NumHashBits; }
+  size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); }
+  size_t getRecordDataSize() const { return H->RecordDataSize; }
+  size_t getRecordSize() const {
+    return getRecordSize(H->RecordDataSize, H->NumHashBits);
+  }
+
+  /// Build an index generator for walking this trie with \p Hash, starting
+  /// at \p Root (which must cover bit 0).
+  TrieHashIndexGenerator getIndexGen(SubtrieHandle Root,
+                                     ArrayRef<uint8_t> Hash) {
+    assert(Root.getStartBit() == 0);
+    assert(getNumHashBytes() == Hash.size());
+    assert(getNumHashBits() == Hash.size() * 8);
+    return TrieHashIndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash};
+  }
+
+  static HashMappedTrieHandle
+  create(MappedFileRegionBumpPtr &Alloc, StringRef Name,
+         std::optional<uint64_t> NumRootBits, uint64_t NumSubtrieBits,
+         uint64_t NumHashBits, uint64_t RecordDataSize);
+
+  void
+  print(raw_ostream &OS,
+        function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+
+  HashMappedTrieHandle() = default;
+  HashMappedTrieHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H) {}
+  HashMappedTrieHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : HashMappedTrieHandle(
+            Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+/// Pimpl state: the owning database file plus a handle to the trie table
+/// inside it.
+struct OnDiskHashMappedTrie::ImplType {
+  DatabaseFile File;
+  HashMappedTrieHandle Trie;
+};
+
+/// Allocate a new subtrie covering [StartBit, StartBit+NumBits) and
+/// zero-initialize every slot so lookups see "empty".
+SubtrieHandle SubtrieHandle::create(MappedFileRegionBumpPtr &Alloc,
+                                    uint32_t StartBit, uint32_t NumBits,
+                                    uint32_t NumUnusedBits) {
+  assert(StartBit <= HashMappedTrieHandle::MaxNumHashBits);
+  assert(NumBits <= UINT8_MAX);
+  assert(NumUnusedBits <= UINT8_MAX);
+  assert(NumBits + NumUnusedBits <= HashMappedTrieHandle::MaxNumRootBits);
+
+  // Reserve room for the header plus all (used and unused) slots, then
+  // construct the header in place.
+  void *Mem = Alloc.allocate(getSize(NumBits + NumUnusedBits));
+  auto *NewHeader = new (Mem) SubtrieHandle::Header{
+      (uint16_t)StartBit, (uint8_t)NumBits, (uint8_t)NumUnusedBits,
+      /*ZeroPad4B=*/0};
+
+  SubtrieHandle Handle(Alloc.getRegion(), *NewHeader);
+  for (SlotT &Slot : Handle.Slots)
+    new (&Slot) SlotT(0);
+  return Handle;
+}
+
+/// Return the root subtrie, or a null handle if none has been created yet.
+SubtrieHandle HashMappedTrieHandle::getRoot() const {
+  // Atomically read the published root offset; zero means "no root yet".
+  int64_t RootOffset = H->RootTrieOffset;
+  if (!RootOffset)
+    return SubtrieHandle();
+  return SubtrieHandle(getRegion(),
+                       SubtrieSlotValue::getSubtrieOffset(RootOffset));
+}
+
+/// Return the root subtrie, creating and publishing one if it does not exist.
+/// Safe to call concurrently: losers of the publication race use the winner's
+/// root (and currently leak their candidate allocation).
+SubtrieHandle
+HashMappedTrieHandle::getOrCreateRoot(MappedFileRegionBumpPtr &Alloc) {
+  assert(&Alloc.getRegion() == &getRegion());
+  if (SubtrieHandle Root = getRoot())
+    return Root;
+
+  // Allocate a candidate root, then try to publish it with a single
+  // compare-exchange on the header (0 == "no root yet").
+  int64_t Race = 0;
+  SubtrieHandle LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits);
+  if (H->RootTrieOffset.compare_exchange_strong(
+          Race, LazyRoot.getOffset().asSubtrie()))
+    return LazyRoot;
+
+  // There was a race. Return the other root.
+  //
+  // TODO: Avoid leaking the lazy root by storing it in an allocator.
+  return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race));
+}
+
+/// Create a new trie table in \p Alloc: header, null-terminated name, and
+/// (optionally) an eagerly-constructed root subtrie.
+HashMappedTrieHandle
+HashMappedTrieHandle::create(MappedFileRegionBumpPtr &Alloc, StringRef Name,
+                             std::optional<uint64_t> NumRootBits,
+                             uint64_t NumSubtrieBits, uint64_t NumHashBits,
+                             uint64_t RecordDataSize) {
+  // Allocate room for the header plus the null-terminated name.
+  intptr_t Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1);
+
+  // Construct the header and the name.
+  assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+  assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits");
+  assert(NumHashBits <= UINT16_MAX && "Expected valid hash size");
+  // Fixed assert message: this guards the record data size, not the name
+  // (the previous message was copy-pasted from the Name.size() check).
+  assert(RecordDataSize <= UINT32_MAX && "Expected smaller record data size");
+  auto *H = new (Alloc.getRegion().data() + Offset)
+      Header{{TableHandle::TableKind::HashMappedTrie, (uint16_t)Name.size(),
+              (uint32_t)sizeof(Header)},
+             (uint8_t)NumSubtrieBits,
+             /*Flags=*/0,
+             (uint16_t)NumHashBits,
+             (uint32_t)RecordDataSize,
+             /*RootTrieOffset=*/{0},
+             /*AllocatorOffset=*/{0}};
+  char *NameStorage = reinterpret_cast<char *>(H + 1);
+  llvm::copy(Name, NameStorage);
+  NameStorage[Name.size()] = 0;
+
+  // Construct a root trie, if requested; otherwise it is created lazily on
+  // first insert.
+  HashMappedTrieHandle Trie(Alloc.getRegion(), *H);
+  if (NumRootBits)
+    H->RootTrieOffset =
+        SubtrieHandle::create(Alloc, 0, *NumRootBits).getOffset().asSubtrie();
+  return Trie;
+}
+
+/// Materialize a record view from its data offset. Records are laid out as
+/// <data><hash>, with \p Offset pointing at the data.
+HashMappedTrieHandle::RecordData
+HashMappedTrieHandle::getRecord(SubtrieSlotValue Offset) {
+  char *DataBegin = Region->data() + Offset.asData();
+  const uint8_t *HashBegin =
+      reinterpret_cast<const uint8_t *>(DataBegin + getRecordDataSize());
+  OnDiskHashMappedTrie::ValueProxy Proxy;
+  Proxy.Data = MutableArrayRef(DataBegin, getRecordDataSize());
+  Proxy.Hash = ArrayRef(HashBegin, getNumHashBytes());
+  return RecordData{Proxy, Offset};
+}
+
+/// Allocate a fresh record and write its hash. The data portion is left for
+/// the caller to fill in.
+HashMappedTrieHandle::RecordData
+HashMappedTrieHandle::createRecord(MappedFileRegionBumpPtr &Alloc,
+                                   ArrayRef<uint8_t> Hash) {
+  assert(&Alloc.getRegion() == Region);
+  assert(Hash.size() == getNumHashBytes());
+
+  // Bump-allocate space for a full record and view it through getRecord().
+  intptr_t NewOffset = Alloc.allocateOffset(getRecordSize());
+  RecordData Record = getRecord(SubtrieSlotValue::getDataOffset(NewOffset));
+
+  // The hash is written exactly once, here.
+  llvm::copy(Hash, const_cast<uint8_t *>(Record.Proxy.Hash.begin()));
+  return Record;
+}
+
+/// Recover a record from a pointer to its hash bytes, validating bounds and
+/// alignment via recoverFromFileOffset().
+OnDiskHashMappedTrie::const_pointer
+OnDiskHashMappedTrie::recoverFromHashPointer(
+    const uint8_t *HashBeginPtr) const {
+  // Record hashes occur immediately after data (layout: <data><hash>), so the
+  // record begins getRecordDataSize() bytes before the hash -- not a full
+  // record size before it. Subtracting getRecordSize() (data + hash) would
+  // land getNumHashBytes() before the true record start and recover a bogus
+  // record.
+  const uintptr_t HashBegin = reinterpret_cast<uintptr_t>(HashBeginPtr);
+  const uintptr_t RecordBegin = HashBegin - Impl->Trie.getRecordDataSize();
+
+  // Check for underflow (wraparound makes RecordBegin compare larger).
+  if (HashBegin < RecordBegin)
+    return const_pointer();
+
+  // Check that it'll be a positive offset.
+  const uintptr_t FileBegin =
+      reinterpret_cast<uintptr_t>(Impl->File.getRegion().data());
+  if (RecordBegin < FileBegin)
+    return const_pointer();
+
+  // Good enough to form an offset. Continue checking there.
+  return recoverFromFileOffset(FileOffset(RecordBegin - FileBegin));
+}
+
+/// Recover a record from its file offset, returning a null pointer (rather
+/// than asserting) if the offset is misaligned or out of bounds.
+OnDiskHashMappedTrie::const_pointer
+OnDiskHashMappedTrie::recoverFromFileOffset(FileOffset Offset) const {
+  // Check alignment.
+  if (!isAligned(MappedFileRegionBumpPtr::getAlign(), Offset.get()))
+    return const_pointer();
+
+  // Check bounds.
+  //
+  // Note: There's no potential overflow when using \c uint64_t because Offset
+  // is in \c [0,INT64_MAX] and the record size is in \c [0,UINT32_MAX].
+  assert(Offset.get() >= 0 && "Expected FileOffset constructor guarantee this");
+  if ((uint64_t)Offset.get() + Impl->Trie.getRecordSize() >
+      Impl->File.getAlloc().size())
+    return const_pointer();
+
+  // Looks okay...
+  HashMappedTrieHandle::RecordData D =
+      Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset));
+  return const_pointer(D.getFileOffset(), D.Proxy);
+}
+
+/// Look up \p Hash. On a miss, returns a pointer carrying a hint (subtrie
+/// offset, slot index, start bit) that insertLazy() can use to resume the
+/// walk without starting over at the root.
+OnDiskHashMappedTrie::const_pointer
+OnDiskHashMappedTrie::find(ArrayRef<uint8_t> Hash) const {
+  HashMappedTrieHandle Trie = Impl->Trie;
+  assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+
+  SubtrieHandle S = Trie.getRoot();
+  if (!S)
+    return const_pointer();
+
+  // Walk from the root, consuming hash bits to index each subtrie.
+  TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash);
+  size_t Index = IndexGen.next();
+  for (;;) {
+    // Try to set the content.
+    SubtrieSlotValue V = S.load(Index);
+    if (!V)
+      return const_pointer(S.getFileOffset(),
+                           HintT(this, Index, *IndexGen.StartBit));
+
+    // Check for an exact match.
+    if (V.isData()) {
+      HashMappedTrieHandle::RecordData D = Trie.getRecord(V);
+      return D.Proxy.Hash == Hash
+                 ? const_pointer(D.getFileOffset(), D.Proxy)
+                 : const_pointer(S.getFileOffset(),
+                                 HintT(this, Index, *IndexGen.StartBit));
+    }
+
+    // Slot holds a subtrie: descend and consume the next chunk of bits.
+    Index = IndexGen.next();
+    S = SubtrieHandle(Trie.getRegion(), V);
+  }
+}
+
+/// Only safe if the subtrie is empty. The subtrie may only move deeper
+/// (larger start bit) and cover no more bits than before.
+void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) {
+  assert(StartBit > H->StartBit);
+  assert(NumBits <= H->NumBits);
+  // Ideally would also assert that all slots are empty, but that's expensive.
+
+  H->StartBit = StartBit;
+  H->NumBits = NumBits;
+}
+
+/// Insert \p Hash, constructing the record lazily via \p OnConstruct only
+/// once an empty slot is found. Lock-free: concurrent inserters race with
+/// compare-exchange; a loser whose record matches the winner's reports the
+/// leaked record through \p OnLeak.
+OnDiskHashMappedTrie::pointer
+OnDiskHashMappedTrie::insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
+                                 LazyInsertOnConstructCB OnConstruct,
+                                 LazyInsertOnLeakCB OnLeak) {
+  HashMappedTrieHandle Trie = Impl->Trie;
+  assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+
+  MappedFileRegionBumpPtr &Alloc = Impl->File.getAlloc();
+  SubtrieHandle S = Trie.getOrCreateRoot(Alloc);
+  TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash);
+
+  // If the caller provided a valid hint (e.g. from a failed find()), resume
+  // the walk at the hinted subtrie/slot instead of the root.
+  size_t Index;
+  if (std::optional<HintT> H = Hint.getHint(*this)) {
+    S = SubtrieHandle::getFromFileOffset(Trie.getRegion(), Hint.getOffset());
+    Index = IndexGen.hint(H->I, H->B);
+  } else {
+    Index = IndexGen.next();
+  }
+
+  // FIXME: Add non-assertion based checks for data corruption that would
+  // otherwise cause infinite loops in release builds, instead calling
+  // report_fatal_error().
+  //
+  // Two loops are possible:
+  // - All bits used up in the IndexGenerator because subtries are somehow
+  //   linked in a cycle. Could confirm that each subtrie's start-bit
+  //   follows from the start-bit and num-bits of its parent. Could also check
+  //   that the generator doesn't run out of bits.
+  // - Existing data matches tail of Hash but not the head (stored in an
+  //   invalid spot). Probably a cheap way to check this too, but needs
+  //   thought.
+  std::optional<HashMappedTrieHandle::RecordData> NewRecord;
+  SubtrieHandle UnusedSubtrie;
+  for (;;) {
+    SubtrieSlotValue Existing = S.load(Index);
+
+    // Try to set it, if it's empty.
+    if (!Existing) {
+      // Construct the record at most once, even across CAS retries.
+      if (!NewRecord) {
+        NewRecord = Trie.createRecord(Alloc, Hash);
+        if (OnConstruct)
+          OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy);
+      }
+
+      if (S.compare_exchange_strong(Index, Existing, NewRecord->Offset))
+        return pointer(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy);
+
+      // Race means that Existing is no longer empty; fall through...
+    }
+
+    if (Existing.isSubtrie()) {
+      S = SubtrieHandle(Trie.getRegion(), Existing);
+      Index = IndexGen.next();
+      continue;
+    }
+
+    // Check for an exact match.
+    HashMappedTrieHandle::RecordData ExistingRecord = Trie.getRecord(Existing);
+    if (ExistingRecord.Proxy.Hash == Hash) {
+      // A racing inserter already stored this hash; our record (if any) leaks.
+      if (NewRecord && OnLeak)
+        OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy,
+               ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy);
+      return pointer(ExistingRecord.Offset.asDataFileOffset(),
+                     ExistingRecord.Proxy);
+    }
+
+    // Sink the existing content as long as the indexes match.
+    for (;;) {
+      size_t NextIndex = IndexGen.next();
+      size_t NewIndexForExistingContent =
+          IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash);
+
+      S = S.sink(Index, Existing, Alloc, IndexGen.getNumBits(), UnusedSubtrie,
+                 NewIndexForExistingContent);
+      Index = NextIndex;
+
+      // Found the difference.
+      if (NextIndex != NewIndexForExistingContent)
+        break;
+    }
+  }
+}
+
+/// Push the value \p V at slot \p I down into a subtrie one level deeper,
+/// placing it at slot \p NewI there. If a racing sink() wins, park our
+/// freshly-built subtrie in \p UnusedSubtrie for reuse and return the
+/// winner's subtrie instead.
+SubtrieHandle SubtrieHandle::sink(size_t I, SubtrieSlotValue V,
+                                  MappedFileRegionBumpPtr &Alloc,
+                                  size_t NumSubtrieBits,
+                                  SubtrieHandle &UnusedSubtrie, size_t NewI) {
+  SubtrieHandle NewS;
+  if (UnusedSubtrie) {
+    // Steal UnusedSubtrie and initialize it.
+    std::swap(NewS, UnusedSubtrie);
+    NewS.reinitialize(getStartBit() + getNumBits(), NumSubtrieBits);
+  } else {
+    // Allocate a new, empty subtrie.
+    NewS = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(),
+                                 NumSubtrieBits);
+  }
+
+  // Store the sunk value first, then publish the subtrie with a CAS.
+  NewS.store(NewI, V);
+  if (compare_exchange_strong(I, V, NewS.getOffset()))
+    return NewS; // Success!
+
+  // Raced.
+  assert(V.isSubtrie() && "Expected racing sink() to add a subtrie");
+
+  // Wipe out the new slot so NewS can be reused and set the out parameter.
+  NewS.store(NewI, SubtrieSlotValue());
+  UnusedSubtrie = NewS;
+
+  // Return the subtrie added by the concurrent sink() call.
+  return SubtrieHandle(Alloc.getRegion(), V);
+}
+
+/// Print the trie structure and all records to \p OS, delegating to the
+/// table handle.
+void OnDiskHashMappedTrie::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+  Impl->Trie.print(OS, PrintRecordData);
+}
+
+/// Print a single nibble value (0-15) as a lowercase hex character.
+static void printHexDigit(raw_ostream &OS, uint8_t Digit) {
+  OS << char(Digit < 10 ? '0' + Digit : 'a' + (Digit - 10));
+}
+
+/// Print the hex digits covering bits [StartBit, StartBit+NumBits) of
+/// \p Bytes. Both bounds must be nibble-aligned.
+static void printHexDigits(raw_ostream &OS, ArrayRef<uint8_t> Bytes,
+                           size_t StartBit, size_t NumBits) {
+  assert(StartBit % 4 == 0);
+  assert(NumBits % 4 == 0);
+  for (size_t Bit = StartBit, End = StartBit + NumBits; Bit != End; Bit += 4) {
+    uint8_t Byte = Bytes[Bit / 8];
+    // High nibble for byte-aligned positions, low nibble otherwise.
+    printHexDigit(OS, Bit % 8 == 0 ? Byte >> 4 : Byte & 0xf);
+  }
+}
+
+/// Print the table parameters, the trie structure, and finally every record
+/// sorted by file offset.
+void HashMappedTrieHandle::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+  OS << "hash-num-bits=" << getNumHashBits()
+     << " hash-size=" << getNumHashBytes()
+     << " record-data-size=" << getRecordDataSize() << "\n";
+  SubtrieHandle Root = getRoot();
+
+  // Recursively print the subtrie structure, collecting record offsets.
+  SmallVector<int64_t> Records;
+  if (Root)
+    Root.print(OS, *this, Records);
+
+  if (Records.empty())
+    return;
+  llvm::sort(Records);
+  OS << "records\n";
+  // getRecord() is non-const, so make one non-const copy of this handle up
+  // front rather than one per record inside the loop.
+  HashMappedTrieHandle Trie = *this;
+  for (int64_t Offset : Records) {
+    OS << "- addr=" << (void *)Offset << " ";
+    HashMappedTrieHandle::RecordData Record =
+        Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset));
+    if (PrintRecordData) {
+      PrintRecordData(Record.Proxy.Data);
+    } else {
+      OS << "bytes=";
+      ArrayRef<uint8_t> Data(
+          reinterpret_cast<const uint8_t *>(Record.Proxy.Data.data()),
+          Record.Proxy.Data.size());
+      printHexDigits(OS, Data, 0, Data.size() * 8);
+    }
+    OS << "\n";
+  }
+}
+
+/// Print bits [StartBit, StartBit+NumBits) of \p Bytes as '0'/'1'
+/// characters, most-significant bit of each byte first.
+static void printBits(raw_ostream &OS, ArrayRef<uint8_t> Bytes, size_t StartBit,
+                      size_t NumBits) {
+  assert(StartBit + NumBits <= Bytes.size() * 8u);
+  for (size_t Bit = StartBit, End = StartBit + NumBits; Bit != End; ++Bit) {
+    // Bit offset 0 within a byte is its most-significant bit.
+    size_t Shift = 7 - Bit % 8;
+    OS << ((Bytes[Bit / 8] >> Shift) & 0x1 ? '1' : '0');
+  }
+}
+
+/// Print a hash with the bits this subtrie indexes shown in binary inside
+/// brackets, and the rest in hex, e.g. afb[1c:00*01110*0]def.
+void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const {
+  // afb[1c:00*01110*0]def
+  size_t EndBit = getStartBit() + getNumBits();
+  size_t HashEndBit = Bytes.size() * 8u;
+
+  // Leading hex, up to the nibble boundary at or before the start bit.
+  size_t FirstBinaryBit = getStartBit() & ~0x3u;
+  printHexDigits(OS, Bytes, 0, FirstBinaryBit);
+
+  // Covered bits (rounded out to nibble boundaries) in binary.
+  size_t LastBinaryBit = (EndBit + 3u) & ~0x3u;
+  OS << "[";
+  printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit);
+  OS << "]";
+
+  // Trailing hex.
+  printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit);
+}
+
+/// Append the binary digits of \p Index to \p Prefix, most-significant
+/// first, using as many bits as needed to index \p NumSlots slots.
+static void appendIndexBits(std::string &Prefix, size_t Index,
+                            size_t NumSlots) {
+  // Count the bits required to index NumSlots slots.
+  size_t NumIndexBits = 0;
+  for (size_t Slots = 1u; Slots < NumSlots; Slots <<= 1)
+    ++NumIndexBits;
+
+  // Emit directly in most-significant-first order (no reversal needed).
+  for (size_t Bit = NumIndexBits; Bit-- > 0;)
+    Prefix.push_back('0' + ((Index >> Bit) & 0x1));
+}
+
+/// Print a binary-string prefix compactly: every full group of 4 binary
+/// characters becomes one hex digit; a trailing partial group is printed
+/// verbatim as "[bits]".
+static void printPrefix(raw_ostream &OS, StringRef Prefix) {
+  for (; Prefix.size() >= 4; Prefix = Prefix.drop_front(4)) {
+    uint8_t Digit;
+    bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Digit);
+    assert(!ErrorParsingBinary);
+    (void)ErrorParsingBinary;
+    printHexDigit(OS, Digit);
+  }
+  if (!Prefix.empty())
+    OS << "[" << Prefix << "]";
+}
+
+/// Recursively print this subtrie: its occupied slots, then each child
+/// subtrie in slot order. Data offsets are appended to \p Records so the
+/// caller can print the records afterwards. \p Prefix carries the binary
+/// path from the root (std::nullopt means "this is the root").
+void SubtrieHandle::print(raw_ostream &OS, HashMappedTrieHandle Trie,
+                          SmallVectorImpl<int64_t> &Records,
+                          std::optional<std::string> Prefix) const {
+  if (!Prefix) {
+    OS << "root";
+    Prefix.emplace();
+  } else {
+    OS << "subtrie=";
+    printPrefix(OS, *Prefix);
+  }
+
+  // "addr" is the header's offset within the mapped region.
+  OS << " addr="
+     << (void *)(reinterpret_cast<const char *>(H) - Region->data());
+
+  const size_t NumSlots = Slots.size();
+  OS << " num-slots=" << NumSlots << "\n";
+  SmallVector<SubtrieHandle> Subs;
+  SmallVector<std::string> Prefixes;
+  for (size_t I = 0, E = NumSlots; I != E; ++I) {
+    SubtrieSlotValue Slot = load(I);
+    if (!Slot)
+      continue;
+    OS << "- index=";
+    // Zero-pad the index so entries line up for larger slot counts.
+    for (size_t Pad : {10, 100, 1000})
+      if (I < Pad && NumSlots >= Pad)
+        OS << "0";
+    OS << I << " ";
+    if (Slot.isSubtrie()) {
+      // Defer printing children until all slots of this subtrie are done.
+      SubtrieHandle S(*Region, Slot);
+      std::string SubtriePrefix = *Prefix;
+      appendIndexBits(SubtriePrefix, I, NumSlots);
+      OS << "addr=" << (void *)Slot.asSubtrie();
+      OS << " subtrie=";
+      printPrefix(OS, SubtriePrefix);
+      OS << "\n";
+      Subs.push_back(S);
+      Prefixes.push_back(SubtriePrefix);
+      continue;
+    }
+    Records.push_back(Slot.asData());
+    HashMappedTrieHandle::RecordData Record = Trie.getRecord(Slot);
+    OS << "addr=" << (void *)Record.getFileOffset().get();
+    OS << " content=";
+    printHash(OS, Record.Proxy.Hash);
+    OS << "\n";
+  }
+  for (size_t I = 0, E = Subs.size(); I != E; ++I)
+    Subs[I].print(OS, Trie, Records, Prefixes[I]);
+}
+
+// Debugger aid: dump the whole trie to dbgs().
+LLVM_DUMP_METHOD void OnDiskHashMappedTrie::dump() const { print(dbgs()); }
+
+/// Build a table-configuration error of the form "<path>[<table>]: <msg>".
+static Error createTableConfigError(std::errc ErrC, StringRef Path,
+                                    StringRef TableName, const Twine &Msg) {
+  return createStringError(make_error_code(ErrC),
+                           Path + "[" + TableName + "]: " + Msg);
+}
+
+/// Validate an optional table parameter against \p Max, falling back to
+/// \p Default when no value was provided. Returns the chosen value or a
+/// descriptive configuration error.
+static Expected<size_t> checkParameter(StringRef Label, size_t Max,
+                                       std::optional<size_t> Value,
+                                       std::optional<size_t> Default,
+                                       StringRef Path, StringRef TableName) {
+  assert(Value || Default);
+  assert(!Default || *Default <= Max);
+  if (!Value)
+    return *Default;
+  if (*Value > Max)
+    return createTableConfigError(
+        std::errc::argument_out_of_domain, Path, TableName,
+        "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) +
+            ")");
+  return *Value;
+}
+
+/// Verify that an observed table property matches the expected value,
+/// producing a descriptive mismatch error otherwise.
+static Error checkTable(StringRef Label, size_t Expected, size_t Observed,
+                        StringRef Path, StringRef TrieName) {
+  if (Expected != Observed)
+    return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+                                  "mismatched " + Label +
+                                      " (expected: " + Twine(Expected) +
+                                      ", observed: " + Twine(Observed) + ")");
+  return Error::success();
+}
+
+/// Size of the backing database file, in bytes.
+size_t OnDiskHashMappedTrie::size() const { return Impl->File.size(); }
+
+/// Open (or create) an on-disk trie at \p PathTwine, validating that any
+/// existing table matches the requested configuration.
+Expected<OnDiskHashMappedTrie>
+OnDiskHashMappedTrie::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+                             size_t NumHashBits, uint64_t DataSize,
+                             uint64_t MaxFileSize,
+                             std::optional<uint64_t> NewFileInitialSize,
+                             std::optional<size_t> NewTableNumRootBits,
+                             std::optional<size_t> NewTableNumSubtrieBits) {
+  SmallString<128> PathStorage;
+  StringRef Path = PathTwine.toStringRef(PathStorage);
+  SmallString<128> TrieNameStorage;
+  StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage);
+
+  constexpr size_t DefaultNumRootBits = 10;
+  constexpr size_t DefaultNumSubtrieBits = 6;
+
+  // Validate (or default) the trie-shape parameters.
+  size_t NumRootBits;
+  if (Error E = checkParameter(
+                    "root bits", HashMappedTrieHandle::MaxNumRootBits,
+                    NewTableNumRootBits, DefaultNumRootBits, Path, TrieName)
+                    .moveInto(NumRootBits))
+    return std::move(E);
+
+  size_t NumSubtrieBits;
+  if (Error E = checkParameter("subtrie bits",
+                               HashMappedTrieHandle::MaxNumSubtrieBits,
+                               NewTableNumSubtrieBits, DefaultNumSubtrieBits,
+                               Path, TrieName)
+                    .moveInto(NumSubtrieBits))
+    return std::move(E);
+
+  // The hash size must be in range and byte-aligned; the alignment check is
+  // repeated at runtime so release builds also report it as an error.
+  size_t NumHashBytes = NumHashBits >> 3;
+  if (Error E =
+          checkParameter("hash size", HashMappedTrieHandle::MaxNumHashBits,
+                         NumHashBits, std::nullopt, Path, TrieName)
+              .takeError())
+    return std::move(E);
+  assert(NumHashBits == NumHashBytes << 3 &&
+         "Expected hash size to be byte-aligned");
+  if (NumHashBits != NumHashBytes << 3)
+    return createTableConfigError(
+        std::errc::argument_out_of_domain, Path, TrieName,
+        "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)");
+
+  // Constructor for if the file doesn't exist.
+  auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+    HashMappedTrieHandle Trie =
+        HashMappedTrieHandle::create(DB.getAlloc(), TrieName, NumRootBits,
+                                     NumSubtrieBits, NumHashBits, DataSize);
+    DB.addTable(Trie);
+    return Error::success();
+  };
+
+  // Get or create the file.
+  Expected<DatabaseFile> File =
+      DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+  if (!File)
+    return File.takeError();
+
+  // Find the trie and validate it.
+  //
+  // TODO: Add support for creating/adding a table to an existing file.
+  std::optional<TableHandle> Table = File->findTable(TrieName);
+  if (!Table)
+    return createTableConfigError(std::errc::argument_out_of_domain, Path,
+                                  TrieName, "table not found");
+  if (Error E = checkTable("table kind", (size_t)HashMappedTrieHandle::Kind,
+                           (size_t)Table->getHeader().Kind, Path, TrieName))
+    return std::move(E);
+  auto Trie = Table->cast<HashMappedTrieHandle>();
+  assert(Trie && "Already checked the kind");
+
+  // Check the hash and data size.
+  if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(),
+                           Path, TrieName))
+    return std::move(E);
+  if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(),
+                           Path, TrieName))
+    return std::move(E);
+
+  // No flags supported right now. Either corrupt, or coming from a future
+  // writer.
+  if (size_t Flags = Trie.getFlags())
+    return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+                                  "unsupported flags: " + Twine(Flags));
+
+  // Success.
+  OnDiskHashMappedTrie::ImplType Impl{DatabaseFile(std::move(*File)), Trie};
+  return OnDiskHashMappedTrie(std::make_unique<ImplType>(std::move(Impl)));
+}
+
+//===----------------------------------------------------------------------===//
+// DataAllocator data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// DataAllocator table layout:
+/// - [8-bytes: Generic table header]
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - 8-bytes: Size for user data header
+/// - <user data buffer>
+///
+/// Record layout:
+/// - <data>
+class DataAllocatorHandle {
+public:
+  static constexpr TableHandle::TableKind Kind =
+      TableHandle::TableKind::DataAllocator;
+
+  /// On-disk header; fields mirror the layout documented above.
+  struct Header {
+    TableHandle::Header GenericHeader;
+    std::atomic<int64_t> AllocatorOffset;
+    const uint64_t UserHeaderSize;
+  };
+
+  /// Implicit conversion to the generic table handle.
+  operator TableHandle() const {
+    if (!H)
+      return TableHandle();
+    return TableHandle(*Region, H->GenericHeader);
+  }
+
+  /// Bump-allocate \p DataSize bytes from the region backing this table.
+  MutableArrayRef<char> allocate(MappedFileRegionBumpPtr &Alloc,
+                                 size_t DataSize) {
+    assert(&Alloc.getRegion() == Region);
+    return MutableArrayRef(Alloc.allocate(DataSize), DataSize);
+  }
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  // The user header buffer immediately follows the table header.
+  MutableArrayRef<uint8_t> getUserHeader() {
+    return MutableArrayRef(reinterpret_cast<uint8_t *>(H + 1),
+                           H->UserHeaderSize);
+  }
+
+  static DataAllocatorHandle create(MappedFileRegionBumpPtr &Alloc,
+                                    StringRef Name, uint32_t UserHeaderSize);
+
+  DataAllocatorHandle() = default;
+  DataAllocatorHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H) {}
+  DataAllocatorHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : DataAllocatorHandle(
+            Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+/// Pimpl state: the owning database file plus a handle to the allocator
+/// table inside it.
+struct OnDiskDataAllocator::ImplType {
+  DatabaseFile File;
+  DataAllocatorHandle Store;
+};
+
+/// Create a new allocator table in \p Alloc: header, zeroed user header, and
+/// null-terminated table name.
+DataAllocatorHandle DataAllocatorHandle::create(MappedFileRegionBumpPtr &Alloc,
+                                                StringRef Name,
+                                                uint32_t UserHeaderSize) {
+  // Allocate room for the header, the user header, and the name.
+  intptr_t Offset =
+      Alloc.allocateOffset(sizeof(Header) + UserHeaderSize + Name.size() + 1);
+
+  // Construct the header and the name.
+  assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+  // Cast to uint32_t for consistency with HashMappedTrieHandle::create(),
+  // which initializes the same generic-header field.
+  auto *H = new (Alloc.getRegion().data() + Offset)
+      Header{{TableHandle::TableKind::DataAllocator, (uint16_t)Name.size(),
+              (uint32_t)(sizeof(Header) + UserHeaderSize)},
+             /*AllocatorOffset=*/{0},
+             /*UserHeaderSize=*/UserHeaderSize};
+  // Zero the user header so clients observe a deterministic initial state.
+  memset(H + 1, 0, UserHeaderSize);
+  char *NameStorage = reinterpret_cast<char *>(H + 1) + UserHeaderSize;
+  llvm::copy(Name, NameStorage);
+  NameStorage[Name.size()] = 0;
+  return DataAllocatorHandle(Alloc.getRegion(), *H);
+}
+
+/// Open (or create) an on-disk data allocator at \p PathTwine. For a new
+/// file, \p UserHeaderInit is invoked to initialize the user header.
+Expected<OnDiskDataAllocator> OnDiskDataAllocator::create(
+    const Twine &PathTwine, const Twine &TableNameTwine, uint64_t MaxFileSize,
+    std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize,
+    function_ref<void(void *)> UserHeaderInit) {
+  assert(!UserHeaderSize || UserHeaderInit);
+  SmallString<128> PathStorage;
+  StringRef Path = PathTwine.toStringRef(PathStorage);
+  SmallString<128> TableNameStorage;
+  StringRef TableName = TableNameTwine.toStringRef(TableNameStorage);
+
+  // Constructor for if the file doesn't exist.
+  auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+    DataAllocatorHandle Store =
+        DataAllocatorHandle::create(DB.getAlloc(), TableName, UserHeaderSize);
+    DB.addTable(Store);
+    if (UserHeaderSize)
+      UserHeaderInit(Store.getUserHeader().data());
+    return Error::success();
+  };
+
+  // Get or create the file.
+  Expected<DatabaseFile> File =
+      DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+  if (!File)
+    return File.takeError();
+
+  // Find the table and validate it.
+  //
+  // TODO: Add support for creating/adding a table to an existing file.
+  std::optional<TableHandle> Table = File->findTable(TableName);
+  if (!Table)
+    return createTableConfigError(std::errc::argument_out_of_domain, Path,
+                                  TableName, "table not found");
+  if (Error E = checkTable("table kind", (size_t)DataAllocatorHandle::Kind,
+                           (size_t)Table->getHeader().Kind, Path, TableName))
+    return std::move(E);
+  auto Store = Table->cast<DataAllocatorHandle>();
+  assert(Store && "Already checked the kind");
+
+  // Success.
+  OnDiskDataAllocator::ImplType Impl{DatabaseFile(std::move(*File)), Store};
+  return OnDiskDataAllocator(std::make_unique<ImplType>(std::move(Impl)));
+}
+
+/// Bump-allocate \p Size bytes and return them together with their offset in
+/// the backing file.
+OnDiskDataAllocator::pointer OnDiskDataAllocator::allocate(size_t Size) {
+  MutableArrayRef<char> Data =
+      Impl->Store.allocate(Impl->File.getAlloc(), Size);
+  FileOffset Offset(Data.data() - Impl->Store.getRegion().data());
+  return pointer(Offset, Data);
+}
+
+/// Pointer to the data at \p Offset. The offset must be non-null and within
+/// the allocated portion of the file (only checked by assertions); callers
+/// are expected to pass offsets previously returned by allocate().
+const char *OnDiskDataAllocator::beginData(FileOffset Offset) const {
+  assert(Offset);
+  assert(Impl);
+  assert(Offset.get() < (int64_t)Impl->File.getAlloc().size());
+  return Impl->File.getRegion().data() + Offset.get();
+}
+
+/// Mutable view of the user header configured at creation time.
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+  return Impl->Store.getUserHeader();
+}
+
+/// Size of the backing database file, in bytes.
+size_t OnDiskDataAllocator::size() const { return Impl->File.size(); }
+
+OnDiskDataAllocator::OnDiskDataAllocator(std::unique_ptr<ImplType> Impl)
+    : Impl(std::move(Impl)) {}
+
+#else // !LLVM_ENABLE_ONDISK_CAS
+
+// Stub implementations used when LLVM_ENABLE_ONDISK_CAS is disabled: the
+// pimpl types are empty and every entry point aborts with a fatal error.
+struct OnDiskHashMappedTrie::ImplType {};
+
+Expected<OnDiskHashMappedTrie>
+OnDiskHashMappedTrie::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+                             size_t NumHashBits, uint64_t DataSize,
+                             uint64_t MaxFileSize,
+                             std::optional<uint64_t> NewFileInitialSize,
+                             std::optional<size_t> NewTableNumRootBits,
+                             std::optional<size_t> NewTableNumSubtrieBits) {
+  report_fatal_error("not supported");
+}
+
+OnDiskHashMappedTrie::pointer
+OnDiskHashMappedTrie::insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
+                                 LazyInsertOnConstructCB OnConstruct,
+                                 LazyInsertOnLeakCB OnLeak) {
+  report_fatal_error("not supported");
+}
+
+OnDiskHashMappedTrie::const_pointer
+OnDiskHashMappedTrie::recoverFromFileOffset(FileOffset Offset) const {
+  report_fatal_error("not supported");
+}
+
+OnDiskHashMappedTrie::const_pointer
+OnDiskHashMappedTrie::find(ArrayRef<uint8_t> Hash) const {
+  report_fatal_error("not supported");
+}
+
+void OnDiskHashMappedTrie::print(
+    raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+  report_fatal_error("not supported");
+}
+
+size_t OnDiskHashMappedTrie::size() const {
+  report_fatal_error("not supported");
+}
+
+struct OnDiskDataAllocator::ImplType {};
+
+Expected<OnDiskDataAllocator> OnDiskDataAllocator::create(
+    const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+    std::optional<uint64_t> NewFileInitialSize, uint32_t UserHeaderSize,
+    function_ref<void(void *)> UserHeaderInit) {
+  report_fatal_error("not supported");
+}
+
+OnDiskDataAllocator::pointer OnDiskDataAllocator::allocate(size_t Size) {
+  report_fatal_error("not supported");
+}
+
+const char *OnDiskDataAllocator::beginData(FileOffset Offset) const {
+  report_fatal_error("not supported");
+}
+
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+  report_fatal_error("not supported");
+}
+
+size_t OnDiskDataAllocator::size() const {
+  report_fatal_error("not supported");
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
+
+// Out-of-line special members: ImplType is only complete in this file, so
+// the pimpl move operations and destructors must be defined here.
+OnDiskHashMappedTrie::OnDiskHashMappedTrie(std::unique_ptr<ImplType> Impl)
+    : Impl(std::move(Impl)) {}
+OnDiskHashMappedTrie::OnDiskHashMappedTrie(OnDiskHashMappedTrie &&RHS) =
+    default;
+OnDiskHashMappedTrie &
+OnDiskHashMappedTrie::operator=(OnDiskHashMappedTrie &&RHS) = default;
+OnDiskHashMappedTrie::~OnDiskHashMappedTrie() = default;
+
+OnDiskDataAllocator::OnDiskDataAllocator(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator &
+OnDiskDataAllocator::operator=(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator::~OnDiskDataAllocator() = default;
diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
new file mode 100644
index 0000000..935e507
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
@@ -0,0 +1,79 @@
+//===- OnDiskKeyValueDB.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+// The key-value store lives in a single file named "<prefix><name>",
+// i.e. "v1.actions", inside the database directory.
+static constexpr StringLiteral ActionCacheFile = "actions";
+static constexpr StringLiteral FilePrefix = "v1.";
+
+/// Store \p Value for \p Key. Values have a fixed size; anything else is an
+/// error. Returns the stored value, which may differ from \p Value if a
+/// concurrent writer inserted the key first.
+Expected<ArrayRef<char>> OnDiskKeyValueDB::put(ArrayRef<uint8_t> Key,
+                                               ArrayRef<char> Value) {
+  if (LLVM_UNLIKELY(Value.size() != ValueSize))
+    return createStringError(errc::invalid_argument,
+                             "expected value size of " + itostr(ValueSize) +
+                                 ", got: " + itostr(Value.size()));
+  // Note: removed a redundant assert here -- the early return above already
+  // guarantees Value.size() == ValueSize.
+  // The copy callback only runs when a new record is constructed; the
+  // TentativeOffset parameter is unused.
+  OnDiskHashMappedTrie::pointer ActionP = Cache.insertLazy(
+      Key, [&](FileOffset TentativeOffset,
+               OnDiskHashMappedTrie::ValueProxy TentativeValue) {
+        assert(TentativeValue.Data.size() == ValueSize);
+        llvm::copy(Value, TentativeValue.Data.data());
+      });
+  return ActionP->Data;
+}
+
+Expected<std::optional<ArrayRef<char>>>
+OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) {
+ // Check the result cache.
+ OnDiskHashMappedTrie::const_pointer ActionP = Cache.find(Key);
+ if (!ActionP)
+ return std::nullopt;
+ assert(isAddrAligned(Align(8), ActionP->Data.data()));
+ return ActionP->Data;
+}
+
+/// Open (or create) a key-value database under directory \p Path, backed by
+/// an OnDiskHashMappedTrie keyed by \p KeySize-byte hashes with fixed-size
+/// values stored inline as record data.
+Expected<std::unique_ptr<OnDiskKeyValueDB>>
+OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize,
+                       StringRef ValueName, size_t ValueSize) {
+  if (std::error_code EC = sys::fs::create_directories(Path))
+    return createFileError(Path, EC);
+
+  SmallString<256> CachePath(Path);
+  sys::path::append(CachePath, FilePrefix + ActionCacheFile);
+  constexpr uint64_t MB = 1024ull * 1024ull;
+  constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+
+  // Default to a 1GB maximum mapping unless an override is configured (see
+  // getOverriddenMaxMappingSize()).
+  uint64_t MaxFileSize = GB;
+  auto CustomSize = getOverriddenMaxMappingSize();
+  if (!CustomSize)
+    return CustomSize.takeError();
+  if (*CustomSize)
+    MaxFileSize = **CustomSize;
+
+  std::optional<OnDiskHashMappedTrie> ActionCache;
+  if (Error E = OnDiskHashMappedTrie::create(
+                    CachePath,
+                    "llvm.actioncache[" + HashName + "->" + ValueName + "]",
+                    KeySize * 8,
+                    /*DataSize=*/ValueSize, MaxFileSize,
+                    /*NewFileInitialSize=*/MB)
+                    .moveInto(ActionCache))
+    return std::move(E);
+
+  return std::unique_ptr<OnDiskKeyValueDB>(
+      new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache)));
+}
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index 503c77cb..b06f4ff 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -9,6 +9,7 @@ add_subdirectory(FileCheck)
add_subdirectory(InterfaceStub)
add_subdirectory(IRPrinter)
add_subdirectory(IRReader)
+add_subdirectory(CAS)
add_subdirectory(CGData)
add_subdirectory(CodeGen)
add_subdirectory(CodeGenTypes)
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 44097ba..9f6f15b 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1223,13 +1223,14 @@ Expected<size_t> readNativeFileSlice(file_t FD, MutableArrayRef<char> Buf,
return NumRead;
}
-std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
+std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout,
+ bool Exclusive) {
auto Start = std::chrono::steady_clock::now();
auto End = Start + Timeout;
do {
struct flock Lock;
memset(&Lock, 0, sizeof(Lock));
- Lock.l_type = F_WRLCK;
+ Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK;
Lock.l_whence = SEEK_SET;
Lock.l_start = 0;
Lock.l_len = 0;
@@ -1238,15 +1239,17 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
int Error = errno;
if (Error != EACCES && Error != EAGAIN)
return std::error_code(Error, std::generic_category());
+ if (Timeout.count() == 0)
+ break;
usleep(1000);
} while (std::chrono::steady_clock::now() < End);
return make_error_code(errc::no_lock_available);
}
-std::error_code lockFile(int FD) {
+std::error_code lockFile(int FD, bool Exclusive) {
struct flock Lock;
memset(&Lock, 0, sizeof(Lock));
- Lock.l_type = F_WRLCK;
+ Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK;
Lock.l_whence = SEEK_SET;
Lock.l_start = 0;
Lock.l_len = 0;
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index c4bd5e2..07ee3d9 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -1327,8 +1327,10 @@ Expected<size_t> readNativeFileSlice(file_t FileHandle,
return readNativeFileImpl(FileHandle, Buf, &Overlapped);
}
-std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
- DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY;
+std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout,
+ bool Exclusive) {
+ DWORD Flags = Exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0;
+ Flags |= LOCKFILE_FAIL_IMMEDIATELY;
OVERLAPPED OV = {};
file_t File = convertFDToNativeFile(FD);
auto Start = std::chrono::steady_clock::now();
@@ -1338,6 +1340,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
return std::error_code();
DWORD Error = ::GetLastError();
if (Error == ERROR_LOCK_VIOLATION) {
+ if (Timeout.count() == 0)
+ break;
::Sleep(1);
continue;
}
@@ -1346,8 +1350,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) {
return mapWindowsError(ERROR_LOCK_VIOLATION);
}
-std::error_code lockFile(int FD) {
- DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK;
+std::error_code lockFile(int FD, bool Exclusive) {
+ DWORD Flags = Exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0;
OVERLAPPED OV = {};
file_t File = convertFDToNativeFile(FD);
if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV))
diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp
new file mode 100644
index 0000000..a74a3e9
--- /dev/null
+++ b/llvm/unittests/CAS/ActionCacheTest.cpp
@@ -0,0 +1,73 @@
+//===- ActionCacheTest.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/ActionCache.h"
+#include "CASTestConfig.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+TEST_P(CASTest, ActionCacheHit) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ActionCache> Cache = createActionCache();
+
+ std::optional<ObjectProxy> ID;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID),
+ Succeeded());
+ std::optional<CASID> ResultID;
+ ASSERT_THAT_ERROR(Cache->put(*ID, *ID), Succeeded());
+ ASSERT_THAT_ERROR(Cache->get(*ID).moveInto(ResultID), Succeeded());
+ ASSERT_TRUE(ResultID);
+ std::optional<ObjectRef> Result = CAS->getReference(*ResultID);
+ ASSERT_TRUE(Result);
+ ASSERT_EQ(*ID, *Result);
+}
+
+TEST_P(CASTest, ActionCacheMiss) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ActionCache> Cache = createActionCache();
+
+ std::optional<ObjectProxy> ID1, ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+ Succeeded());
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID2), Succeeded());
+  // This is a cache miss: looking up a key that doesn't exist.
+ std::optional<CASID> Result1;
+ ASSERT_THAT_ERROR(Cache->get(*ID2).moveInto(Result1), Succeeded());
+ ASSERT_FALSE(Result1);
+
+ ASSERT_THAT_ERROR(Cache->put(*ID2, *ID1), Succeeded());
+ // Cache hit after adding the value.
+ std::optional<CASID> Result2;
+ ASSERT_THAT_ERROR(Cache->get(*ID2).moveInto(Result2), Succeeded());
+ ASSERT_TRUE(Result2);
+ std::optional<ObjectRef> Ref = CAS->getReference(*Result2);
+ ASSERT_TRUE(Ref);
+ ASSERT_EQ(*ID1, *Ref);
+}
+
+TEST_P(CASTest, ActionCacheRewrite) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ std::unique_ptr<ActionCache> Cache = createActionCache();
+
+ std::optional<ObjectProxy> ID1, ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+ Succeeded());
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID1), Succeeded());
+  // Writing a different value to the same key is an error.
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID2), Failed());
+ // Writing the same value multiple times to the same key is fine.
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID1), Succeeded());
+}
diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp
new file mode 100644
index 0000000..1cb7b43
--- /dev/null
+++ b/llvm/unittests/CAS/CASTestConfig.cpp
@@ -0,0 +1,23 @@
+//===- CASTestConfig.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CASTestConfig.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+CASTestingEnv createInMemory(int I) {
+ std::unique_ptr<ObjectStore> CAS = createInMemoryCAS();
+ std::unique_ptr<ActionCache> Cache = createInMemoryActionCache();
+ return CASTestingEnv{std::move(CAS), std::move(Cache)};
+}
+
+INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest,
+ ::testing::Values(createInMemory));
diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
new file mode 100644
index 0000000..8093a0b
--- /dev/null
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -0,0 +1,38 @@
+//===- CASTestConfig.h ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "gtest/gtest.h"
+
+#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H
+#define LLVM_UNITTESTS_CASTESTCONFIG_H
+
+struct CASTestingEnv {
+ std::unique_ptr<llvm::cas::ObjectStore> CAS;
+ std::unique_ptr<llvm::cas::ActionCache> Cache;
+};
+
+class CASTest
+ : public testing::TestWithParam<std::function<CASTestingEnv(int)>> {
+protected:
+ std::optional<int> NextCASIndex;
+
+ std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() {
+ auto TD = GetParam()(++(*NextCASIndex));
+ return std::move(TD.CAS);
+ }
+ std::unique_ptr<llvm::cas::ActionCache> createActionCache() {
+ auto TD = GetParam()(++(*NextCASIndex));
+ return std::move(TD.Cache);
+ }
+ void SetUp() { NextCASIndex = 0; }
+ void TearDown() { NextCASIndex = std::nullopt; }
+};
+
+#endif
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
new file mode 100644
index 0000000..f839c70
--- /dev/null
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -0,0 +1,21 @@
+if (LLVM_ENABLE_ONDISK_CAS)
+ add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1)
+endif()
+
+set(LLVM_LINK_COMPONENTS
+ Support
+ CAS
+ TestingSupport
+ )
+
+add_llvm_unittest(CASTests
+ ActionCacheTest.cpp
+ CASTestConfig.cpp
+ ObjectStoreTest.cpp
+ OnDiskGraphDBTest.cpp
+ OnDiskHashMappedTrieTest.cpp
+ OnDiskKeyValueDBTest.cpp
+ ProgramTest.cpp
+ )
+
+target_link_libraries(CASTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
new file mode 100644
index 0000000..0d94731
--- /dev/null
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -0,0 +1,360 @@
+//===- ObjectStoreTest.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+#include "CASTestConfig.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+TEST_P(CASTest, PrintIDs) {
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
+
+ std::optional<CASID> ID1, ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+ Succeeded());
+ EXPECT_NE(ID1, ID2);
+ std::string PrintedID1 = ID1->toString();
+ std::string PrintedID2 = ID2->toString();
+ EXPECT_NE(PrintedID1, PrintedID2);
+
+ std::optional<CASID> ParsedID1, ParsedID2;
+ ASSERT_THAT_ERROR(CAS->parseID(PrintedID1).moveInto(ParsedID1), Succeeded());
+ ASSERT_THAT_ERROR(CAS->parseID(PrintedID2).moveInto(ParsedID2), Succeeded());
+ EXPECT_EQ(ID1, ParsedID1);
+ EXPECT_EQ(ID2, ParsedID2);
+}
+
+TEST_P(CASTest, Blobs) {
+ std::unique_ptr<ObjectStore> CAS1 = createObjectStore();
+ StringRef ContentStrings[] = {
+ "word",
+ "some longer text std::string's local memory",
+ R"(multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text)",
+ };
+
+ SmallVector<CASID> IDs;
+ for (StringRef Content : ContentStrings) {
+    // The CAS must copy the input content rather than retaining references to
+    // the caller's memory; re-creating and re-reading the blobs below would
+    // expose a CAS that stored references to the input instead of copying it.
+ std::optional<ObjectProxy> Blob;
+ ASSERT_THAT_ERROR(CAS1->createProxy(std::nullopt, Content).moveInto(Blob),
+ Succeeded());
+ IDs.push_back(Blob->getID());
+
+ // Check basic printing of IDs.
+ EXPECT_EQ(IDs.back().toString(), IDs.back().toString());
+ if (IDs.size() > 2)
+ EXPECT_NE(IDs.front().toString(), IDs.back().toString());
+ }
+
+ // Check that the blobs give the same IDs later.
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ std::optional<ObjectProxy> Blob;
+ ASSERT_THAT_ERROR(
+ CAS1->createProxy(std::nullopt, ContentStrings[I]).moveInto(Blob),
+ Succeeded());
+ EXPECT_EQ(IDs[I], Blob->getID());
+ }
+
+ // Run validation on all CASIDs.
+ for (int I = 0, E = IDs.size(); I != E; ++I)
+ ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded());
+
+ // Check that the blobs can be retrieved multiple times.
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ for (int J = 0, JE = 3; J != JE; ++J) {
+ std::optional<ObjectProxy> Buffer;
+ ASSERT_THAT_ERROR(CAS1->getProxy(IDs[I]).moveInto(Buffer), Succeeded());
+ EXPECT_EQ(ContentStrings[I], Buffer->getData());
+ }
+ }
+
+ // Confirm these blobs don't exist in a fresh CAS instance.
+ std::unique_ptr<ObjectStore> CAS2 = createObjectStore();
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ std::optional<ObjectProxy> Proxy;
+ EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Proxy), Failed());
+ }
+
+ // Insert into the second CAS and confirm the IDs are stable. Getting them
+ // should work now.
+ for (int I = IDs.size(), E = 0; I != E; --I) {
+ auto &ID = IDs[I - 1];
+ auto &Content = ContentStrings[I - 1];
+ std::optional<ObjectProxy> Blob;
+ ASSERT_THAT_ERROR(CAS2->createProxy(std::nullopt, Content).moveInto(Blob),
+ Succeeded());
+ EXPECT_EQ(ID, Blob->getID());
+
+ std::optional<ObjectProxy> Buffer;
+ ASSERT_THAT_ERROR(CAS2->getProxy(ID).moveInto(Buffer), Succeeded());
+ EXPECT_EQ(Content, Buffer->getData());
+ }
+}
+
+TEST_P(CASTest, BlobsBig) {
+ // A little bit of validation that bigger blobs are okay. Climb up to 1MB.
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
+ SmallString<256> String1 = StringRef("a few words");
+ SmallString<256> String2 = StringRef("others");
+ while (String1.size() < 1024U * 1024U) {
+ std::optional<CASID> ID1;
+ std::optional<CASID> ID2;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String1).moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String1).moveInto(ID2),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded());
+ ASSERT_EQ(ID1, ID2);
+
+ String1.append(String2);
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String2).moveInto(ID1),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String2).moveInto(ID2),
+ Succeeded());
+ ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded());
+ ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded());
+ ASSERT_EQ(ID1, ID2);
+ String2.append(String1);
+ }
+
+ // Specifically check near 1MB for objects large enough they're likely to be
+ // stored externally in an on-disk CAS and will be near a page boundary.
+ SmallString<0> Storage;
+ const size_t InterestingSize = 1024U * 1024ULL;
+ const size_t SizeE = InterestingSize + 2;
+ if (Storage.size() < SizeE)
+ Storage.resize(SizeE, '\01');
+ for (size_t Size = InterestingSize - 2; Size != SizeE; ++Size) {
+ StringRef Data(Storage.data(), Size);
+ std::optional<ObjectProxy> Blob;
+ ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, Data).moveInto(Blob),
+ Succeeded());
+ ASSERT_EQ(Data, Blob->getData());
+ ASSERT_EQ(0, Blob->getData().end()[0]);
+ }
+}
+
+TEST_P(CASTest, LeafNodes) {
+ std::unique_ptr<ObjectStore> CAS1 = createObjectStore();
+ StringRef ContentStrings[] = {
+ "word",
+ "some longer text std::string's local memory",
+ R"(multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text
+multiline text multiline text multiline text multiline text multiline text)",
+ };
+
+ SmallVector<ObjectRef> Nodes;
+ SmallVector<CASID> IDs;
+ for (StringRef Content : ContentStrings) {
+    // The CAS must copy the input content rather than retaining references to
+    // the caller's memory; re-creating and re-reading the nodes below would
+    // expose a CAS that stored references to the input instead of copying it.
+ std::optional<ObjectRef> Node;
+ ASSERT_THAT_ERROR(
+ CAS1->store(std::nullopt, arrayRefFromStringRef<char>(Content))
+ .moveInto(Node),
+ Succeeded());
+ Nodes.push_back(*Node);
+
+ // Check basic printing of IDs.
+ IDs.push_back(CAS1->getID(*Node));
+ EXPECT_EQ(IDs.back().toString(), IDs.back().toString());
+ EXPECT_EQ(Nodes.front(), Nodes.front());
+ EXPECT_EQ(Nodes.back(), Nodes.back());
+ EXPECT_EQ(IDs.front(), IDs.front());
+ EXPECT_EQ(IDs.back(), IDs.back());
+ if (Nodes.size() <= 1)
+ continue;
+ EXPECT_NE(Nodes.front(), Nodes.back());
+ EXPECT_NE(IDs.front(), IDs.back());
+ }
+
+ // Check that the blobs give the same IDs later.
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ std::optional<ObjectRef> Node;
+ ASSERT_THAT_ERROR(CAS1->store(std::nullopt, arrayRefFromStringRef<char>(
+ ContentStrings[I]))
+ .moveInto(Node),
+ Succeeded());
+ EXPECT_EQ(IDs[I], CAS1->getID(*Node));
+ }
+
+ // Check that the blobs can be retrieved multiple times.
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ for (int J = 0, JE = 3; J != JE; ++J) {
+ std::optional<ObjectProxy> Object;
+ ASSERT_THAT_ERROR(CAS1->getProxy(IDs[I]).moveInto(Object), Succeeded());
+ ASSERT_TRUE(Object);
+ EXPECT_EQ(ContentStrings[I], Object->getData());
+ }
+ }
+
+ // Confirm these blobs don't exist in a fresh CAS instance.
+ std::unique_ptr<ObjectStore> CAS2 = createObjectStore();
+ for (int I = 0, E = IDs.size(); I != E; ++I) {
+ std::optional<ObjectProxy> Object;
+ EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Object), Failed());
+ }
+
+ // Insert into the second CAS and confirm the IDs are stable. Getting them
+ // should work now.
+ for (int I = IDs.size(), E = 0; I != E; --I) {
+ auto &ID = IDs[I - 1];
+ auto &Content = ContentStrings[I - 1];
+ std::optional<ObjectRef> Node;
+ ASSERT_THAT_ERROR(
+ CAS2->store(std::nullopt, arrayRefFromStringRef<char>(Content))
+ .moveInto(Node),
+ Succeeded());
+ EXPECT_EQ(ID, CAS2->getID(*Node));
+
+ std::optional<ObjectProxy> Object;
+ ASSERT_THAT_ERROR(CAS2->getProxy(ID).moveInto(Object), Succeeded());
+ ASSERT_TRUE(Object);
+ EXPECT_EQ(Content, Object->getData());
+ }
+}
+
+TEST_P(CASTest, NodesBig) {
+ std::unique_ptr<ObjectStore> CAS = createObjectStore();
+
+ // Specifically check near 1MB for objects large enough they're likely to be
+ // stored externally in an on-disk CAS, and such that one of them will be
+ // near a page boundary.
+ SmallString<0> Storage;
+ constexpr size_t InterestingSize = 1024U * 1024ULL;
+ constexpr size_t WordSize = sizeof(void *);
+
+ // Start much smaller to account for headers.
+ constexpr size_t SizeB = InterestingSize - 8 * WordSize;
+ constexpr size_t SizeE = InterestingSize + 1;
+ if (Storage.size() < SizeE)
+ Storage.resize(SizeE, '\01');
+
+ SmallVector<ObjectRef, 4> CreatedNodes;
+  // Avoid checking every size because this is an expensive test. Just check
+  // data that is 8B-word-aligned, and one byte less. We also append the
+  // created nodes as references in the next blob to check that references are
+  // created correctly.
+ for (size_t Size = SizeB; Size < SizeE; Size += WordSize) {
+ for (bool IsAligned : {false, true}) {
+ StringRef Data(Storage.data(), Size - (IsAligned ? 0 : 1));
+ std::optional<ObjectProxy> Node;
+ ASSERT_THAT_ERROR(CAS->createProxy(CreatedNodes, Data).moveInto(Node),
+ Succeeded());
+ ASSERT_EQ(Data, Node->getData());
+ ASSERT_EQ(0, Node->getData().end()[0]);
+ ASSERT_EQ(Node->getNumReferences(), CreatedNodes.size());
+ CreatedNodes.emplace_back(Node->getRef());
+ }
+ }
+
+ for (auto ID : CreatedNodes)
+ ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded());
+}
+
+/// Common test functionality for creating blobs in parallel. You can vary which
+/// cas instances are the same or different, and the size of the created blobs.
+static void testBlobsParallel(ObjectStore &Read1, ObjectStore &Read2,
+ ObjectStore &Write1, ObjectStore &Write2,
+ uint64_t BlobSize) {
+ SCOPED_TRACE(testBlobsParallel);
+ unsigned BlobCount = 100;
+ std::vector<std::string> Blobs;
+ Blobs.reserve(BlobCount);
+ for (unsigned I = 0; I < BlobCount; ++I) {
+ std::string Blob;
+ Blob.reserve(BlobSize);
+ while (Blob.size() < BlobSize) {
+ auto R = sys::Process::GetRandomNumber();
+ Blob.append((char *)&R, sizeof(R));
+ }
+ assert(Blob.size() >= BlobSize);
+ Blob.resize(BlobSize);
+ Blobs.push_back(std::move(Blob));
+ }
+
+ std::mutex NodesMtx;
+ std::vector<std::optional<CASID>> CreatedNodes(BlobCount);
+
+ auto Producer = [&](unsigned I, ObjectStore *CAS) {
+ std::optional<ObjectProxy> Node;
+ EXPECT_THAT_ERROR(CAS->createProxy({}, Blobs[I]).moveInto(Node),
+ Succeeded());
+ {
+ std::lock_guard<std::mutex> L(NodesMtx);
+ CreatedNodes[I] = Node ? Node->getID() : CASID::getDenseMapTombstoneKey();
+ }
+ };
+
+ auto Consumer = [&](unsigned I, ObjectStore *CAS) {
+ std::optional<CASID> ID;
+ while (!ID) {
+ // Busy wait.
+ std::lock_guard<std::mutex> L(NodesMtx);
+ ID = CreatedNodes[I];
+ }
+ if (ID == CASID::getDenseMapTombstoneKey())
+ // Producer failed; already reported.
+ return;
+
+ std::optional<ObjectProxy> Node;
+ ASSERT_THAT_ERROR(CAS->getProxy(*ID).moveInto(Node), Succeeded());
+ EXPECT_EQ(Node->getData(), Blobs[I]);
+ };
+
+ DefaultThreadPool Threads;
+ for (unsigned I = 0; I < BlobCount; ++I) {
+ Threads.async(Consumer, I, &Read1);
+ Threads.async(Consumer, I, &Read2);
+ Threads.async(Producer, I, &Write1);
+ Threads.async(Producer, I, &Write2);
+ }
+
+ Threads.wait();
+}
+
+static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) {
+ SCOPED_TRACE(testBlobsParallel1);
+ testBlobsParallel(CAS, CAS, CAS, CAS, BlobSize);
+}
+
+TEST_P(CASTest, BlobsParallel) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ uint64_t Size = 1ULL * 1024;
+ ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
+}
+
+#ifdef EXPENSIVE_CHECKS
+TEST_P(CASTest, BlobsBigParallel) {
+ std::shared_ptr<ObjectStore> CAS = createObjectStore();
+ // 100k is large enough to be standalone files in our on-disk cas.
+ uint64_t Size = 100ULL * 1024;
+ ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
+}
+#endif
diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h
new file mode 100644
index 0000000..3978c6b0
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskCommonUtils.h
@@ -0,0 +1,69 @@
+//===- llvm/unittest/CAS/OnDiskCommonUtils.h --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/Support/BLAKE3.h"
+
+namespace llvm::unittest::cas {
+
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+using HasherT = BLAKE3;
+using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
+using ValueType = std::array<char, 20>;
+
+inline HashType digest(StringRef Data, ArrayRef<ArrayRef<uint8_t>> RefHashes) {
+ return BuiltinObjectHasher<HasherT>::hashObject(
+ RefHashes, arrayRefFromStringRef<char>(Data));
+}
+
+inline ObjectID digest(OnDiskGraphDB &DB, StringRef Data,
+ ArrayRef<ObjectID> Refs) {
+ SmallVector<ArrayRef<uint8_t>, 8> RefHashes;
+ for (ObjectID Ref : Refs)
+ RefHashes.push_back(DB.getDigest(Ref));
+ HashType Digest = digest(Data, RefHashes);
+ return DB.getReference(Digest);
+}
+
+inline HashType digest(StringRef Data) {
+ return HasherT::hash(arrayRefFromStringRef(Data));
+}
+
+inline ValueType valueFromString(StringRef S) {
+ ValueType Val;
+ llvm::copy(S.substr(0, sizeof(Val)), Val.data());
+ return Val;
+}
+
+inline Expected<ObjectID> store(OnDiskGraphDB &DB, StringRef Data,
+ ArrayRef<ObjectID> Refs) {
+ ObjectID ID = digest(DB, Data, Refs);
+ if (Error E = DB.store(ID, Refs, arrayRefFromStringRef<char>(Data)))
+ return std::move(E);
+ return ID;
+}
+
+inline Error printTree(OnDiskGraphDB &DB, ObjectID ID, raw_ostream &OS,
+ unsigned Indent = 0) {
+ std::optional<ondisk::ObjectHandle> Obj;
+ if (Error E = DB.load(ID).moveInto(Obj))
+ return E;
+ if (!Obj)
+ return Error::success();
+ OS.indent(Indent) << toStringRef(DB.getObjectData(*Obj)) << '\n';
+ for (ObjectID Ref : DB.getObjectRefs(*Obj)) {
+ if (Error E = printTree(DB, Ref, OS, Indent + 2))
+ return E;
+ }
+ return Error::success();
+}
+
+} // namespace llvm::unittest::cas
diff --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
new file mode 100644
index 0000000..57ea061
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
@@ -0,0 +1,284 @@
+//===- llvm/unittest/CAS/OnDiskGraphDBTest.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+
+TEST(OnDiskGraphDBTest, Basic) {
+ unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> DB;
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB),
+ Succeeded());
+
+ auto digest = [&DB](StringRef Data, ArrayRef<ObjectID> Refs) -> ObjectID {
+ return ::digest(*DB, Data, Refs);
+ };
+
+ auto store = [&](StringRef Data,
+ ArrayRef<ObjectID> Refs) -> Expected<ObjectID> {
+ return ::store(*DB, Data, Refs);
+ };
+
+ std::optional<ObjectID> ID1;
+ ASSERT_THAT_ERROR(store("hello", {}).moveInto(ID1), Succeeded());
+
+ std::optional<ondisk::ObjectHandle> Obj1;
+ ASSERT_THAT_ERROR(DB->load(*ID1).moveInto(Obj1), Succeeded());
+ ASSERT_TRUE(Obj1.has_value());
+ EXPECT_EQ(toStringRef(DB->getObjectData(*Obj1)), "hello");
+
+ ArrayRef<uint8_t> Digest1 = DB->getDigest(*ID1);
+ ObjectID ID2 = DB->getReference(Digest1);
+ EXPECT_EQ(ID1, ID2);
+
+ ObjectID ID3 = digest("world", {});
+ EXPECT_FALSE(DB->containsObject(ID3));
+ std::optional<ondisk::ObjectHandle> Obj2;
+ ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj2), Succeeded());
+ EXPECT_FALSE(Obj2.has_value());
+
+ ASSERT_THAT_ERROR(DB->store(ID3, {}, arrayRefFromStringRef<char>("world")),
+ Succeeded());
+ EXPECT_TRUE(DB->containsObject(ID3));
+ ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj2), Succeeded());
+ ASSERT_TRUE(Obj2.has_value());
+ EXPECT_EQ(toStringRef(DB->getObjectData(*Obj2)), "world");
+
+ size_t LargeDataSize = 256LL * 1024LL; // 256K.
+ // The precise size number is not important, we mainly check that the large
+ // object will be properly accounted for.
+ EXPECT_TRUE(DB->getStorageSize() > 10 &&
+ DB->getStorageSize() < LargeDataSize);
+
+ SmallString<16> Buffer;
+ Buffer.resize(LargeDataSize);
+ ASSERT_THAT_ERROR(store(Buffer, {}).moveInto(ID1), Succeeded());
+ size_t StorageSize = DB->getStorageSize();
+ EXPECT_TRUE(StorageSize > LargeDataSize);
+
+ // Close & re-open the DB and check that it reports the same storage size.
+ DB.reset();
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB),
+ Succeeded());
+ EXPECT_EQ(DB->getStorageSize(), StorageSize);
+}
+
+TEST(OnDiskGraphDBTest, FaultInSingleNode) {
+ unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+ .moveInto(UpstreamDB),
+ Succeeded());
+ {
+ std::optional<ObjectID> ID1;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "hello", {}).moveInto(ID1),
+ Succeeded());
+ std::optional<ObjectID> ID2;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "another", {}).moveInto(ID2),
+ Succeeded());
+ std::optional<ObjectID> ID3;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "world", {*ID1, *ID2}).moveInto(ID3),
+ Succeeded());
+ }
+
+ unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> DB;
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+ std::move(UpstreamDB),
+ OnDiskGraphDB::FaultInPolicy::SingleNode)
+ .moveInto(DB),
+ Succeeded());
+
+ ObjectID ID1 = digest(*DB, "hello", {});
+ ObjectID ID2 = digest(*DB, "another", {});
+ ObjectID ID3 = digest(*DB, "world", {ID1, ID2});
+ ObjectID ID4 = digest(*DB, "world", {});
+
+ EXPECT_TRUE(DB->containsObject(ID1));
+ EXPECT_TRUE(DB->containsObject(ID2));
+ EXPECT_TRUE(DB->containsObject(ID3));
+ EXPECT_FALSE(DB->containsObject(ID4));
+
+ EXPECT_TRUE(DB->getExistingReference(digest("hello", {})).has_value());
+ EXPECT_TRUE(DB->getExistingReference(DB->getDigest(ID3)).has_value());
+ EXPECT_FALSE(DB->getExistingReference(digest("world", {})).has_value());
+
+ {
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded());
+ ASSERT_TRUE(Obj.has_value());
+ EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "hello");
+ auto Refs = DB->getObjectRefs(*Obj);
+ EXPECT_TRUE(Refs.empty());
+ }
+ {
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj), Succeeded());
+ ASSERT_TRUE(Obj.has_value());
+ EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "world");
+ auto Refs = DB->getObjectRefs(*Obj);
+ ASSERT_EQ(std::distance(Refs.begin(), Refs.end()), 2);
+ EXPECT_EQ(Refs.begin()[0], ID1);
+ EXPECT_EQ(Refs.begin()[1], ID2);
+ }
+ {
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB->load(ID4).moveInto(Obj), Succeeded());
+ EXPECT_FALSE(Obj.has_value());
+ }
+
+ // Re-open the primary without chaining, to verify the data were copied from
+ // the upstream.
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+ /*UpstreamDB=*/nullptr,
+ OnDiskGraphDB::FaultInPolicy::SingleNode)
+ .moveInto(DB),
+ Succeeded());
+ ID1 = digest(*DB, "hello", {});
+ ID2 = digest(*DB, "another", {});
+ ID3 = digest(*DB, "world", {ID1, ID2});
+ EXPECT_TRUE(DB->containsObject(ID1));
+ EXPECT_FALSE(DB->containsObject(ID2));
+ EXPECT_TRUE(DB->containsObject(ID3));
+ {
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded());
+ ASSERT_TRUE(Obj.has_value());
+ EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "hello");
+ auto Refs = DB->getObjectRefs(*Obj);
+ EXPECT_TRUE(Refs.empty());
+ }
+}
+
+TEST(OnDiskGraphDBTest, FaultInFullTree) {
+ unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+ .moveInto(UpstreamDB),
+ Succeeded());
+ HashType RootHash;
+ {
+ std::optional<ObjectID> ID11;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "11", {}).moveInto(ID11), Succeeded());
+ std::optional<ObjectID> ID121;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "121", {}).moveInto(ID121),
+ Succeeded());
+ std::optional<ObjectID> ID12;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "12", {*ID121}).moveInto(ID12),
+ Succeeded());
+ std::optional<ObjectID> ID1;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "1", {*ID11, *ID12}).moveInto(ID1),
+ Succeeded());
+ std::optional<ObjectID> ID21;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "21", {}).moveInto(ID21), Succeeded());
+ std::optional<ObjectID> ID22;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "22", {}).moveInto(ID22), Succeeded());
+ std::optional<ObjectID> ID2;
+ ASSERT_THAT_ERROR(
+ store(*UpstreamDB, "2", {*ID12, *ID21, *ID22}).moveInto(ID2),
+ Succeeded());
+ std::optional<ObjectID> IDRoot;
+ ASSERT_THAT_ERROR(store(*UpstreamDB, "root", {*ID1, *ID2}).moveInto(IDRoot),
+ Succeeded());
+ ArrayRef<uint8_t> Digest = UpstreamDB->getDigest(*IDRoot);
+ ASSERT_EQ(Digest.size(), RootHash.size());
+ llvm::copy(Digest, RootHash.data());
+ }
+
+ unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> DB;
+ ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+ std::move(UpstreamDB),
+ OnDiskGraphDB::FaultInPolicy::FullTree)
+ .moveInto(DB),
+ Succeeded());
+
+ {
+ ObjectID IDRoot = DB->getReference(RootHash);
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB->load(IDRoot).moveInto(Obj), Succeeded());
+ ASSERT_TRUE(Obj.has_value());
+ EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "root");
+ auto Refs = DB->getObjectRefs(*Obj);
+ ASSERT_EQ(std::distance(Refs.begin(), Refs.end()), 2);
+ }
+
+ // Re-open the primary without chaining, to verify the data were copied from
+ // the upstream.
+ ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+ /*UpstreamDB=*/nullptr,
+ OnDiskGraphDB::FaultInPolicy::FullTree)
+ .moveInto(DB),
+ Succeeded());
+
+ ObjectID IDRoot = DB->getReference(RootHash);
+ std::string PrintedTree;
+ raw_string_ostream OS(PrintedTree);
+ ASSERT_THAT_ERROR(printTree(*DB, IDRoot, OS), Succeeded());
+ StringRef Expected = R"(root
+ 1
+ 11
+ 12
+ 121
+ 2
+ 12
+ 121
+ 21
+ 22
+)";
+ EXPECT_EQ(PrintedTree, Expected);
+}
+
+TEST(OnDiskGraphDBTest, FaultInPolicyConflict) {
+ auto tryFaultInPolicyConflict = [](OnDiskGraphDB::FaultInPolicy Policy1,
+ OnDiskGraphDB::FaultInPolicy Policy2) {
+ unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+ .moveInto(UpstreamDB),
+ Succeeded());
+
+ unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> DB;
+ ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3",
+ sizeof(HashType),
+ std::move(UpstreamDB), Policy1)
+ .moveInto(DB),
+ Succeeded());
+ DB.reset();
+ ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3",
+ sizeof(HashType),
+ std::move(UpstreamDB), Policy2)
+ .moveInto(DB),
+ Failed());
+ };
+ // Open as 'single', then as 'full'.
+ tryFaultInPolicyConflict(OnDiskGraphDB::FaultInPolicy::SingleNode,
+ OnDiskGraphDB::FaultInPolicy::FullTree);
+ // Open as 'full', then as 'single'.
+ tryFaultInPolicyConflict(OnDiskGraphDB::FaultInPolicy::FullTree,
+ OnDiskGraphDB::FaultInPolicy::SingleNode);
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
diff --git a/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp b/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp
new file mode 100644
index 0000000..b8e5bf6
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp
@@ -0,0 +1,146 @@
+//===- OnDiskHashMappedTrieTest.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskHashMappedTrie.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+using namespace llvm;
+using namespace llvm::cas;
+
+namespace {
+
+// Insert/find coverage for OnDiskHashMappedTrie across several hash widths:
+// lookups on an empty trie, insertion, lookup through a second instance of
+// the same on-disk file, in-place mutation of stored data, and recovery of a
+// record from its file offset.
+TEST(OnDiskHashMappedTrieTest, Insertion) {
+  unittest::TempDir Temp("on-disk-hash-mapped-trie", /*Unique=*/true);
+
+  // Create tries with various sizes of hash and with data.
+  //
+  // NOTE: The check related to \a recoverFromFileOffset() catches a potential
+  // off-by-one bounds-checking bug when the trie record size (data + hash) add
+  // up to a multiple of 8B. Iterate through a few different hash sizes to
+  // check it both ways.
+  constexpr size_t MB = 1024u * 1024u;
+  constexpr size_t DataSize = 8; // Multiple of 8B.
+  for (size_t NumHashBytes : {1, 2, 4, 8}) {
+    size_t NumHashBits = NumHashBytes * 8;
+
+    // Each hash width gets its own backing file ("1B", "2B", ...).
+    auto createTrie = [&]() {
+      return OnDiskHashMappedTrie::create(
+          Temp.path((Twine(NumHashBytes) + "B").str()), "index",
+          /*NumHashBits=*/NumHashBits, DataSize, /*MaxFileSize=*/MB,
+          /*NewInitialFileSize=*/std::nullopt);
+    };
+
+    // Two handles onto the same on-disk trie, to verify that an insertion
+    // through one instance is visible through the other.
+    std::optional<OnDiskHashMappedTrie> Trie1;
+    ASSERT_THAT_ERROR(createTrie().moveInto(Trie1), Succeeded());
+    std::optional<OnDiskHashMappedTrie> Trie2;
+    ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded());
+
+    uint8_t Hash0Bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    uint8_t Hash1Bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0};
+    auto Hash0 = ArrayRef(Hash0Bytes).take_front(NumHashBytes);
+    auto Hash1 = ArrayRef(Hash1Bytes).take_front(NumHashBytes);
+    constexpr StringLiteral Data0v1Bytes = "data0.v1";
+    constexpr StringLiteral Data0v2Bytes = "data0.v2";
+    constexpr StringLiteral Data1Bytes = "data1...";
+    static_assert(Data0v1Bytes.size() == DataSize, "math error");
+    static_assert(Data0v2Bytes.size() == DataSize, "math error");
+    static_assert(Data1Bytes.size() == DataSize, "math error");
+    ArrayRef<char> Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size());
+    ArrayRef<char> Data0v2 = ArrayRef(Data0v2Bytes.data(), Data0v2Bytes.size());
+    ArrayRef<char> Data1 = ArrayRef(Data1Bytes.data(), Data1Bytes.size());
+
+    // Lookup when trie is empty.
+    EXPECT_FALSE(Trie1->find(Hash0));
+
+    // Insert. Keep the returned offset and the mutable view of the stored
+    // data for the later mutation and recovery checks.
+    std::optional<FileOffset> Offset;
+    std::optional<MutableArrayRef<char>> Data;
+    {
+      auto Insertion = Trie1->insert({Hash0, Data0v1});
+      ASSERT_TRUE(Insertion);
+      EXPECT_EQ(Hash0, Insertion->Hash);
+      EXPECT_EQ(Data0v1, Insertion->Data);
+      EXPECT_TRUE(isAddrAligned(Align(8), Insertion->Data.data()));
+
+      Offset = Insertion.getOffset();
+      Data = Insertion->Data;
+    }
+
+    // Find.
+    {
+      auto Lookup = Trie1->find(Hash0);
+      ASSERT_TRUE(Lookup);
+      EXPECT_EQ(Hash0, Lookup->Hash);
+      EXPECT_EQ(Data0v1, Lookup->Data);
+      EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+    }
+
+    // Find in a different instance of the same on-disk trie that existed
+    // before the insertion.
+    {
+      auto Lookup = Trie2->find(Hash0);
+      ASSERT_TRUE(Lookup);
+      EXPECT_EQ(Hash0, Lookup->Hash);
+      EXPECT_EQ(Data0v1, Lookup->Data);
+      EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+    }
+
+    // Create a new instance and check that too.
+    Trie2.reset();
+    ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded());
+    {
+      auto Lookup = Trie2->find(Hash0);
+      ASSERT_TRUE(Lookup);
+      EXPECT_EQ(Hash0, Lookup->Hash);
+      EXPECT_EQ(Data0v1, Lookup->Data);
+      EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+    }
+
+    // Change the data. Writing through the MutableArrayRef obtained at
+    // insertion is expected to be visible through Trie2 as well (both
+    // instances presumably map the same file — confirm).
+    llvm::copy(Data0v2, Data->data());
+    {
+      auto Lookup = Trie2->find(Hash0);
+      ASSERT_TRUE(Lookup);
+      EXPECT_EQ(Hash0, Lookup->Hash);
+      EXPECT_EQ(Data0v2, Lookup->Data);
+      EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+    }
+
+    // Find different hash.
+    EXPECT_FALSE(Trie1->find(Hash1));
+    EXPECT_FALSE(Trie2->find(Hash1));
+
+    // Recover from an offset: must yield the same record, including the
+    // mutated data.
+    {
+      auto Recovered = Trie1->recoverFromFileOffset(*Offset);
+      ASSERT_TRUE(Recovered);
+      EXPECT_EQ(Offset->get(), Recovered.getOffset().get());
+      EXPECT_EQ(Hash0, Recovered->Hash);
+      EXPECT_EQ(Data0v2, Recovered->Data);
+    }
+
+    // Insert another thing; it must land at a different offset.
+    {
+      auto Insertion = Trie1->insert({Hash1, Data1});
+      ASSERT_TRUE(Insertion);
+      EXPECT_EQ(Hash1, Insertion->Hash);
+      EXPECT_EQ(Data1, Insertion->Data);
+      EXPECT_TRUE(isAddrAligned(Align(8), Insertion->Data.data()));
+      EXPECT_NE(Offset->get(), Insertion.getOffset().get());
+    }
+  }
+}
+
+} // namespace
+
+#endif
diff --git a/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
new file mode 100644
index 0000000..3edc5e7
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
@@ -0,0 +1,54 @@
+//===- llvm/unittest/CAS/OnDiskKeyValueDBTest.cpp -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+
+// Basic round-trip coverage for OnDiskKeyValueDB: a miss on an empty
+// database, first-write-wins semantics for put(), and a subsequent hit.
+TEST(OnDiskKeyValueDBTest, Basic) {
+  unittest::TempDir Temp("ondiskkv", /*Unique=*/true);
+
+  // Open a fresh database keyed by blake3-sized hashes with fixed-size values.
+  std::unique_ptr<OnDiskKeyValueDB> DB;
+  ASSERT_THAT_ERROR(OnDiskKeyValueDB::open(Temp.path(), "blake3",
+                                           sizeof(HashType), "test",
+                                           sizeof(ValueType))
+                        .moveInto(DB),
+                    Succeeded());
+
+  // A lookup before any insertion reports "no value" without error.
+  {
+    std::optional<ArrayRef<char>> Missing;
+    ASSERT_THAT_ERROR(DB->get(digest("hello")).moveInto(Missing), Succeeded());
+    EXPECT_FALSE(Missing.has_value());
+  }
+
+  // The first put() stores the value; a second put() under the same key is a
+  // no-op and still yields the originally-stored bytes.
+  ValueType WorldVal = valueFromString("world");
+  ArrayRef<char> Stored;
+  ASSERT_THAT_ERROR(DB->put(digest("hello"), WorldVal).moveInto(Stored),
+                    Succeeded());
+  EXPECT_EQ(Stored, ArrayRef(WorldVal));
+  ASSERT_THAT_ERROR(
+      DB->put(digest("hello"), valueFromString("other")).moveInto(Stored),
+      Succeeded());
+  EXPECT_EQ(Stored, ArrayRef(WorldVal));
+
+  // The stored value is observable through get().
+  {
+    std::optional<ArrayRef<char>> Found;
+    ASSERT_THAT_ERROR(DB->get(digest("hello")).moveInto(Found), Succeeded());
+    EXPECT_TRUE(Found.has_value());
+    EXPECT_EQ(*Found, ArrayRef(WorldVal));
+  }
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
diff --git a/llvm/unittests/CAS/ProgramTest.cpp b/llvm/unittests/CAS/ProgramTest.cpp
new file mode 100644
index 0000000..95cc359
--- /dev/null
+++ b/llvm/unittests/CAS/ProgramTest.cpp
@@ -0,0 +1,151 @@
+//===- ProgramTest.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Program.h"
+#include "llvm/CAS/MappedFileRegionBumpPtr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/ThreadPool.h"
+#include "gtest/gtest.h"
+#if defined(__APPLE__)
+#include <crt_externs.h>
+#elif !defined(_MSC_VER)
+// Forward declare environ in case it's not provided by stdlib.h.
+extern char **environ;
+#endif
+
+using namespace llvm;
+using namespace llvm::cas;
+
+extern const char *TestMainArgv0;
+static char ProgramID = 0;
+
+// Test fixture that snapshots the current process environment (dropping
+// GTEST_* variables so a re-executed child binary does not inherit gtest
+// state) and lets a test append extra variables before spawning a child.
+class CASProgramTest : public testing::Test {
+  std::vector<StringRef> EnvTable;     // Environment handed to the child.
+  std::vector<std::string> EnvStorage; // Owns UTF-8 conversions on Windows.
+
+protected:
+  void SetUp() override {
+    // Obtain the platform's environment block.
+    auto EnvP = [] {
+#if defined(_WIN32)
+      _wgetenv(L"TMP"); // Populate _wenviron, initially is null
+      return _wenviron;
+#elif defined(__APPLE__)
+      return *_NSGetEnviron();
+#else
+      return environ;
+#endif
+    }();
+    ASSERT_TRUE(EnvP);
+
+    // Normalize one raw environment entry to a StringRef, converting from
+    // UTF-16 (and storing the result in EnvStorage) on Windows.
+    auto prepareEnvVar = [this](decltype(*EnvP) Var) -> StringRef {
+#if defined(_WIN32)
+      // On Windows convert UTF16 encoded variable to UTF8
+      auto Len = wcslen(Var);
+      ArrayRef<char> Ref{reinterpret_cast<char const *>(Var),
+                         Len * sizeof(*Var)};
+      EnvStorage.emplace_back();
+      auto convStatus = convertUTF16ToUTF8String(Ref, EnvStorage.back());
+      EXPECT_TRUE(convStatus);
+      return EnvStorage.back();
+#else
+      (void)this;
+      return StringRef(Var);
+#endif
+    };
+
+    // Copy everything except gtest's own variables.
+    while (*EnvP != nullptr) {
+      auto S = prepareEnvVar(*EnvP);
+      if (!StringRef(S).starts_with("GTEST_"))
+        EnvTable.emplace_back(S);
+      ++EnvP;
+    }
+  }
+
+  void TearDown() override {
+    EnvTable.clear();
+    EnvStorage.clear();
+  }
+
+  // Append an extra "NAME=value" entry for the child's environment. The
+  // caller must keep the backing string alive until the child is spawned.
+  void addEnvVar(StringRef Var) { EnvTable.emplace_back(Var); }
+
+  ArrayRef<StringRef> getEnviron() const { return EnvTable; }
+};
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+// Exercises MappedFileRegionBumpPtr concurrently from two processes (this
+// process and a re-executed copy of the test binary) sharing one backing
+// file, with multiple threads allocating inside each process.
+TEST_F(CASProgramTest, MappedFileRegionBumpPtrTest) {
+  // Allocate 100 slots from the shared region on a thread pool and verify
+  // each slot holds the value its thread wrote (i.e. allocations are
+  // distinct and writes are not lost).
+  auto TestAllocator = [](StringRef Path) {
+    auto NewFileConstructor = [&](MappedFileRegionBumpPtr &Alloc) -> Error {
+      Alloc.initializeBumpPtr(0);
+      return Error::success();
+    };
+
+    auto Alloc = MappedFileRegionBumpPtr::create(
+        Path, /*Capacity=*/10 * 1024 * 1024,
+        /*BumpPtrOffset=*/0, NewFileConstructor);
+    // Consume and report the error: leaving the Expected unchecked (the old
+    // `if (!Alloc) ASSERT_TRUE(false)`) aborts in assertion-enabled builds
+    // and gives no diagnostic.
+    if (!Alloc) {
+      FAIL() << "MappedFileRegionBumpPtr::create: "
+             << toString(Alloc.takeError());
+      return;
+    }
+
+    std::vector<unsigned *> AllocatedPtr;
+    AllocatedPtr.resize(100);
+    DefaultThreadPool Threads;
+    for (unsigned I = 0; I < 100; ++I) {
+      Threads.async(
+          [&](unsigned Idx) {
+            // Allocate a buffer that is larger than needed so allocator hits
+            // additional pages for test coverage.
+            unsigned *P = (unsigned *)Alloc->allocate(100);
+            *P = Idx;
+            AllocatedPtr[Idx] = P;
+          },
+          I);
+    }
+
+    Threads.wait();
+    for (unsigned I = 0; I < 100; ++I)
+      EXPECT_EQ(*AllocatedPtr[I], I);
+  };
+
+  // Child-process mode: the parent re-executes this test with the shared
+  // file's path in the environment.
+  if (const char *File = getenv("LLVM_CAS_TEST_MAPPED_FILE_REGION")) {
+    TestAllocator(File);
+    exit(0);
+  }
+
+  SmallString<128> FilePath;
+  std::error_code EC =
+      sys::fs::createUniqueDirectory("MappedFileRegionBumpPtr", FilePath);
+  ASSERT_FALSE(EC) << EC.message();
+  sys::path::append(FilePath, "allocation-file");
+
+  std::string Executable =
+      sys::fs::getMainExecutable(TestMainArgv0, &ProgramID);
+  StringRef Argv[] = {
+      Executable, "--gtest_filter=CASProgramTest.MappedFileRegionBumpPtrTest"};
+
+  // Add LLVM_CAS_TEST_MAPPED_FILE_REGION to the environment of the child.
+  std::string EnvVar = "LLVM_CAS_TEST_MAPPED_FILE_REGION=";
+  EnvVar += FilePath.str();
+  addEnvVar(EnvVar);
+
+  std::string Error;
+  bool ExecutionFailed;
+  sys::ProcessInfo PI = sys::ExecuteNoWait(Executable, Argv, getEnviron(), {},
+                                           0, &Error, &ExecutionFailed);
+  // Run the allocator here while the child runs it concurrently against the
+  // same backing file.
+  TestAllocator(FilePath);
+
+  ASSERT_FALSE(ExecutionFailed) << Error;
+  ASSERT_TRUE(Error.empty());
+  ASSERT_NE(PI.Pid, sys::ProcessInfo::InvalidPid) << "Invalid process id";
+  sys::ProcessInfo WaitResult =
+      llvm::sys::Wait(PI, /*SecondsToWait=*/5, &Error);
+  ASSERT_TRUE(Error.empty());
+  // Also require that the child exited cleanly (its EXPECTs all passed).
+  ASSERT_EQ(0, WaitResult.ReturnCode);
+
+  // Clean up after both processes finish testing.
+  sys::fs::remove(FilePath);
+  sys::fs::remove_directories(sys::path::parent_path(FilePath));
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt
index 8892f3e..5ebdc3b 100644
--- a/llvm/unittests/CMakeLists.txt
+++ b/llvm/unittests/CMakeLists.txt
@@ -34,6 +34,7 @@ add_subdirectory(AsmParser)
add_subdirectory(BinaryFormat)
add_subdirectory(Bitcode)
add_subdirectory(Bitstream)
+add_subdirectory(CAS)
add_subdirectory(CGData)
add_subdirectory(CodeGen)
add_subdirectory(DebugInfo)
diff --git a/llvm/unittests/Support/ProgramTest.cpp b/llvm/unittests/Support/ProgramTest.cpp
index b1b35ea..93db38b 100644
--- a/llvm/unittests/Support/ProgramTest.cpp
+++ b/llvm/unittests/Support/ProgramTest.cpp
@@ -10,6 +10,7 @@
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/ExponentialBackoff.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Signals.h"
@@ -561,6 +562,81 @@ TEST_F(ProgramEnvTest, TestLockFile) {
sys::fs::remove(LockedFile);
}
+// Verifies that an exclusive advisory lock held by this process blocks a
+// non-blocking tryLockFile in a second process. The child signals completion
+// by creating a "<lockfile>-finished" marker file.
+TEST_F(ProgramEnvTest, TestLockFileExclusive) {
+  using namespace llvm::sys;
+  using namespace std::chrono_literals;
+
+  if (const char *LockedFile = getenv("LLVM_PROGRAM_TEST_LOCKED_FILE")) {
+    // Child process.
+    int FD2;
+    ASSERT_NO_ERROR(fs::openFileForReadWrite(LockedFile, FD2,
+                                             fs::CD_OpenExisting, fs::OF_None));
+
+    // With a zero timeout this must fail immediately, because the parent
+    // already holds the lock.
+    std::error_code ErrC =
+        fs::tryLockFile(FD2, std::chrono::seconds(0), /*Exclusive=*/true);
+    EXPECT_TRUE(ErrC);
+    close(FD2);
+    // Write a file to indicate just finished.
+    std::string FinishFile = std::string(LockedFile) + "-finished";
+    int FD3;
+    ASSERT_NO_ERROR(fs::openFileForReadWrite(FinishFile, FD3, fs::CD_CreateNew,
+                                             fs::OF_None));
+    close(FD3);
+    exit(0);
+  }
+
+  // Create file that will be locked.
+  SmallString<64> LockedFile;
+  int FD1;
+  ASSERT_NO_ERROR(
+      fs::createUniqueDirectory("TestLockFileExclusive", LockedFile));
+  sys::path::append(LockedFile, "file");
+  ASSERT_NO_ERROR(
+      fs::openFileForReadWrite(LockedFile, FD1, fs::CD_CreateNew, fs::OF_None));
+
+  std::string Executable =
+      sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1);
+  StringRef argv[] = {Executable,
+                      "--gtest_filter=ProgramEnvTest.TestLockFileExclusive"};
+
+  // Add LLVM_PROGRAM_TEST_LOCKED_FILE to the environment of the child.
+  std::string EnvVar = "LLVM_PROGRAM_TEST_LOCKED_FILE=";
+  EnvVar += LockedFile.str();
+  addEnvVar(EnvVar);
+
+  // Lock the file, then spawn the child that tries (and must fail) to lock it.
+  ASSERT_NO_ERROR(fs::tryLockFile(FD1));
+
+  std::string Error;
+  bool ExecutionFailed;
+  ProcessInfo PI2 = ExecuteNoWait(Executable, argv, getEnviron(), {}, 0, &Error,
+                                  &ExecutionFailed);
+  ASSERT_FALSE(ExecutionFailed) << Error;
+  ASSERT_TRUE(Error.empty());
+  ASSERT_NE(PI2.Pid, ProcessInfo::InvalidPid) << "Invalid process id";
+
+  std::string FinishFile = std::string(LockedFile) + "-finished";
+  // Wait till child process writes the file to indicate the job finished.
+  bool Finished = false;
+  ExponentialBackoff Backoff(5s); // timeout 5s.
+  do {
+    if (fs::exists(FinishFile)) {
+      Finished = true;
+      break;
+    }
+  } while (Backoff.waitForNextAttempt());
+
+  // Only unlock after the child has observed the held lock.
+  ASSERT_TRUE(Finished);
+  ASSERT_NO_ERROR(fs::unlockFile(FD1));
+  ProcessInfo WaitResult = llvm::sys::Wait(PI2, /*SecondsToWait=*/1, &Error);
+  ASSERT_TRUE(Error.empty());
+  ASSERT_EQ(0, WaitResult.ReturnCode);
+  ASSERT_EQ(WaitResult.Pid, PI2.Pid);
+  sys::fs::remove(LockedFile);
+  sys::fs::remove(FinishFile);
+  sys::fs::remove_directories(sys::path::parent_path(LockedFile));
+}
+
TEST_F(ProgramEnvTest, TestExecuteWithNoStacktraceHandler) {
using namespace llvm::sys;