aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/CAS
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/CAS')
-rw-r--r--llvm/lib/CAS/CMakeLists.txt2
-rw-r--r--llvm/lib/CAS/OnDiskCommon.cpp56
-rw-r--r--llvm/lib/CAS/OnDiskCommon.h22
-rw-r--r--llvm/lib/CAS/OnDiskDataAllocator.cpp6
-rw-r--r--llvm/lib/CAS/OnDiskGraphDB.cpp1755
-rw-r--r--llvm/lib/CAS/OnDiskKeyValueDB.cpp113
6 files changed, 1951 insertions, 3 deletions
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index bca39b6..a2f8c49 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -8,6 +8,8 @@ add_llvm_component_library(LLVMCAS
ObjectStore.cpp
OnDiskCommon.cpp
OnDiskDataAllocator.cpp
+ OnDiskGraphDB.cpp
+ OnDiskKeyValueDB.cpp
OnDiskTrieRawHashMap.cpp
ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/CAS/OnDiskCommon.cpp b/llvm/lib/CAS/OnDiskCommon.cpp
index 25aa06b..281bde9 100644
--- a/llvm/lib/CAS/OnDiskCommon.cpp
+++ b/llvm/lib/CAS/OnDiskCommon.cpp
@@ -7,9 +7,10 @@
//===----------------------------------------------------------------------===//
#include "OnDiskCommon.h"
-#include "llvm/Config/config.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Process.h"
+#include <mutex>
#include <thread>
#if __has_include(<sys/file.h>)
@@ -25,8 +26,44 @@
#include <fcntl.h>
#endif
+#if __has_include(<sys/mount.h>)
+#include <sys/mount.h> // statfs
+#endif
+
using namespace llvm;
+static uint64_t OnDiskCASMaxMappingSize = 0;
+
+Expected<std::optional<uint64_t>> cas::ondisk::getOverriddenMaxMappingSize() {
+ static std::once_flag Flag;
+ Error Err = Error::success();
+ std::call_once(Flag, [&Err] {
+ ErrorAsOutParameter EAO(&Err);
+ constexpr const char *EnvVar = "LLVM_CAS_MAX_MAPPING_SIZE";
+ auto Value = sys::Process::GetEnv(EnvVar);
+ if (!Value)
+ return;
+
+ uint64_t Size;
+ if (StringRef(*Value).getAsInteger(/*auto*/ 0, Size))
+ Err = createStringError(inconvertibleErrorCode(),
+ "invalid value for %s: expected integer", EnvVar);
+ OnDiskCASMaxMappingSize = Size;
+ });
+
+ if (Err)
+ return std::move(Err);
+
+ if (OnDiskCASMaxMappingSize == 0)
+ return std::nullopt;
+
+ return OnDiskCASMaxMappingSize;
+}
+
+void cas::ondisk::setMaxMappingSize(uint64_t Size) {
+ OnDiskCASMaxMappingSize = Size;
+}
+
std::error_code cas::ondisk::lockFileThreadSafe(int FD,
sys::fs::LockKind Kind) {
#if HAVE_FLOCK
@@ -125,3 +162,20 @@ Expected<size_t> cas::ondisk::preallocateFileTail(int FD, size_t CurrentSize,
return NewSize; // Pretend it worked.
#endif
}
+
+bool cas::ondisk::useSmallMappingSize(const Twine &P) {
+ // Add exceptions to use small database file here.
+#if defined(__APPLE__) && __has_include(<sys/mount.h>)
+ // macOS tmpfs does not support sparse tails.
+ SmallString<128> PathStorage;
+ StringRef Path = P.toNullTerminatedStringRef(PathStorage);
+ struct statfs StatFS;
+ if (statfs(Path.data(), &StatFS) != 0)
+ return false;
+
+ if (strcmp(StatFS.f_fstypename, "tmpfs") == 0)
+ return true;
+#endif
+ // Default to use regular database file.
+ return false;
+}
diff --git a/llvm/lib/CAS/OnDiskCommon.h b/llvm/lib/CAS/OnDiskCommon.h
index 8b79ffe..ac00662 100644
--- a/llvm/lib/CAS/OnDiskCommon.h
+++ b/llvm/lib/CAS/OnDiskCommon.h
@@ -12,9 +12,31 @@
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include <chrono>
+#include <optional>
namespace llvm::cas::ondisk {
+/// The version for all the ondisk database files. It needs to be bumped when
+/// compatibility breaking changes are introduced.
+constexpr StringLiteral CASFormatVersion = "v1";
+
+/// Retrieves an overridden maximum mapping size for CAS files, if any,
+/// speicified by LLVM_CAS_MAX_MAPPING_SIZE in the environment or set by
+/// `setMaxMappingSize()`. If the value from environment is unreadable, returns
+/// an error.
+Expected<std::optional<uint64_t>> getOverriddenMaxMappingSize();
+
+/// Set MaxMappingSize for ondisk CAS. This function is not thread-safe and
+/// should be set before creaing any ondisk CAS and does not affect CAS already
+/// created. Set value 0 to use default size.
+void setMaxMappingSize(uint64_t Size);
+
+/// Whether to use a small file mapping for ondisk databases created in \p Path.
+///
+/// For some file system that doesn't support sparse file, use a smaller file
+/// mapping to avoid consuming too much disk space on creation.
+bool useSmallMappingSize(const Twine &Path);
+
/// Thread-safe alternative to \c sys::fs::lockFile. This does not support all
/// the platforms that \c sys::fs::lockFile does, so keep it in the CAS library
/// for now.
diff --git a/llvm/lib/CAS/OnDiskDataAllocator.cpp b/llvm/lib/CAS/OnDiskDataAllocator.cpp
index 13bbd66..9c68bc4 100644
--- a/llvm/lib/CAS/OnDiskDataAllocator.cpp
+++ b/llvm/lib/CAS/OnDiskDataAllocator.cpp
@@ -185,7 +185,7 @@ Expected<ArrayRef<char>> OnDiskDataAllocator::get(FileOffset Offset,
return ArrayRef<char>{Impl->File.getRegion().data() + Offset.get(), Size};
}
-MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() {
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() const {
return Impl->Store.getUserHeader();
}
@@ -221,7 +221,9 @@ Expected<ArrayRef<char>> OnDiskDataAllocator::get(FileOffset Offset,
"OnDiskDataAllocator is not supported");
}
-MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() { return {}; }
+MutableArrayRef<uint8_t> OnDiskDataAllocator::getUserHeader() const {
+ return {};
+}
size_t OnDiskDataAllocator::size() const { return 0; }
size_t OnDiskDataAllocator::capacity() const { return 0; }
diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp
new file mode 100644
index 0000000..72bb98c
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskGraphDB.cpp
@@ -0,0 +1,1755 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements OnDiskGraphDB, an on-disk CAS nodes database,
+/// independent of a particular hashing algorithm. It only needs to be
+/// configured for the hash size and controls the schema of the storage.
+///
+/// OnDiskGraphDB defines:
+///
+/// - How the data is stored inside database, either as a standalone file, or
+/// allocated inside a datapool.
+/// - How references to other objects inside the same database is stored. They
+/// are stored as internal references, instead of full hash value to save
+/// space.
+/// - How to chain databases together and import objects from upstream
+/// databases.
+///
+/// Here's a top-level description of the current layout:
+///
+/// - db/index.<version>: a file for the "index" table, named by \a
+/// IndexTableName and managed by \a TrieRawHashMap. The contents are 8B
+/// that are accessed atomically, describing the object kind and where/how
+/// it's stored (including an optional file offset). See \a TrieRecord for
+/// more details.
+/// - db/data.<version>: a file for the "data" table, named by \a
+/// DataPoolTableName and managed by \a DataStore. New objects within
+/// TrieRecord::MaxEmbeddedSize are inserted here as \a
+/// TrieRecord::StorageKind::DataPool.
+/// - db/obj.<offset>.<version>: a file storing an object outside the main
+/// "data" table, named by its offset into the "index" table, with the
+/// format of \a TrieRecord::StorageKind::Standalone.
+/// - db/leaf.<offset>.<version>: a file storing a leaf node outside the
+/// main "data" table, named by its offset into the "index" table, with
+/// the format of \a TrieRecord::StorageKind::StandaloneLeaf.
+/// - db/leaf+0.<offset>.<version>: a file storing a null-terminated leaf object
+/// outside the main "data" table, named by its offset into the "index" table,
+/// with the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CAS/OnDiskDataAllocator.h"
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include <atomic>
+#include <mutex>
+#include <optional>
+
+#define DEBUG_TYPE "on-disk-cas"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+static constexpr StringLiteral IndexTableName = "llvm.cas.index";
+static constexpr StringLiteral DataPoolTableName = "llvm.cas.data";
+
+static constexpr StringLiteral IndexFilePrefix = "index.";
+static constexpr StringLiteral DataPoolFilePrefix = "data.";
+
+static constexpr StringLiteral FilePrefixObject = "obj.";
+static constexpr StringLiteral FilePrefixLeaf = "leaf.";
+static constexpr StringLiteral FilePrefixLeaf0 = "leaf+0.";
+
+static Error createCorruptObjectError(Expected<ArrayRef<uint8_t>> ID) {
+ if (!ID)
+ return ID.takeError();
+
+ return createStringError(llvm::errc::invalid_argument,
+ "corrupt object '" + toHex(*ID) + "'");
+}
+
+namespace {
+
+/// Trie record data: 8 bytes, atomic<uint64_t>
+/// - 1-byte: StorageKind
+/// - 7-bytes: DataStoreOffset (offset into referenced file)
+class TrieRecord {
+public:
+ enum class StorageKind : uint8_t {
+ /// Unknown object.
+ Unknown = 0,
+
+ /// data.vX: main pool, full DataStore record.
+ DataPool = 1,
+
+ /// obj.<TrieRecordOffset>.vX: standalone, with a full DataStore record.
+ Standalone = 10,
+
+ /// leaf.<TrieRecordOffset>.vX: standalone, just the data. File contents
+ /// exactly the data content and file size matches the data size. No refs.
+ StandaloneLeaf = 11,
+
+ /// leaf+0.<TrieRecordOffset>.vX: standalone, just the data plus an
+ /// extra null character ('\0'). File size is 1 bigger than the data size.
+ /// No refs.
+ StandaloneLeaf0 = 12,
+ };
+
+ static StringRef getStandaloneFilePrefix(StorageKind SK) {
+ switch (SK) {
+ default:
+ llvm_unreachable("Expected standalone storage kind");
+ case TrieRecord::StorageKind::Standalone:
+ return FilePrefixObject;
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ return FilePrefixLeaf;
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ return FilePrefixLeaf0;
+ }
+ }
+
+ enum Limits : int64_t {
+ /// Saves files bigger than 64KB standalone instead of embedding them.
+ MaxEmbeddedSize = 64LL * 1024LL - 1,
+ };
+
+ struct Data {
+ StorageKind SK = StorageKind::Unknown;
+ FileOffset Offset;
+ };
+
+ /// Pack StorageKind and Offset from Data into 8 byte TrieRecord.
+ static uint64_t pack(Data D) {
+ assert(D.Offset.get() < (int64_t)(1ULL << 56));
+ uint64_t Packed = uint64_t(D.SK) << 56 | D.Offset.get();
+ assert(D.SK != StorageKind::Unknown || Packed == 0);
+#ifndef NDEBUG
+ Data RoundTrip = unpack(Packed);
+ assert(D.SK == RoundTrip.SK);
+ assert(D.Offset.get() == RoundTrip.Offset.get());
+#endif
+ return Packed;
+ }
+
+ // Unpack TrieRecord into Data.
+ static Data unpack(uint64_t Packed) {
+ Data D;
+ if (!Packed)
+ return D;
+ D.SK = (StorageKind)(Packed >> 56);
+ D.Offset = FileOffset(Packed & (UINT64_MAX >> 8));
+ return D;
+ }
+
+ TrieRecord() : Storage(0) {}
+
+ Data load() const { return unpack(Storage); }
+ bool compare_exchange_strong(Data &Existing, Data New);
+
+private:
+ std::atomic<uint64_t> Storage;
+};
+
+/// DataStore record data: 4B + size? + refs? + data + 0
+/// - 4-bytes: Header
+/// - {0,4,8}-bytes: DataSize (may be packed in Header)
+/// - {0,4,8}-bytes: NumRefs (may be packed in Header)
+/// - NumRefs*{4,8}-bytes: Refs[] (end-ptr is 8-byte aligned)
+/// - <data>
+/// - 1-byte: 0-term
+struct DataRecordHandle {
+ /// NumRefs storage: 4B, 2B, 1B, or 0B (no refs). Or, 8B, for alignment
+ /// convenience to avoid computing padding later.
+ enum class NumRefsFlags : uint8_t {
+ Uses0B = 0U,
+ Uses1B = 1U,
+ Uses2B = 2U,
+ Uses4B = 3U,
+ Uses8B = 4U,
+ Max = Uses8B,
+ };
+
+ /// DataSize storage: 8B, 4B, 2B, or 1B.
+ enum class DataSizeFlags {
+ Uses1B = 0U,
+ Uses2B = 1U,
+ Uses4B = 2U,
+ Uses8B = 3U,
+ Max = Uses8B,
+ };
+
+ /// Kind of ref stored in Refs[]: InternalRef or InternalRef4B.
+ enum class RefKindFlags {
+ InternalRef = 0U,
+ InternalRef4B = 1U,
+ Max = InternalRef4B,
+ };
+
+ enum Counts : int {
+ NumRefsShift = 0,
+ NumRefsBits = 3,
+ DataSizeShift = NumRefsShift + NumRefsBits,
+ DataSizeBits = 2,
+ RefKindShift = DataSizeShift + DataSizeBits,
+ RefKindBits = 1,
+ };
+ static_assert(((UINT32_MAX << NumRefsBits) & (uint32_t)NumRefsFlags::Max) ==
+ 0,
+ "Not enough bits");
+ static_assert(((UINT32_MAX << DataSizeBits) & (uint32_t)DataSizeFlags::Max) ==
+ 0,
+ "Not enough bits");
+ static_assert(((UINT32_MAX << RefKindBits) & (uint32_t)RefKindFlags::Max) ==
+ 0,
+ "Not enough bits");
+
+ /// Layout of the DataRecordHandle and how to decode it.
+ struct LayoutFlags {
+ NumRefsFlags NumRefs;
+ DataSizeFlags DataSize;
+ RefKindFlags RefKind;
+
+ static uint64_t pack(LayoutFlags LF) {
+ unsigned Packed = ((unsigned)LF.NumRefs << NumRefsShift) |
+ ((unsigned)LF.DataSize << DataSizeShift) |
+ ((unsigned)LF.RefKind << RefKindShift);
+#ifndef NDEBUG
+ LayoutFlags RoundTrip = unpack(Packed);
+ assert(LF.NumRefs == RoundTrip.NumRefs);
+ assert(LF.DataSize == RoundTrip.DataSize);
+ assert(LF.RefKind == RoundTrip.RefKind);
+#endif
+ return Packed;
+ }
+ static LayoutFlags unpack(uint64_t Storage) {
+ assert(Storage <= UINT8_MAX && "Expect storage to fit in a byte");
+ LayoutFlags LF;
+ LF.NumRefs =
+ (NumRefsFlags)((Storage >> NumRefsShift) & ((1U << NumRefsBits) - 1));
+ LF.DataSize = (DataSizeFlags)((Storage >> DataSizeShift) &
+ ((1U << DataSizeBits) - 1));
+ LF.RefKind =
+ (RefKindFlags)((Storage >> RefKindShift) & ((1U << RefKindBits) - 1));
+ return LF;
+ }
+ };
+
+ /// Header layout:
+ /// - 1-byte: LayoutFlags
+ /// - 1-byte: 1B size field
+ /// - {0,2}-bytes: 2B size field
+ struct Header {
+ using PackTy = uint32_t;
+ PackTy Packed;
+
+ static constexpr unsigned LayoutFlagsShift =
+ (sizeof(PackTy) - 1) * CHAR_BIT;
+ };
+
+ struct Input {
+ InternalRefArrayRef Refs;
+ ArrayRef<char> Data;
+ };
+
+ LayoutFlags getLayoutFlags() const {
+ return LayoutFlags::unpack(H->Packed >> Header::LayoutFlagsShift);
+ }
+
+ uint64_t getDataSize() const;
+ void skipDataSize(LayoutFlags LF, int64_t &RelOffset) const;
+ uint32_t getNumRefs() const;
+ void skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const;
+ int64_t getRefsRelOffset() const;
+ int64_t getDataRelOffset() const;
+
+ static uint64_t getTotalSize(uint64_t DataRelOffset, uint64_t DataSize) {
+ return DataRelOffset + DataSize + 1;
+ }
+ uint64_t getTotalSize() const {
+ return getDataRelOffset() + getDataSize() + 1;
+ }
+
+ /// Describe the layout of data stored and how to decode from
+ /// DataRecordHandle.
+ struct Layout {
+ explicit Layout(const Input &I);
+
+ LayoutFlags Flags;
+ uint64_t DataSize = 0;
+ uint32_t NumRefs = 0;
+ int64_t RefsRelOffset = 0;
+ int64_t DataRelOffset = 0;
+ uint64_t getTotalSize() const {
+ return DataRecordHandle::getTotalSize(DataRelOffset, DataSize);
+ }
+ };
+
+ InternalRefArrayRef getRefs() const {
+ assert(H && "Expected valid handle");
+ auto *BeginByte = reinterpret_cast<const char *>(H) + getRefsRelOffset();
+ size_t Size = getNumRefs();
+ if (!Size)
+ return InternalRefArrayRef();
+ if (getLayoutFlags().RefKind == RefKindFlags::InternalRef4B)
+ return ArrayRef(reinterpret_cast<const InternalRef4B *>(BeginByte), Size);
+ return ArrayRef(reinterpret_cast<const InternalRef *>(BeginByte), Size);
+ }
+
+ ArrayRef<char> getData() const {
+ assert(H && "Expected valid handle");
+ return ArrayRef(reinterpret_cast<const char *>(H) + getDataRelOffset(),
+ getDataSize());
+ }
+
+ static DataRecordHandle create(function_ref<char *(size_t Size)> Alloc,
+ const Input &I);
+ static Expected<DataRecordHandle>
+ createWithError(function_ref<Expected<char *>(size_t Size)> Alloc,
+ const Input &I);
+ static DataRecordHandle construct(char *Mem, const Input &I);
+
+ static DataRecordHandle get(const char *Mem) {
+ return DataRecordHandle(
+ *reinterpret_cast<const DataRecordHandle::Header *>(Mem));
+ }
+ static Expected<DataRecordHandle>
+ getFromDataPool(const OnDiskDataAllocator &Pool, FileOffset Offset);
+
+ explicit operator bool() const { return H; }
+ const Header &getHeader() const { return *H; }
+
+ DataRecordHandle() = default;
+ explicit DataRecordHandle(const Header &H) : H(&H) {}
+
+private:
+ static DataRecordHandle constructImpl(char *Mem, const Input &I,
+ const Layout &L);
+ const Header *H = nullptr;
+};
+
+/// Proxy for any on-disk object or raw data.
+struct OnDiskContent {
+ std::optional<DataRecordHandle> Record;
+ std::optional<ArrayRef<char>> Bytes;
+};
+
+/// Data loaded inside the memory from standalone file.
+class StandaloneDataInMemory {
+public:
+ OnDiskContent getContent() const;
+
+ StandaloneDataInMemory(std::unique_ptr<sys::fs::mapped_file_region> Region,
+ TrieRecord::StorageKind SK)
+ : Region(std::move(Region)), SK(SK) {
+#ifndef NDEBUG
+ bool IsStandalone = false;
+ switch (SK) {
+ case TrieRecord::StorageKind::Standalone:
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ IsStandalone = true;
+ break;
+ default:
+ break;
+ }
+ assert(IsStandalone);
+#endif
+ }
+
+private:
+ std::unique_ptr<sys::fs::mapped_file_region> Region;
+ TrieRecord::StorageKind SK;
+};
+
+/// Container to lookup loaded standalone objects.
+template <size_t NumShards> class StandaloneDataMap {
+ static_assert(isPowerOf2_64(NumShards), "Expected power of 2");
+
+public:
+ uintptr_t insert(ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
+ std::unique_ptr<sys::fs::mapped_file_region> Region);
+
+ const StandaloneDataInMemory *lookup(ArrayRef<uint8_t> Hash) const;
+ bool count(ArrayRef<uint8_t> Hash) const { return bool(lookup(Hash)); }
+
+private:
+ struct Shard {
+ /// Needs to store a std::unique_ptr for a stable address identity.
+ DenseMap<const uint8_t *, std::unique_ptr<StandaloneDataInMemory>> Map;
+ mutable std::mutex Mutex;
+ };
+ Shard &getShard(ArrayRef<uint8_t> Hash) {
+ return const_cast<Shard &>(
+ const_cast<const StandaloneDataMap *>(this)->getShard(Hash));
+ }
+ const Shard &getShard(ArrayRef<uint8_t> Hash) const {
+ static_assert(NumShards <= 256, "Expected only 8 bits of shard");
+ return Shards[Hash[0] % NumShards];
+ }
+
+ Shard Shards[NumShards];
+};
+
+using StandaloneDataMapTy = StandaloneDataMap<16>;
+
+/// A vector of internal node references.
+class InternalRefVector {
+public:
+ void push_back(InternalRef Ref) {
+ if (NeedsFull)
+ return FullRefs.push_back(Ref);
+ if (std::optional<InternalRef4B> Small = InternalRef4B::tryToShrink(Ref))
+ return SmallRefs.push_back(*Small);
+ NeedsFull = true;
+ assert(FullRefs.empty());
+ FullRefs.reserve(SmallRefs.size() + 1);
+ for (InternalRef4B Small : SmallRefs)
+ FullRefs.push_back(Small);
+ FullRefs.push_back(Ref);
+ SmallRefs.clear();
+ }
+
+ operator InternalRefArrayRef() const {
+ assert(SmallRefs.empty() || FullRefs.empty());
+ return NeedsFull ? InternalRefArrayRef(FullRefs)
+ : InternalRefArrayRef(SmallRefs);
+ }
+
+private:
+ bool NeedsFull = false;
+ SmallVector<InternalRef4B> SmallRefs;
+ SmallVector<InternalRef> FullRefs;
+};
+
+} // namespace
+
+Expected<DataRecordHandle> DataRecordHandle::createWithError(
+ function_ref<Expected<char *>(size_t Size)> Alloc, const Input &I) {
+ Layout L(I);
+ if (Expected<char *> Mem = Alloc(L.getTotalSize()))
+ return constructImpl(*Mem, I, L);
+ else
+ return Mem.takeError();
+}
+
+DataRecordHandle
+DataRecordHandle::create(function_ref<char *(size_t Size)> Alloc,
+ const Input &I) {
+ Layout L(I);
+ return constructImpl(Alloc(L.getTotalSize()), I, L);
+}
+
+ObjectHandle ObjectHandle::fromFileOffset(FileOffset Offset) {
+ // Store the file offset as it is.
+ assert(!(Offset.get() & 0x1));
+ return ObjectHandle(Offset.get());
+}
+
+ObjectHandle ObjectHandle::fromMemory(uintptr_t Ptr) {
+ // Store the pointer from memory with lowest bit set.
+ assert(!(Ptr & 0x1));
+ return ObjectHandle(Ptr | 1);
+}
+
+/// Proxy for an on-disk index record.
+struct OnDiskGraphDB::IndexProxy {
+ FileOffset Offset;
+ ArrayRef<uint8_t> Hash;
+ TrieRecord &Ref;
+};
+
+template <size_t N>
+uintptr_t StandaloneDataMap<N>::insert(
+ ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
+ std::unique_ptr<sys::fs::mapped_file_region> Region) {
+ auto &S = getShard(Hash);
+ std::lock_guard<std::mutex> Lock(S.Mutex);
+ auto &V = S.Map[Hash.data()];
+ if (!V)
+ V = std::make_unique<StandaloneDataInMemory>(std::move(Region), SK);
+ return reinterpret_cast<uintptr_t>(V.get());
+}
+
+template <size_t N>
+const StandaloneDataInMemory *
+StandaloneDataMap<N>::lookup(ArrayRef<uint8_t> Hash) const {
+ auto &S = getShard(Hash);
+ std::lock_guard<std::mutex> Lock(S.Mutex);
+ auto I = S.Map.find(Hash.data());
+ if (I == S.Map.end())
+ return nullptr;
+ return &*I->second;
+}
+
+namespace {
+
+/// Copy of \a sys::fs::TempFile that skips RemoveOnSignal, which is too
+/// expensive to register/unregister at this rate.
+///
+/// FIXME: Add a TempFileManager that maintains a thread-safe list of open temp
+/// files and has a signal handler registerd that removes them all.
+class TempFile {
+ bool Done = false;
+ TempFile(StringRef Name, int FD) : TmpName(std::string(Name)), FD(FD) {}
+
+public:
+ /// This creates a temporary file with createUniqueFile.
+ static Expected<TempFile> create(const Twine &Model);
+ TempFile(TempFile &&Other) { *this = std::move(Other); }
+ TempFile &operator=(TempFile &&Other) {
+ TmpName = std::move(Other.TmpName);
+ FD = Other.FD;
+ Other.Done = true;
+ Other.FD = -1;
+ return *this;
+ }
+
+ // Name of the temporary file.
+ std::string TmpName;
+
+ // The open file descriptor.
+ int FD = -1;
+
+ // Keep this with the given name.
+ Error keep(const Twine &Name);
+ Error discard();
+
+ // This checks that keep or delete was called.
+ ~TempFile() { consumeError(discard()); }
+};
+
+class MappedTempFile {
+public:
+ char *data() const { return Map.data(); }
+ size_t size() const { return Map.size(); }
+
+ Error discard() {
+ assert(Map && "Map already destroyed");
+ Map.unmap();
+ return Temp.discard();
+ }
+
+ Error keep(const Twine &Name) {
+ assert(Map && "Map already destroyed");
+ Map.unmap();
+ return Temp.keep(Name);
+ }
+
+ MappedTempFile(TempFile Temp, sys::fs::mapped_file_region Map)
+ : Temp(std::move(Temp)), Map(std::move(Map)) {}
+
+private:
+ TempFile Temp;
+ sys::fs::mapped_file_region Map;
+};
+} // namespace
+
+Error TempFile::discard() {
+ Done = true;
+ if (FD != -1) {
+ sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+ if (std::error_code EC = sys::fs::closeFile(File))
+ return errorCodeToError(EC);
+ }
+ FD = -1;
+
+ // Always try to close and remove.
+ std::error_code RemoveEC;
+ if (!TmpName.empty()) {
+ std::error_code EC = sys::fs::remove(TmpName);
+ if (EC)
+ return errorCodeToError(EC);
+ }
+ TmpName = "";
+
+ return Error::success();
+}
+
+Error TempFile::keep(const Twine &Name) {
+ assert(!Done);
+ Done = true;
+ // Always try to close and rename.
+ std::error_code RenameEC = sys::fs::rename(TmpName, Name);
+
+ if (!RenameEC)
+ TmpName = "";
+
+ sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+ if (std::error_code EC = sys::fs::closeFile(File))
+ return errorCodeToError(EC);
+ FD = -1;
+
+ return errorCodeToError(RenameEC);
+}
+
+Expected<TempFile> TempFile::create(const Twine &Model) {
+ int FD;
+ SmallString<128> ResultPath;
+ if (std::error_code EC = sys::fs::createUniqueFile(Model, FD, ResultPath))
+ return errorCodeToError(EC);
+
+ TempFile Ret(ResultPath, FD);
+ return std::move(Ret);
+}
+
+bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) {
+ uint64_t ExistingPacked = pack(Existing);
+ uint64_t NewPacked = pack(New);
+ if (Storage.compare_exchange_strong(ExistingPacked, NewPacked))
+ return true;
+ Existing = unpack(ExistingPacked);
+ return false;
+}
+
+DataRecordHandle DataRecordHandle::construct(char *Mem, const Input &I) {
+ return constructImpl(Mem, I, Layout(I));
+}
+
+Expected<DataRecordHandle>
+DataRecordHandle::getFromDataPool(const OnDiskDataAllocator &Pool,
+ FileOffset Offset) {
+ auto HeaderData = Pool.get(Offset, sizeof(DataRecordHandle::Header));
+ if (!HeaderData)
+ return HeaderData.takeError();
+
+ auto Record = DataRecordHandle::get(HeaderData->data());
+ if (Record.getTotalSize() + Offset.get() > Pool.size())
+ return createStringError(
+ make_error_code(std::errc::illegal_byte_sequence),
+ "data record span passed the end of the data pool");
+
+ return Record;
+}
+
+DataRecordHandle DataRecordHandle::constructImpl(char *Mem, const Input &I,
+ const Layout &L) {
+ char *Next = Mem + sizeof(Header);
+
+ // Fill in Packed and set other data, then come back to construct the header.
+ Header::PackTy Packed = 0;
+ Packed |= LayoutFlags::pack(L.Flags) << Header::LayoutFlagsShift;
+
+ // Construct DataSize.
+ switch (L.Flags.DataSize) {
+ case DataSizeFlags::Uses1B:
+ assert(I.Data.size() <= UINT8_MAX);
+ Packed |= (Header::PackTy)I.Data.size()
+ << ((sizeof(Packed) - 2) * CHAR_BIT);
+ break;
+ case DataSizeFlags::Uses2B:
+ assert(I.Data.size() <= UINT16_MAX);
+ Packed |= (Header::PackTy)I.Data.size()
+ << ((sizeof(Packed) - 4) * CHAR_BIT);
+ break;
+ case DataSizeFlags::Uses4B:
+ support::endian::write32le(Next, I.Data.size());
+ Next += 4;
+ break;
+ case DataSizeFlags::Uses8B:
+ support::endian::write64le(Next, I.Data.size());
+ Next += 8;
+ break;
+ }
+
+ // Construct NumRefs.
+ //
+ // NOTE: May be writing NumRefs even if there are zero refs in order to fix
+ // alignment.
+ switch (L.Flags.NumRefs) {
+ case NumRefsFlags::Uses0B:
+ break;
+ case NumRefsFlags::Uses1B:
+ assert(I.Refs.size() <= UINT8_MAX);
+ Packed |= (Header::PackTy)I.Refs.size()
+ << ((sizeof(Packed) - 2) * CHAR_BIT);
+ break;
+ case NumRefsFlags::Uses2B:
+ assert(I.Refs.size() <= UINT16_MAX);
+ Packed |= (Header::PackTy)I.Refs.size()
+ << ((sizeof(Packed) - 4) * CHAR_BIT);
+ break;
+ case NumRefsFlags::Uses4B:
+ support::endian::write32le(Next, I.Refs.size());
+ Next += 4;
+ break;
+ case NumRefsFlags::Uses8B:
+ support::endian::write64le(Next, I.Refs.size());
+ Next += 8;
+ break;
+ }
+
+ // Construct Refs[].
+ if (!I.Refs.empty()) {
+ assert((L.Flags.RefKind == RefKindFlags::InternalRef4B) == I.Refs.is4B());
+ ArrayRef<uint8_t> RefsBuffer = I.Refs.getBuffer();
+ llvm::copy(RefsBuffer, Next);
+ Next += RefsBuffer.size();
+ }
+
+ // Construct Data and the trailing null.
+ assert(isAddrAligned(Align(8), Next));
+ llvm::copy(I.Data, Next);
+ Next[I.Data.size()] = 0;
+
+ // Construct the header itself and return.
+ Header *H = new (Mem) Header{Packed};
+ DataRecordHandle Record(*H);
+ assert(Record.getData() == I.Data);
+ assert(Record.getNumRefs() == I.Refs.size());
+ assert(Record.getRefs() == I.Refs);
+ assert(Record.getLayoutFlags().DataSize == L.Flags.DataSize);
+ assert(Record.getLayoutFlags().NumRefs == L.Flags.NumRefs);
+ assert(Record.getLayoutFlags().RefKind == L.Flags.RefKind);
+ return Record;
+}
+
+DataRecordHandle::Layout::Layout(const Input &I) {
+ // Start initial relative offsets right after the Header.
+ uint64_t RelOffset = sizeof(Header);
+
+ // Initialize the easy stuff.
+ DataSize = I.Data.size();
+ NumRefs = I.Refs.size();
+
+ // Check refs size.
+ Flags.RefKind =
+ I.Refs.is4B() ? RefKindFlags::InternalRef4B : RefKindFlags::InternalRef;
+
+ // Find the smallest slot available for DataSize.
+ bool Has1B = true;
+ bool Has2B = true;
+ if (DataSize <= UINT8_MAX && Has1B) {
+ Flags.DataSize = DataSizeFlags::Uses1B;
+ Has1B = false;
+ } else if (DataSize <= UINT16_MAX && Has2B) {
+ Flags.DataSize = DataSizeFlags::Uses2B;
+ Has2B = false;
+ } else if (DataSize <= UINT32_MAX) {
+ Flags.DataSize = DataSizeFlags::Uses4B;
+ RelOffset += 4;
+ } else {
+ Flags.DataSize = DataSizeFlags::Uses8B;
+ RelOffset += 8;
+ }
+
+ // Find the smallest slot available for NumRefs. Never sets NumRefs8B here.
+ if (!NumRefs) {
+ Flags.NumRefs = NumRefsFlags::Uses0B;
+ } else if (NumRefs <= UINT8_MAX && Has1B) {
+ Flags.NumRefs = NumRefsFlags::Uses1B;
+ Has1B = false;
+ } else if (NumRefs <= UINT16_MAX && Has2B) {
+ Flags.NumRefs = NumRefsFlags::Uses2B;
+ Has2B = false;
+ } else {
+ Flags.NumRefs = NumRefsFlags::Uses4B;
+ RelOffset += 4;
+ }
+
+ // Helper to "upgrade" either DataSize or NumRefs by 4B to avoid complicated
+ // padding rules when reading and writing. This also bumps RelOffset.
+ //
+ // The value for NumRefs is strictly limited to UINT32_MAX, but it can be
+ // stored as 8B. This means we can *always* find a size to grow.
+ //
+ // NOTE: Only call this once.
+ auto GrowSizeFieldsBy4B = [&]() {
+ assert(isAligned(Align(4), RelOffset));
+ RelOffset += 4;
+
+ assert(Flags.NumRefs != NumRefsFlags::Uses8B &&
+ "Expected to be able to grow NumRefs8B");
+
+ // First try to grow DataSize. NumRefs will not (yet) be 8B, and if
+ // DataSize is upgraded to 8B it'll already be aligned.
+ //
+ // Failing that, grow NumRefs.
+ if (Flags.DataSize < DataSizeFlags::Uses4B)
+ Flags.DataSize = DataSizeFlags::Uses4B; // DataSize: Packed => 4B.
+ else if (Flags.DataSize < DataSizeFlags::Uses8B)
+ Flags.DataSize = DataSizeFlags::Uses8B; // DataSize: 4B => 8B.
+ else if (Flags.NumRefs < NumRefsFlags::Uses4B)
+ Flags.NumRefs = NumRefsFlags::Uses4B; // NumRefs: Packed => 4B.
+ else
+ Flags.NumRefs = NumRefsFlags::Uses8B; // NumRefs: 4B => 8B.
+ };
+
+ assert(isAligned(Align(4), RelOffset));
+ if (Flags.RefKind == RefKindFlags::InternalRef) {
+ // List of 8B refs should be 8B-aligned. Grow one of the sizes to get this
+ // without padding.
+ if (!isAligned(Align(8), RelOffset))
+ GrowSizeFieldsBy4B();
+
+ assert(isAligned(Align(8), RelOffset));
+ RefsRelOffset = RelOffset;
+ RelOffset += 8 * NumRefs;
+ } else {
+ // The array of 4B refs doesn't need 8B alignment, but the data will need
+ // to be 8B-aligned. Detect this now, and, if necessary, shift everything
+ // by 4B by growing one of the sizes.
+ // If we remove the need for 8B-alignment for data there is <1% savings in
+ // disk storage for a clang build using MCCAS but the 8B-alignment may be
+ // useful in the future so keep it for now.
+ uint64_t RefListSize = 4 * NumRefs;
+ if (!isAligned(Align(8), RelOffset + RefListSize))
+ GrowSizeFieldsBy4B();
+ RefsRelOffset = RelOffset;
+ RelOffset += RefListSize;
+ }
+
+ assert(isAligned(Align(8), RelOffset));
+ DataRelOffset = RelOffset;
+}
+
+uint64_t DataRecordHandle::getDataSize() const {
+ int64_t RelOffset = sizeof(Header);
+ auto *DataSizePtr = reinterpret_cast<const char *>(H) + RelOffset;
+ switch (getLayoutFlags().DataSize) {
+ case DataSizeFlags::Uses1B:
+ return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
+ case DataSizeFlags::Uses2B:
+ return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
+ UINT16_MAX;
+ case DataSizeFlags::Uses4B:
+ return support::endian::read32le(DataSizePtr);
+ case DataSizeFlags::Uses8B:
+ return support::endian::read64le(DataSizePtr);
+ }
+}
+
+void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const {
+ if (LF.DataSize >= DataSizeFlags::Uses4B)
+ RelOffset += 4;
+ if (LF.DataSize >= DataSizeFlags::Uses8B)
+ RelOffset += 4;
+}
+
+uint32_t DataRecordHandle::getNumRefs() const {
+ LayoutFlags LF = getLayoutFlags();
+ int64_t RelOffset = sizeof(Header);
+ skipDataSize(LF, RelOffset);
+ auto *NumRefsPtr = reinterpret_cast<const char *>(H) + RelOffset;
+ switch (LF.NumRefs) {
+ case NumRefsFlags::Uses0B:
+ return 0;
+ case NumRefsFlags::Uses1B:
+ return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
+ case NumRefsFlags::Uses2B:
+ return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
+ UINT16_MAX;
+ case NumRefsFlags::Uses4B:
+ return support::endian::read32le(NumRefsPtr);
+ case NumRefsFlags::Uses8B:
+ return support::endian::read64le(NumRefsPtr);
+ }
+}
+
+void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const {
+ if (LF.NumRefs >= NumRefsFlags::Uses4B)
+ RelOffset += 4;
+ if (LF.NumRefs >= NumRefsFlags::Uses8B)
+ RelOffset += 4;
+}
+
+int64_t DataRecordHandle::getRefsRelOffset() const {
+ LayoutFlags LF = getLayoutFlags();
+ int64_t RelOffset = sizeof(Header);
+ skipDataSize(LF, RelOffset);
+ skipNumRefs(LF, RelOffset);
+ return RelOffset;
+}
+
+int64_t DataRecordHandle::getDataRelOffset() const {
+ LayoutFlags LF = getLayoutFlags();
+ int64_t RelOffset = sizeof(Header);
+ skipDataSize(LF, RelOffset);
+ skipNumRefs(LF, RelOffset);
+ uint32_t RefSize = LF.RefKind == RefKindFlags::InternalRef4B ? 4 : 8;
+ RelOffset += RefSize * getNumRefs();
+ return RelOffset;
+}
+
+Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const {
+ return Index.validate([&](FileOffset Offset,
+ OnDiskTrieRawHashMap::ConstValueProxy Record)
+ -> Error {
+ auto formatError = [&](Twine Msg) {
+ return createStringError(
+ llvm::errc::illegal_byte_sequence,
+ "bad record at 0x" +
+ utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " +
+ Msg.str());
+ };
+
+ if (Record.Data.size() != sizeof(TrieRecord))
+ return formatError("wrong data record size");
+ if (!isAligned(Align::Of<TrieRecord>(), Record.Data.size()))
+ return formatError("wrong data record alignment");
+
+ auto *R = reinterpret_cast<const TrieRecord *>(Record.Data.data());
+ TrieRecord::Data D = R->load();
+ std::unique_ptr<MemoryBuffer> FileBuffer;
+ if ((uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Unknown &&
+ (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::DataPool &&
+ (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Standalone &&
+ (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf &&
+ (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf0)
+ return formatError("invalid record kind value");
+
+ auto Ref = InternalRef::getFromOffset(Offset);
+ auto I = getIndexProxyFromRef(Ref);
+ if (!I)
+ return I.takeError();
+
+ switch (D.SK) {
+ case TrieRecord::StorageKind::Unknown:
+ // This could be an abandoned entry due to a termination before updating
+ // the record. It can be reused by later insertion so just skip this entry
+ // for now.
+ return Error::success();
+ case TrieRecord::StorageKind::DataPool:
+ // Check offset is a postive value, and large enough to hold the
+ // header for the data record.
+ if (D.Offset.get() <= 0 ||
+ (uint64_t)D.Offset.get() + sizeof(DataRecordHandle::Header) >=
+ DataPool.size())
+ return formatError("datapool record out of bound");
+ break;
+ case TrieRecord::StorageKind::Standalone:
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ SmallString<256> Path;
+ getStandalonePath(TrieRecord::getStandaloneFilePrefix(D.SK), *I, Path);
+ // If need to validate the content of the file later, just load the
+ // buffer here. Otherwise, just check the existance of the file.
+ if (Deep) {
+ auto File = MemoryBuffer::getFile(Path, /*IsText=*/false,
+ /*RequiresNullTerminator=*/false);
+ if (!File || !*File)
+ return formatError("record file \'" + Path + "\' does not exist");
+
+ FileBuffer = std::move(*File);
+ } else if (!llvm::sys::fs::exists(Path))
+ return formatError("record file \'" + Path + "\' does not exist");
+ }
+
+ if (!Deep)
+ return Error::success();
+
+ auto dataError = [&](Twine Msg) {
+ return createStringError(llvm::errc::illegal_byte_sequence,
+ "bad data for digest \'" + toHex(I->Hash) +
+ "\': " + Msg.str());
+ };
+ SmallVector<ArrayRef<uint8_t>> Refs;
+ ArrayRef<char> StoredData;
+
+ switch (D.SK) {
+ case TrieRecord::StorageKind::Unknown:
+ llvm_unreachable("already handled");
+ case TrieRecord::StorageKind::DataPool: {
+ auto DataRecord = DataRecordHandle::getFromDataPool(DataPool, D.Offset);
+ if (!DataRecord)
+ return dataError(toString(DataRecord.takeError()));
+
+ for (auto InternRef : DataRecord->getRefs()) {
+ auto Index = getIndexProxyFromRef(InternRef);
+ if (!Index)
+ return Index.takeError();
+ Refs.push_back(Index->Hash);
+ }
+ StoredData = DataRecord->getData();
+ break;
+ }
+ case TrieRecord::StorageKind::Standalone: {
+ if (FileBuffer->getBufferSize() < sizeof(DataRecordHandle::Header))
+ return dataError("data record is not big enough to read the header");
+ auto DataRecord = DataRecordHandle::get(FileBuffer->getBufferStart());
+ if (DataRecord.getTotalSize() < FileBuffer->getBufferSize())
+ return dataError(
+ "data record span passed the end of the standalone file");
+ for (auto InternRef : DataRecord.getRefs()) {
+ auto Index = getIndexProxyFromRef(InternRef);
+ if (!Index)
+ return Index.takeError();
+ Refs.push_back(Index->Hash);
+ }
+ StoredData = DataRecord.getData();
+ break;
+ }
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ case TrieRecord::StorageKind::StandaloneLeaf0: {
+ StoredData = arrayRefFromStringRef<char>(FileBuffer->getBuffer());
+ if (D.SK == TrieRecord::StorageKind::StandaloneLeaf0) {
+ if (!FileBuffer->getBuffer().ends_with('\0'))
+ return dataError("standalone file is not zero terminated");
+ StoredData = StoredData.drop_back(1);
+ }
+ break;
+ }
+ }
+
+ SmallVector<uint8_t> ComputedHash;
+ Hasher(Refs, StoredData, ComputedHash);
+ if (I->Hash != ArrayRef(ComputedHash))
+ return dataError("hash mismatch, got \'" + toHex(ComputedHash) +
+ "\' instead");
+
+ return Error::success();
+ });
+}
+
+void OnDiskGraphDB::print(raw_ostream &OS) const {
+ OS << "on-disk-root-path: " << RootPath << "\n";
+
+ struct PoolInfo {
+ uint64_t Offset;
+ };
+ SmallVector<PoolInfo> Pool;
+
+ OS << "\n";
+ OS << "index:\n";
+ Index.print(OS, [&](ArrayRef<char> Data) {
+ assert(Data.size() == sizeof(TrieRecord));
+ assert(isAligned(Align::Of<TrieRecord>(), Data.size()));
+ auto *R = reinterpret_cast<const TrieRecord *>(Data.data());
+ TrieRecord::Data D = R->load();
+ OS << " SK=";
+ switch (D.SK) {
+ case TrieRecord::StorageKind::Unknown:
+ OS << "unknown ";
+ break;
+ case TrieRecord::StorageKind::DataPool:
+ OS << "datapool ";
+ Pool.push_back({D.Offset.get()});
+ break;
+ case TrieRecord::StorageKind::Standalone:
+ OS << "standalone-data ";
+ break;
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ OS << "standalone-leaf ";
+ break;
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ OS << "standalone-leaf+0";
+ break;
+ }
+ OS << " Offset=" << (void *)D.Offset.get();
+ });
+ if (Pool.empty())
+ return;
+
+ OS << "\n";
+ OS << "pool:\n";
+ llvm::sort(
+ Pool, [](PoolInfo LHS, PoolInfo RHS) { return LHS.Offset < RHS.Offset; });
+ for (PoolInfo PI : Pool) {
+ OS << "- addr=" << (void *)PI.Offset << " ";
+ auto D = DataRecordHandle::getFromDataPool(DataPool, FileOffset(PI.Offset));
+ if (!D) {
+ OS << "error: " << toString(D.takeError());
+ return;
+ }
+
+ OS << "record refs=" << D->getNumRefs() << " data=" << D->getDataSize()
+ << " size=" << D->getTotalSize()
+ << " end=" << (void *)(PI.Offset + D->getTotalSize()) << "\n";
+ }
+}
+
+Expected<OnDiskGraphDB::IndexProxy>
+OnDiskGraphDB::indexHash(ArrayRef<uint8_t> Hash) {
+ auto P = Index.insertLazy(
+ Hash, [](FileOffset TentativeOffset,
+ OnDiskTrieRawHashMap::ValueProxy TentativeValue) {
+ assert(TentativeValue.Data.size() == sizeof(TrieRecord));
+ assert(
+ isAddrAligned(Align::Of<TrieRecord>(), TentativeValue.Data.data()));
+ new (TentativeValue.Data.data()) TrieRecord();
+ });
+ if (LLVM_UNLIKELY(!P))
+ return P.takeError();
+
+ assert(*P && "Expected insertion");
+ return getIndexProxyFromPointer(*P);
+}
+
+OnDiskGraphDB::IndexProxy OnDiskGraphDB::getIndexProxyFromPointer(
+ OnDiskTrieRawHashMap::ConstOnDiskPtr P) const {
+ assert(P);
+ assert(P.getOffset());
+ return IndexProxy{P.getOffset(), P->Hash,
+ *const_cast<TrieRecord *>(
+ reinterpret_cast<const TrieRecord *>(P->Data.data()))};
+}
+
+Expected<ObjectID> OnDiskGraphDB::getReference(ArrayRef<uint8_t> Hash) {
+ auto I = indexHash(Hash);
+ if (LLVM_UNLIKELY(!I))
+ return I.takeError();
+ return getExternalReference(*I);
+}
+
+ObjectID OnDiskGraphDB::getExternalReference(const IndexProxy &I) {
+ return getExternalReference(makeInternalRef(I.Offset));
+}
+
+std::optional<ObjectID>
+OnDiskGraphDB::getExistingReference(ArrayRef<uint8_t> Digest) {
+ auto tryUpstream =
+ [&](std::optional<IndexProxy> I) -> std::optional<ObjectID> {
+ if (!UpstreamDB)
+ return std::nullopt;
+ std::optional<ObjectID> UpstreamID =
+ UpstreamDB->getExistingReference(Digest);
+ if (LLVM_UNLIKELY(!UpstreamID))
+ return std::nullopt;
+ auto Ref = expectedToOptional(indexHash(Digest));
+ if (!Ref)
+ return std::nullopt;
+ if (!I)
+ I.emplace(*Ref);
+ return getExternalReference(*I);
+ };
+
+ OnDiskTrieRawHashMap::ConstOnDiskPtr P = Index.find(Digest);
+ if (!P)
+ return tryUpstream(std::nullopt);
+ IndexProxy I = getIndexProxyFromPointer(P);
+ TrieRecord::Data Obj = I.Ref.load();
+ if (Obj.SK == TrieRecord::StorageKind::Unknown)
+ return tryUpstream(I);
+ return getExternalReference(makeInternalRef(I.Offset));
+}
+
+Expected<OnDiskGraphDB::IndexProxy>
+OnDiskGraphDB::getIndexProxyFromRef(InternalRef Ref) const {
+ auto P = Index.recoverFromFileOffset(Ref.getFileOffset());
+ if (LLVM_UNLIKELY(!P))
+ return P.takeError();
+ return getIndexProxyFromPointer(*P);
+}
+
+Expected<ArrayRef<uint8_t>> OnDiskGraphDB::getDigest(InternalRef Ref) const {
+ auto I = getIndexProxyFromRef(Ref);
+ if (!I)
+ return I.takeError();
+ return I->Hash;
+}
+
+ArrayRef<uint8_t> OnDiskGraphDB::getDigest(const IndexProxy &I) const {
+ return I.Hash;
+}
+
+static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool,
+ ObjectHandle OH) {
+ // Decode ObjectHandle to locate the stored content.
+ uint64_t Data = OH.getOpaqueData();
+ if (Data & 1) {
+ const auto *SDIM =
+ reinterpret_cast<const StandaloneDataInMemory *>(Data & (-1ULL << 1));
+ return SDIM->getContent();
+ }
+
+ auto DataHandle =
+ cantFail(DataRecordHandle::getFromDataPool(DataPool, FileOffset(Data)));
+ assert(DataHandle.getData().end()[0] == 0 && "Null termination");
+ return OnDiskContent{DataHandle, std::nullopt};
+}
+
+ArrayRef<char> OnDiskGraphDB::getObjectData(ObjectHandle Node) const {
+ OnDiskContent Content = getContentFromHandle(DataPool, Node);
+ if (Content.Bytes)
+ return *Content.Bytes;
+ assert(Content.Record && "Expected record or bytes");
+ return Content.Record->getData();
+}
+
+InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
+ if (std::optional<DataRecordHandle> Record =
+ getContentFromHandle(DataPool, Node).Record)
+ return Record->getRefs();
+ return std::nullopt;
+}
+
+Expected<std::optional<ObjectHandle>>
+OnDiskGraphDB::load(ObjectID ExternalRef) {
+ InternalRef Ref = getInternalRef(ExternalRef);
+ auto I = getIndexProxyFromRef(Ref);
+ if (!I)
+ return I.takeError();
+ TrieRecord::Data Object = I->Ref.load();
+
+ if (Object.SK == TrieRecord::StorageKind::Unknown) {
+ if (!UpstreamDB)
+ return std::nullopt;
+ return faultInFromUpstream(ExternalRef);
+ }
+
+ if (Object.SK == TrieRecord::StorageKind::DataPool)
+ return ObjectHandle::fromFileOffset(Object.Offset);
+
+ // Only TrieRecord::StorageKind::Standalone (and variants) need to be
+ // explicitly loaded.
+ //
+ // There's corruption if standalone objects have offsets, or if we get here
+ // for something that isn't standalone.
+ if (Object.Offset)
+ return createCorruptObjectError(getDigest(*I));
+ switch (Object.SK) {
+ case TrieRecord::StorageKind::Unknown:
+ case TrieRecord::StorageKind::DataPool:
+ llvm_unreachable("unexpected storage kind");
+ case TrieRecord::StorageKind::Standalone:
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ break;
+ }
+
+ // Load it from disk.
+ //
+ // Note: Creation logic guarantees that data that needs null-termination is
+ // suitably 0-padded. Requiring null-termination here would be too expensive
+ // for extremely large objects that happen to be page-aligned.
+ SmallString<256> Path;
+ getStandalonePath(TrieRecord::getStandaloneFilePrefix(Object.SK), *I, Path);
+
+ auto File = sys::fs::openNativeFileForRead(Path);
+ if (!File)
+ return createFileError(Path, File.takeError());
+
+ auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(*File); });
+
+ sys::fs::file_status Status;
+ if (std::error_code EC = sys::fs::status(*File, Status))
+ return createCorruptObjectError(getDigest(*I));
+
+ std::error_code EC;
+ auto Region = std::make_unique<sys::fs::mapped_file_region>(
+ *File, sys::fs::mapped_file_region::readonly, Status.getSize(), 0, EC);
+ if (EC)
+ return createCorruptObjectError(getDigest(*I));
+
+ return ObjectHandle::fromMemory(
+ static_cast<StandaloneDataMapTy *>(StandaloneData)
+ ->insert(I->Hash, Object.SK, std::move(Region)));
+}
+
+Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) {
+ auto Presence = getObjectPresence(Ref, /*CheckUpstream=*/true);
+ if (!Presence)
+ return Presence.takeError();
+
+ switch (*Presence) {
+ case ObjectPresence::Missing:
+ return false;
+ case ObjectPresence::InPrimaryDB:
+ return true;
+ case ObjectPresence::OnlyInUpstreamDB:
+ if (auto FaultInResult = faultInFromUpstream(Ref); !FaultInResult)
+ return FaultInResult.takeError();
+ return true;
+ }
+}
+
+Expected<OnDiskGraphDB::ObjectPresence>
+OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef,
+ bool CheckUpstream) const {
+ InternalRef Ref = getInternalRef(ExternalRef);
+ auto I = getIndexProxyFromRef(Ref);
+ if (!I)
+ return I.takeError();
+
+ TrieRecord::Data Object = I->Ref.load();
+ if (Object.SK != TrieRecord::StorageKind::Unknown)
+ return ObjectPresence::InPrimaryDB;
+ if (!CheckUpstream || !UpstreamDB)
+ return ObjectPresence::Missing;
+ std::optional<ObjectID> UpstreamID =
+ UpstreamDB->getExistingReference(getDigest(*I));
+ return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB
+ : ObjectPresence::Missing;
+}
+
+InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) {
+ return InternalRef::getFromOffset(IndexOffset);
+}
+
+void OnDiskGraphDB::getStandalonePath(StringRef Prefix, const IndexProxy &I,
+ SmallVectorImpl<char> &Path) const {
+ Path.assign(RootPath.begin(), RootPath.end());
+ sys::path::append(Path,
+ Prefix + Twine(I.Offset.get()) + "." + CASFormatVersion);
+}
+
+OnDiskContent StandaloneDataInMemory::getContent() const {
+ bool Leaf0 = false;
+ bool Leaf = false;
+ switch (SK) {
+ default:
+ llvm_unreachable("Storage kind must be standalone");
+ case TrieRecord::StorageKind::Standalone:
+ break;
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ Leaf = Leaf0 = true;
+ break;
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ Leaf = true;
+ break;
+ }
+
+ if (Leaf) {
+ StringRef Data(Region->data(), Region->size());
+ assert(Data.drop_back(Leaf0).end()[0] == 0 &&
+ "Standalone node data missing null termination");
+ return OnDiskContent{std::nullopt,
+ arrayRefFromStringRef<char>(Data.drop_back(Leaf0))};
+ }
+
+ DataRecordHandle Record = DataRecordHandle::get(Region->data());
+ assert(Record.getData().end()[0] == 0 &&
+ "Standalone object record missing null termination for data");
+ return OnDiskContent{Record, std::nullopt};
+}
+
+static Expected<MappedTempFile> createTempFile(StringRef FinalPath,
+ uint64_t Size) {
+ assert(Size && "Unexpected request for an empty temp file");
+ Expected<TempFile> File = TempFile::create(FinalPath + ".%%%%%%");
+ if (!File)
+ return File.takeError();
+
+ if (Error E = preallocateFileTail(File->FD, 0, Size).takeError())
+ return createFileError(File->TmpName, std::move(E));
+
+ if (auto EC = sys::fs::resize_file_before_mapping_readwrite(File->FD, Size))
+ return createFileError(File->TmpName, EC);
+
+ std::error_code EC;
+ sys::fs::mapped_file_region Map(sys::fs::convertFDToNativeFile(File->FD),
+ sys::fs::mapped_file_region::readwrite, Size,
+ 0, EC);
+ if (EC)
+ return createFileError(File->TmpName, EC);
+ return MappedTempFile(std::move(*File), std::move(Map));
+}
+
+static size_t getPageSize() {
+ static int PageSize = sys::Process::getPageSizeEstimate();
+ return PageSize;
+}
+
+Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data) {
+ assert(Data.size() > TrieRecord::MaxEmbeddedSize &&
+ "Expected a bigger file for external content...");
+
+ bool Leaf0 = isAligned(Align(getPageSize()), Data.size());
+ TrieRecord::StorageKind SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
+ : TrieRecord::StorageKind::StandaloneLeaf;
+
+ SmallString<256> Path;
+ int64_t FileSize = Data.size() + Leaf0;
+ getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I, Path);
+
+ // Write the file. Don't reuse this mapped_file_region, which is read/write.
+ // Let load() pull up one that's read-only.
+ Expected<MappedTempFile> File = createTempFile(Path, FileSize);
+ if (!File)
+ return File.takeError();
+ assert(File->size() == (uint64_t)FileSize);
+ llvm::copy(Data, File->data());
+ if (Leaf0)
+ File->data()[Data.size()] = 0;
+ assert(File->data()[Data.size()] == 0);
+ if (Error E = File->keep(Path))
+ return E;
+
+ // Store the object reference.
+ TrieRecord::Data Existing;
+ {
+ TrieRecord::Data Leaf{SK, FileOffset()};
+ if (I.Ref.compare_exchange_strong(Existing, Leaf)) {
+ recordStandaloneSizeIncrease(FileSize);
+ return Error::success();
+ }
+ }
+
+ // If there was a race, confirm that the new value has valid storage.
+ if (Existing.SK == TrieRecord::StorageKind::Unknown)
+ return createCorruptObjectError(getDigest(I));
+
+ return Error::success();
+}
+
+Error OnDiskGraphDB::store(ObjectID ID, ArrayRef<ObjectID> Refs,
+ ArrayRef<char> Data) {
+ auto I = getIndexProxyFromRef(getInternalRef(ID));
+ if (LLVM_UNLIKELY(!I))
+ return I.takeError();
+
+ // Early return in case the node exists.
+ {
+ TrieRecord::Data Existing = I->Ref.load();
+ if (Existing.SK != TrieRecord::StorageKind::Unknown)
+ return Error::success();
+ }
+
+ // Big leaf nodes.
+ if (Refs.empty() && Data.size() > TrieRecord::MaxEmbeddedSize)
+ return createStandaloneLeaf(*I, Data);
+
+ // TODO: Check whether it's worth checking the index for an already existing
+ // object (like storeTreeImpl() does) before building up the
+ // InternalRefVector.
+ InternalRefVector InternalRefs;
+ for (ObjectID Ref : Refs)
+ InternalRefs.push_back(getInternalRef(Ref));
+
+ // Create the object.
+
+ DataRecordHandle::Input Input{InternalRefs, Data};
+
+ // Compute the storage kind, allocate it, and create the record.
+ TrieRecord::StorageKind SK = TrieRecord::StorageKind::Unknown;
+ FileOffset PoolOffset;
+ SmallString<256> Path;
+ std::optional<MappedTempFile> File;
+ std::optional<uint64_t> FileSize;
+ auto AllocStandaloneFile = [&](size_t Size) -> Expected<char *> {
+ getStandalonePath(TrieRecord::getStandaloneFilePrefix(
+ TrieRecord::StorageKind::Standalone),
+ *I, Path);
+ if (Error E = createTempFile(Path, Size).moveInto(File))
+ return std::move(E);
+ assert(File->size() == Size);
+ FileSize = Size;
+ SK = TrieRecord::StorageKind::Standalone;
+ return File->data();
+ };
+ auto Alloc = [&](size_t Size) -> Expected<char *> {
+ if (Size <= TrieRecord::MaxEmbeddedSize) {
+ SK = TrieRecord::StorageKind::DataPool;
+ auto P = DataPool.allocate(Size);
+ if (LLVM_UNLIKELY(!P)) {
+ char *NewAlloc = nullptr;
+ auto NewE = handleErrors(
+ P.takeError(), [&](std::unique_ptr<StringError> E) -> Error {
+ if (E->convertToErrorCode() == std::errc::not_enough_memory)
+ return AllocStandaloneFile(Size).moveInto(NewAlloc);
+ return Error(std::move(E));
+ });
+ if (!NewE)
+ return NewAlloc;
+ return std::move(NewE);
+ }
+ PoolOffset = P->getOffset();
+ LLVM_DEBUG({
+ dbgs() << "pool-alloc addr=" << (void *)PoolOffset.get()
+ << " size=" << Size
+ << " end=" << (void *)(PoolOffset.get() + Size) << "\n";
+ });
+ return (*P)->data();
+ }
+ return AllocStandaloneFile(Size);
+ };
+
+ DataRecordHandle Record;
+ if (Error E =
+ DataRecordHandle::createWithError(Alloc, Input).moveInto(Record))
+ return E;
+ assert(Record.getData().end()[0] == 0 && "Expected null-termination");
+ assert(Record.getData() == Input.Data && "Expected initialization");
+ assert(SK != TrieRecord::StorageKind::Unknown);
+ assert(bool(File) != bool(PoolOffset) &&
+ "Expected either a mapped file or a pooled offset");
+
+ // Check for a race before calling MappedTempFile::keep().
+ //
+ // Then decide what to do with the file. Better to discard than overwrite if
+ // another thread/process has already added this.
+ TrieRecord::Data Existing = I->Ref.load();
+ {
+ TrieRecord::Data NewObject{SK, PoolOffset};
+ if (File) {
+ if (Existing.SK == TrieRecord::StorageKind::Unknown) {
+ // Keep the file!
+ if (Error E = File->keep(Path))
+ return E;
+ } else {
+ File.reset();
+ }
+ }
+
+ // If we didn't already see a racing/existing write, then try storing the
+ // new object. If that races, confirm that the new value has valid storage.
+ //
+ // TODO: Find a way to reuse the storage from the new-but-abandoned record
+ // handle.
+ if (Existing.SK == TrieRecord::StorageKind::Unknown) {
+ if (I->Ref.compare_exchange_strong(Existing, NewObject)) {
+ if (FileSize)
+ recordStandaloneSizeIncrease(*FileSize);
+ return Error::success();
+ }
+ }
+ }
+
+ if (Existing.SK == TrieRecord::StorageKind::Unknown)
+ return createCorruptObjectError(getDigest(*I));
+
+ // Load existing object.
+ return Error::success();
+}
+
+void OnDiskGraphDB::recordStandaloneSizeIncrease(size_t SizeIncrease) {
+ standaloneStorageSize().fetch_add(SizeIncrease, std::memory_order_relaxed);
+}
+
+std::atomic<uint64_t> &OnDiskGraphDB::standaloneStorageSize() const {
+ MutableArrayRef<uint8_t> UserHeader = DataPool.getUserHeader();
+ assert(UserHeader.size() == sizeof(std::atomic<uint64_t>));
+ assert(isAddrAligned(Align(8), UserHeader.data()));
+ return *reinterpret_cast<std::atomic<uint64_t> *>(UserHeader.data());
+}
+
+uint64_t OnDiskGraphDB::getStandaloneStorageSize() const {
+ return standaloneStorageSize().load(std::memory_order_relaxed);
+}
+
+size_t OnDiskGraphDB::getStorageSize() const {
+ return Index.size() + DataPool.size() + getStandaloneStorageSize();
+}
+
+unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const {
+ unsigned IndexPercent = Index.size() * 100ULL / Index.capacity();
+ unsigned DataPercent = DataPool.size() * 100ULL / DataPool.capacity();
+ return std::max(IndexPercent, DataPercent);
+}
+
+Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open(
+ StringRef AbsPath, StringRef HashName, unsigned HashByteSize,
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) {
+ if (std::error_code EC = sys::fs::create_directories(AbsPath))
+ return createFileError(AbsPath, EC);
+
+ constexpr uint64_t MB = 1024ull * 1024ull;
+ constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+
+ uint64_t MaxIndexSize = 12 * GB;
+ uint64_t MaxDataPoolSize = 24 * GB;
+
+ if (useSmallMappingSize(AbsPath)) {
+ MaxIndexSize = 1 * GB;
+ MaxDataPoolSize = 2 * GB;
+ }
+
+ auto CustomSize = getOverriddenMaxMappingSize();
+ if (!CustomSize)
+ return CustomSize.takeError();
+ if (*CustomSize)
+ MaxIndexSize = MaxDataPoolSize = **CustomSize;
+
+ SmallString<256> IndexPath(AbsPath);
+ sys::path::append(IndexPath, IndexFilePrefix + CASFormatVersion);
+ std::optional<OnDiskTrieRawHashMap> Index;
+ if (Error E = OnDiskTrieRawHashMap::create(
+ IndexPath, IndexTableName + "[" + HashName + "]",
+ HashByteSize * CHAR_BIT,
+ /*DataSize=*/sizeof(TrieRecord), MaxIndexSize,
+ /*MinFileSize=*/MB)
+ .moveInto(Index))
+ return std::move(E);
+
+ uint32_t UserHeaderSize = sizeof(std::atomic<uint64_t>);
+
+ SmallString<256> DataPoolPath(AbsPath);
+ sys::path::append(DataPoolPath, DataPoolFilePrefix + CASFormatVersion);
+ std::optional<OnDiskDataAllocator> DataPool;
+ StringRef PolicyName =
+ Policy == FaultInPolicy::SingleNode ? "single" : "full";
+ if (Error E = OnDiskDataAllocator::create(
+ DataPoolPath,
+ DataPoolTableName + "[" + HashName + "]" + PolicyName,
+ MaxDataPoolSize, /*MinFileSize=*/MB, UserHeaderSize,
+ [](void *UserHeaderPtr) {
+ new (UserHeaderPtr) std::atomic<uint64_t>(0);
+ })
+ .moveInto(DataPool))
+ return std::move(E);
+ if (DataPool->getUserHeader().size() != UserHeaderSize)
+ return createStringError(llvm::errc::argument_out_of_domain,
+ "unexpected user header in '" + DataPoolPath +
+ "'");
+
+ return std::unique_ptr<OnDiskGraphDB>(
+ new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool),
+ std::move(UpstreamDB), Policy));
+}
+
+OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
+ OnDiskDataAllocator DataPool,
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+ FaultInPolicy Policy)
+ : Index(std::move(Index)), DataPool(std::move(DataPool)),
+ RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)),
+ FIPolicy(Policy) {
+ /// Lifetime for "big" objects not in DataPool.
+ ///
+ /// NOTE: Could use ThreadSafeTrieRawHashMap here. For now, doing something
+ /// simpler on the assumption there won't be much contention since most data
+ /// is not big. If there is contention, and we've already fixed ObjectProxy
+ /// object handles to be cheap enough to use consistently, the fix might be
+ /// to use better use of them rather than optimizing this map.
+ ///
+ /// FIXME: Figure out the right number of shards, if any.
+ StandaloneData = new StandaloneDataMapTy();
+}
+
+OnDiskGraphDB::~OnDiskGraphDB() {
+ delete static_cast<StandaloneDataMapTy *>(StandaloneData);
+}
+
+Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
+ ObjectHandle UpstreamNode) {
+ // Copies the full CAS tree from upstream. Uses depth-first copying to protect
+ // against the process dying during importing and leaving the database with an
+ // incomplete tree. Note that if the upstream has missing nodes then the tree
+ // will be copied with missing nodes as well, it won't be considered an error.
+
+ struct UpstreamCursor {
+ ObjectHandle Node;
+ size_t RefsCount;
+ object_refs_iterator RefI;
+ object_refs_iterator RefE;
+ };
+ /// Keeps track of the state of visitation for current node and all of its
+ /// parents.
+ SmallVector<UpstreamCursor, 16> CursorStack;
+ /// Keeps track of the currently visited nodes as they are imported into
+ /// primary database, from current node and its parents. When a node is
+ /// entered for visitation it appends its own ID, then appends referenced IDs
+ /// as they get imported. When a node is fully imported it removes the
+ /// referenced IDs from the bottom of the stack which leaves its own ID at the
+ /// bottom, adding to the list of referenced IDs for the parent node.
+ SmallVector<ObjectID, 128> PrimaryNodesStack;
+
+ auto enqueueNode = [&](ObjectID PrimaryID, std::optional<ObjectHandle> Node) {
+ PrimaryNodesStack.push_back(PrimaryID);
+ if (!Node)
+ return;
+ auto Refs = UpstreamDB->getObjectRefs(*Node);
+ CursorStack.push_back({*Node,
+ (size_t)std::distance(Refs.begin(), Refs.end()),
+ Refs.begin(), Refs.end()});
+ };
+
+ enqueueNode(PrimaryID, UpstreamNode);
+
+ while (!CursorStack.empty()) {
+ UpstreamCursor &Cur = CursorStack.back();
+ if (Cur.RefI == Cur.RefE) {
+ // Copy the node data into the primary store.
+ // FIXME: Use hard-link or cloning if the file-system supports it and data
+ // is stored into a separate file.
+
+ // The bottom of \p PrimaryNodesStack contains the primary ID for the
+ // current node plus the list of imported referenced IDs.
+ assert(PrimaryNodesStack.size() >= Cur.RefsCount + 1);
+ ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1);
+ auto PrimaryRefs = ArrayRef(PrimaryNodesStack)
+ .slice(PrimaryNodesStack.size() - Cur.RefsCount);
+ auto Data = UpstreamDB->getObjectData(Cur.Node);
+ if (Error E = store(PrimaryID, PrimaryRefs, Data))
+ return E;
+ // Remove the current node and its IDs from the stack.
+ PrimaryNodesStack.truncate(PrimaryNodesStack.size() - Cur.RefsCount);
+ CursorStack.pop_back();
+ continue;
+ }
+
+ ObjectID UpstreamID = *(Cur.RefI++);
+ auto PrimaryID = getReference(UpstreamDB->getDigest(UpstreamID));
+ if (LLVM_UNLIKELY(!PrimaryID))
+ return PrimaryID.takeError();
+ if (containsObject(*PrimaryID, /*CheckUpstream=*/false)) {
+ // This \p ObjectID already exists in the primary. Either it was imported
+ // via \p importFullTree or the client created it, in which case the
+ // client takes responsibility for how it was formed.
+ enqueueNode(*PrimaryID, std::nullopt);
+ continue;
+ }
+ Expected<std::optional<ObjectHandle>> UpstreamNode =
+ UpstreamDB->load(UpstreamID);
+ if (!UpstreamNode)
+ return UpstreamNode.takeError();
+ enqueueNode(*PrimaryID, *UpstreamNode);
+ }
+
+ assert(PrimaryNodesStack.size() == 1);
+ assert(PrimaryNodesStack.front() == PrimaryID);
+ return Error::success();
+}
+
+Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
+ ObjectHandle UpstreamNode) {
+ // Copies only a single node, it doesn't copy the referenced nodes.
+
+ // Copy the node data into the primary store.
+ // FIXME: Use hard-link or cloning if the file-system supports it and data is
+ // stored into a separate file.
+
+ auto Data = UpstreamDB->getObjectData(UpstreamNode);
+ auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode);
+ SmallVector<ObjectID, 64> Refs;
+ Refs.reserve(std::distance(UpstreamRefs.begin(), UpstreamRefs.end()));
+ for (ObjectID UpstreamRef : UpstreamRefs) {
+ auto Ref = getReference(UpstreamDB->getDigest(UpstreamRef));
+ if (LLVM_UNLIKELY(!Ref))
+ return Ref.takeError();
+ Refs.push_back(*Ref);
+ }
+
+ return store(PrimaryID, Refs, Data);
+}
+
+Expected<std::optional<ObjectHandle>>
+OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) {
+ assert(UpstreamDB);
+
+ auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID));
+ if (LLVM_UNLIKELY(!UpstreamID))
+ return UpstreamID.takeError();
+
+ Expected<std::optional<ObjectHandle>> UpstreamNode =
+ UpstreamDB->load(*UpstreamID);
+ if (!UpstreamNode)
+ return UpstreamNode.takeError();
+ if (!*UpstreamNode)
+ return std::nullopt;
+
+ if (Error E = FIPolicy == FaultInPolicy::SingleNode
+ ? importSingleNode(PrimaryID, **UpstreamNode)
+ : importFullTree(PrimaryID, **UpstreamNode))
+ return std::move(E);
+ return load(PrimaryID);
+}
diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
new file mode 100644
index 0000000..2186071
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
@@ -0,0 +1,113 @@
+//===- OnDiskKeyValueDB.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements OnDiskKeyValueDB, an ondisk key value database.
+///
+/// The KeyValue database file is named `actions.<version>` inside the CAS
+/// directory. The database stores a mapping between a fixed-sized key and a
+/// fixed-sized value, where the size of key and value can be configured when
+/// opening the database.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+static constexpr StringLiteral ActionCacheFile = "actions.";
+
+Expected<ArrayRef<char>> OnDiskKeyValueDB::put(ArrayRef<uint8_t> Key,
+ ArrayRef<char> Value) {
+ if (LLVM_UNLIKELY(Value.size() != ValueSize))
+ return createStringError(errc::invalid_argument,
+ "expected value size of " + itostr(ValueSize) +
+ ", got: " + itostr(Value.size()));
+ assert(Value.size() == ValueSize);
+ auto ActionP = Cache.insertLazy(
+ Key, [&](FileOffset TentativeOffset,
+ OnDiskTrieRawHashMap::ValueProxy TentativeValue) {
+ assert(TentativeValue.Data.size() == ValueSize);
+ llvm::copy(Value, TentativeValue.Data.data());
+ });
+ if (LLVM_UNLIKELY(!ActionP))
+ return ActionP.takeError();
+ return (*ActionP)->Data;
+}
+
+Expected<std::optional<ArrayRef<char>>>
+OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) {
+ // Check the result cache.
+ OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key);
+ if (!ActionP)
+ return std::nullopt;
+ assert(isAddrAligned(Align(8), ActionP->Data.data()));
+ return ActionP->Data;
+}
+
+Expected<std::unique_ptr<OnDiskKeyValueDB>>
+OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize,
+ StringRef ValueName, size_t ValueSize) {
+ if (std::error_code EC = sys::fs::create_directories(Path))
+ return createFileError(Path, EC);
+
+ SmallString<256> CachePath(Path);
+ sys::path::append(CachePath, ActionCacheFile + CASFormatVersion);
+ constexpr uint64_t MB = 1024ull * 1024ull;
+ constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+
+ uint64_t MaxFileSize = GB;
+ auto CustomSize = getOverriddenMaxMappingSize();
+ if (!CustomSize)
+ return CustomSize.takeError();
+ if (*CustomSize)
+ MaxFileSize = **CustomSize;
+
+ std::optional<OnDiskTrieRawHashMap> ActionCache;
+ if (Error E = OnDiskTrieRawHashMap::create(
+ CachePath,
+ "llvm.actioncache[" + HashName + "->" + ValueName + "]",
+ KeySize * 8,
+ /*DataSize=*/ValueSize, MaxFileSize, /*MinFileSize=*/MB)
+ .moveInto(ActionCache))
+ return std::move(E);
+
+ return std::unique_ptr<OnDiskKeyValueDB>(
+ new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache)));
+}
+
+Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const {
+ return Cache.validate(
+ [&](FileOffset Offset,
+ OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error {
+ auto formatError = [&](Twine Msg) {
+ return createStringError(
+ llvm::errc::illegal_byte_sequence,
+ "bad cache value at 0x" +
+ utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " +
+ Msg.str());
+ };
+
+ if (Record.Data.size() != ValueSize)
+ return formatError("wrong cache value size");
+ if (!isAddrAligned(Align(8), Record.Data.data()))
+ return formatError("wrong cache value alignment");
+ if (CheckValue)
+ return CheckValue(Offset, Record.Data);
+ return Error::success();
+ });
+}