diff options
Diffstat (limited to 'llvm/lib/CAS')
| -rw-r--r-- | llvm/lib/CAS/ActionCaches.cpp | 166 | ||||
| -rw-r--r-- | llvm/lib/CAS/BuiltinCAS.cpp | 14 | ||||
| -rw-r--r-- | llvm/lib/CAS/BuiltinCAS.h | 25 | ||||
| -rw-r--r-- | llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp | 38 | ||||
| -rw-r--r-- | llvm/lib/CAS/CMakeLists.txt | 3 | ||||
| -rw-r--r-- | llvm/lib/CAS/InMemoryCAS.cpp | 8 | ||||
| -rw-r--r-- | llvm/lib/CAS/ObjectStore.cpp | 93 | ||||
| -rw-r--r-- | llvm/lib/CAS/OnDiskCAS.cpp | 211 | ||||
| -rw-r--r-- | llvm/lib/CAS/OnDiskGraphDB.cpp | 34 | ||||
| -rw-r--r-- | llvm/lib/CAS/OnDiskKeyValueDB.cpp | 21 | ||||
| -rw-r--r-- | llvm/lib/CAS/UnifiedOnDiskCache.cpp | 613 | 
11 files changed, 1198 insertions, 28 deletions
diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp index 571c5b3..003c850 100644 --- a/llvm/lib/CAS/ActionCaches.cpp +++ b/llvm/lib/CAS/ActionCaches.cpp @@ -13,7 +13,11 @@  #include "BuiltinCAS.h"  #include "llvm/ADT/TrieRawHashMap.h"  #include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h"  #include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Errc.h"  #define DEBUG_TYPE "cas-action-caches" @@ -47,12 +51,54 @@ public:    Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,                                           bool CanBeDistributed) const final; +  Error validate() const final { +    return createStringError("InMemoryActionCache doesn't support validate()"); +  } +  private:    using DataT = CacheEntry<sizeof(HashType)>;    using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>;    InMemoryCacheT Cache;  }; + +/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB. +class OnDiskActionCache final : public ActionCache { +public: +  Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, +                bool CanBeDistributed) final; +  Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, +                                         bool CanBeDistributed) const final; + +  static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + +  Error validate() const final; + +private: +  static StringRef getHashName() { return "BLAKE3"; } + +  OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + +  std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; +  using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: +  Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, +                bool CanBeDistributed) final; +  Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, +                                         bool CanBeDistributed) const final; + +  UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +  Error validate() const final; + +private: +  std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +};  } // end namespace  static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() {  }  } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( +    std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) +    : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), +      DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { +  std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; +  if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), +                                               sizeof(HashType), getHashName(), +                                               sizeof(DataT)) +                    .moveInto(DB)) +    return std::move(E); +  return std::unique_ptr<OnDiskActionCache>( +      new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> +OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, +                           bool /*CanBeDistributed*/) const { +  std::optional<ArrayRef<char>> Val; +  if (Error E = DB->get(Key).moveInto(Val)) +    return std::move(E); +  if (!Val) +    return std::nullopt; +  return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, +                                 bool /*CanBeDistributed*/) { +  auto ResultHash = Result.getHash(); +  ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); +  ArrayRef<char> Observed; +  if (Error E = DB->put(Key, Expected).moveInto(Observed)) +    return E; + +  if (Expected == Observed) +    return Error::success(); + +  return createResultCachePoisonedError( +      Key, getContext(), Result, +      ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { +  // FIXME: without the matching CAS there is nothing we can check about the +  // cached values. The hash size is already validated by the DB validator. +  return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( +    std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) +    : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), +      UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, +                                  bool /*CanBeDistributed*/) const { +  std::optional<ArrayRef<char>> Val; +  if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) +    return std::move(E); +  if (!Val) +    return std::nullopt; +  auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); +  return CASID::create(&getContext(), +                       toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, +                                        const CASID &Result, +                                        bool /*CanBeDistributed*/) { +  auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); +  if (LLVM_UNLIKELY(!Expected)) +    return Expected.takeError(); + +  auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); +  std::optional<ArrayRef<char>> Observed; +  if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) +    return E; + +  auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); +  if (*Expected == ObservedID) +    return Error::success(); + +  return createResultCachePoisonedError( +      Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { +  auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { +    auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); +    auto formatError = [&](Twine Msg) { +      return createStringError( +          llvm::errc::illegal_byte_sequence, +          "bad record at 0x" + +              utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + +              Msg.str()); +    }; +    if (ID.getOpaqueData() == 0) +      return formatError("zero is not a valid ref"); +    return Error::success(); +  }; +  return UniDB->getKeyValueDB().validate(ValidateRef); +} + +Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS +  return OnDiskActionCache::create(Path); +#else +  return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( +    std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { +  return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad..e9bc6d8 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@  #include "BuiltinCAS.h"  #include "llvm/ADT/StringExtras.h"  #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h"  #include "llvm/Support/Process.h"  using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs,                     Refs, Data);  } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) {    auto Ref = getReference(ID);    if (!Ref)      return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) {    return Error::success();  } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS +  return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, +                                          BuiltinCASContext::getHashName(), +                                          sizeof(HashType)); +#else +  return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d..4d2de66 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===//  //  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.  // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@  namespace llvm::cas {  class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk  namespace builtin {  /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ public:                               "corrupt storage");    } -  Error validate(const CASID &ID) final; +  Error validateObject(const CASID &ID) final;  }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( +    std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( +    std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; +  } // end namespace builtin  } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000..f3f6fa0 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { +  std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +  if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) +    return std::move(E); +  auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); +  auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); +  return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( +    StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, +    std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS +  return ondisk::UnifiedOnDiskCache::validateIfNeeded( +      Path, builtin::BuiltinCASContext::getHashName(), +      sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, +      LLVMCasBinary); +#else +  return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49..aad77dc 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS    ActionCache.cpp    ActionCaches.cpp    BuiltinCAS.cpp +  BuiltinUnifiedCASDatabases.cpp    DatabaseFile.cpp    InMemoryCAS.cpp    MappedFileRegionArena.cpp    ObjectStore.cpp +  OnDiskCAS.cpp    OnDiskCommon.cpp    OnDiskDataAllocator.cpp    OnDiskGraphDB.cpp    OnDiskKeyValueDB.cpp    OnDiskTrieRawHashMap.cpp +  UnifiedOnDiskCache.cpp    ADDITIONAL_HEADER_DIRS    ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70d..2d4eedd 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ public:      return cast<InMemoryObject>(asInMemoryObject(Node)).getData();    } +  void print(raw_ostream &OS) const final; + +  Error validate(bool CheckHash) const final { +    return createStringError("InMemoryCAS doesn't support validate()"); +  } +    InMemoryCAS() = default;  private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const {    return cast<InMemoryInlineObject>(this)->getRefsImpl();  } +void InMemoryCAS::print(raw_ostream &OS) const {} +  Expected<ObjectRef>  InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash,                                             sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50b..3110577 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===//  //  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.  // See https://llvm.org/LICENSE.txt for license information. @@ -12,7 +12,7 @@  #include "llvm/Support/Errc.h"  #include "llvm/Support/FileSystem.h"  #include "llvm/Support/MemoryBuffer.h" -#include <optional> +#include <deque>  using namespace llvm;  using namespace llvm::cas; @@ -21,6 +21,7 @@ void CASContext::anchor() {}  void ObjectStore::anchor() {}  LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); }  LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); }  LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } @@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) {      auto [I, Inserted] = ValidatedRefs.insert(Ref);      if (!Inserted)        continue; // already validated. -    if (Error E = validate(getID(Ref))) +    if (Error E = validateObject(getID(Ref)))        return E;      Expected<ObjectHandle> Obj = load(Ref);      if (!Obj) @@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) {    return Error::success();  } +Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream, +                                              ObjectRef Other) { +  // Copy the full CAS tree from upstream with depth-first ordering to ensure +  // all the child nodes are available in downstream CAS before inserting +  // current object. This uses a similar algorithm as +  // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema +  // so it can be used to import from any other ObjectStore reguardless of the +  // CAS schema. + +  // There is no work to do if importing from self. +  if (this == &Upstream) +    return Other; + +  /// Keeps track of the state of visitation for current node and all of its +  /// parents. Upstream Cursor holds information only from upstream CAS. +  struct UpstreamCursor { +    ObjectRef Ref; +    ObjectHandle Node; +    size_t RefsCount; +    std::deque<ObjectRef> Refs; +  }; +  SmallVector<UpstreamCursor, 16> CursorStack; +  /// PrimaryNodeStack holds the ObjectRef of the current CAS, with nodes either +  /// just stored in the CAS or nodes already exists in the current CAS. +  SmallVector<ObjectRef, 128> PrimaryRefStack; +  /// A map from upstream ObjectRef to current ObjectRef. +  llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects; + +  auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) { +    unsigned NumRefs = Upstream.getNumRefs(Node); +    std::deque<ObjectRef> Refs; +    for (unsigned I = 0; I < NumRefs; ++I) +      Refs.push_back(Upstream.readRef(Node, I)); + +    CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)}); +  }; + +  auto UpstreamHandle = Upstream.load(Other); +  if (!UpstreamHandle) +    return UpstreamHandle.takeError(); +  enqueueNode(Other, *UpstreamHandle); + +  while (!CursorStack.empty()) { +    UpstreamCursor &Cur = CursorStack.back(); +    if (Cur.Refs.empty()) { +      // Copy the node data into the primary store. +      // The bottom of \p PrimaryRefStack contains the ObjectRef for the +      // current node. +      assert(PrimaryRefStack.size() >= Cur.RefsCount); +      auto Refs = ArrayRef(PrimaryRefStack) +                      .slice(PrimaryRefStack.size() - Cur.RefsCount); +      auto NewNode = store(Refs, Upstream.getData(Cur.Node)); +      if (!NewNode) +        return NewNode.takeError(); + +      // Remove the current node and its IDs from the stack. +      PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); +      CursorStack.pop_back(); + +      PrimaryRefStack.push_back(*NewNode); +      CreatedObjects.try_emplace(Cur.Ref, *NewNode); +      continue; +    } + +    // Check if the node exists already. +    auto CurrentID = Cur.Refs.front(); +    Cur.Refs.pop_front(); +    auto Ref = CreatedObjects.find(CurrentID); +    if (Ref != CreatedObjects.end()) { +      // If exists already, just need to enqueue the primary node. +      PrimaryRefStack.push_back(Ref->second); +      continue; +    } + +    // Load child. +    auto PrimaryID = Upstream.load(CurrentID); +    if (LLVM_UNLIKELY(!PrimaryID)) +      return PrimaryID.takeError(); + +    enqueueNode(CurrentID, *PrimaryID); +  } + +  assert(PrimaryRefStack.size() == 1); +  return PrimaryRefStack.front(); +} +  std::unique_ptr<MemoryBuffer>  ObjectProxy::getMemoryBuffer(StringRef Name,                               bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000..7d29f44 --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: +  Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, +                                ArrayRef<ObjectRef> Refs, +                                ArrayRef<char> Data) final; + +  Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + +  CASID getID(ObjectRef Ref) const final; + +  std::optional<ObjectRef> getReference(const CASID &ID) const final; + +  Expected<bool> isMaterialized(ObjectRef Ref) const final; + +  ArrayRef<char> getDataConst(ObjectHandle Node) const final; + +  void print(raw_ostream &OS) const final; +  Error validate(bool CheckHash) const final; + +  static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + +  OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) +      : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: +  ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { +    return makeObjectHandle(Node.getOpaqueData()); +  } + +  ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { +    return ondisk::ObjectHandle(Node.getInternalRef(*this)); +  } + +  ObjectRef convertRef(ondisk::ObjectID Ref) const { +    return makeObjectRef(Ref.getOpaqueData()); +  } + +  ondisk::ObjectID convertRef(ObjectRef Ref) const { +    return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); +  } + +  size_t getNumRefs(ObjectHandle Node) const final { +    auto RefsRange = DB->getObjectRefs(convertHandle(Node)); +    return std::distance(RefsRange.begin(), RefsRange.end()); +  } + +  ObjectRef readRef(ObjectHandle Node, size_t I) const final { +    auto RefsRange = DB->getObjectRefs(convertHandle(Node)); +    return convertRef(RefsRange.begin()[I]); +  } + +  Error forEachRef(ObjectHandle Node, +                   function_ref<Error(ObjectRef)> Callback) const final; + +  Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; +  Expected<std::optional<uint64_t>> getStorageSize() const final; +  Error pruneStorageData() final; + +  OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) +      : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) {} + +  std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; +  std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; +  ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { +  auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, +                   SmallVectorImpl<uint8_t> &Result) { +    auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( +        Refs, Data); +    Result.assign(Hash.begin(), Hash.end()); +  }; + +  if (auto E = DB->validate(CheckHash, Hasher)) +    return E; + +  return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { +  ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref)); +  return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { +  std::optional<ondisk::ObjectID> ObjID = +      DB->getExistingReference(ID.getHash()); +  if (!ObjID) +    return std::nullopt; +  return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { +  return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { +  return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { +  Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = +      DB->load(convertRef(ExternalRef)); +  if (!ObjHnd) +    return ObjHnd.takeError(); +  if (!*ObjHnd) +    return std::nullopt; +  return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, +                                         ArrayRef<ObjectRef> Refs, +                                         ArrayRef<char> Data) { +  SmallVector<ondisk::ObjectID, 64> IDs; +  IDs.reserve(Refs.size()); +  for (ObjectRef Ref : Refs) { +    IDs.push_back(convertRef(Ref)); +  } + +  auto StoredID = DB->getReference(ComputedHash); +  if (LLVM_UNLIKELY(!StoredID)) +    return StoredID.takeError(); +  if (Error E = DB->store(*StoredID, IDs, Data)) +    return std::move(E); +  return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, +                            function_ref<Error(ObjectRef)> Callback) const { +  auto RefsRange = DB->getObjectRefs(convertHandle(Node)); +  for (ondisk::ObjectID Ref : RefsRange) { +    if (Error E = Callback(convertRef(Ref))) +      return E; +  } +  return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { +  UnifiedDB->setSizeLimit(SizeLimit); +  return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { +  return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + +Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { +  Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = +      ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), +                                  sizeof(HashType)); +  if (!DB) +    return DB.takeError(); +  return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS +  return true; +#else +  return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS +  // FIXME: An absolute path isn't really good enough. Should open a directory +  // and use openat() for files underneath. +  SmallString<256> AbsPath; +  Path.toVector(AbsPath); +  sys::fs::make_absolute(AbsPath); + +  return OnDiskCAS::open(AbsPath); +#else +  return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( +    std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { +  return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9d..245b6fb 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const {  }  Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { +  if (UpstreamDB) { +    if (auto E = UpstreamDB->validate(Deep, Hasher)) +      return E; +  }    return Index.validate([&](FileOffset Offset,                              OnDiskTrieRawHashMap::ConstValueProxy Record)                              -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) {      return I.takeError();    TrieRecord::Data Object = I->Ref.load(); -  if (Object.SK == TrieRecord::StorageKind::Unknown) { -    if (!UpstreamDB) -      return std::nullopt; +  if (Object.SK == TrieRecord::StorageKind::Unknown)      return faultInFromUpstream(ExternalRef); -  }    if (Object.SK == TrieRecord::StorageKind::DataPool)      return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef,    TrieRecord::Data Object = I->Ref.load();    if (Object.SK != TrieRecord::StorageKind::Unknown)      return ObjectPresence::InPrimaryDB; +    if (!CheckUpstream || !UpstreamDB)      return ObjectPresence::Missing; +    std::optional<ObjectID> UpstreamID =        UpstreamDB->getExistingReference(getDigest(*I));    return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const {    return std::max(IndexPercent, DataPercent);  } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( -    StringRef AbsPath, StringRef HashName, unsigned HashByteSize, -    std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, +                    unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, +                    FaultInPolicy Policy) {    if (std::error_code EC = sys::fs::create_directories(AbsPath))      return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open(                               "unexpected user header in '" + DataPoolPath +                                   "'"); -  return std::unique_ptr<OnDiskGraphDB>( -      new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), -                        std::move(UpstreamDB), Policy)); +  return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( +      AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy));  }  OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,                               OnDiskDataAllocator DataPool, -                             std::unique_ptr<OnDiskGraphDB> UpstreamDB, -                             FaultInPolicy Policy) +                             OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy)      : Index(std::move(Index)), DataPool(std::move(DataPool)), -      RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), -      FIPolicy(Policy) { +      RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) {    /// Lifetime for "big" objects not in DataPool.    ///    /// NOTE: Could use ThreadSafeTrieRawHashMap here. For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,    // against the process dying during importing and leaving the database with an    // incomplete tree. Note that if the upstream has missing nodes then the tree    // will be copied with missing nodes as well, it won't be considered an error. -    struct UpstreamCursor {      ObjectHandle Node;      size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,    // Copy the node data into the primary store.    // FIXME: Use hard-link or cloning if the file-system supports it and data is    // stored into a separate file. -    auto Data = UpstreamDB->getObjectData(UpstreamNode);    auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode);    SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,  Expected<std::optional<ObjectHandle>>  OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { -  assert(UpstreamDB); +  if (!UpstreamDB) +    return std::nullopt;    auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID));    if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 2186071..15656cb 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@  #include "llvm/CAS/OnDiskKeyValueDB.h"  #include "OnDiskCommon.h"  #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h"  #include "llvm/Support/Alignment.h"  #include "llvm/Support/Compiler.h"  #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>>  OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) {    // Check the result cache.    OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); -  if (!ActionP) +  if (ActionP) { +    assert(isAddrAligned(Align(8), ActionP->Data.data())); +    return ActionP->Data; +  } +  if (!UnifiedCache || !UnifiedCache->UpstreamKVDB)      return std::nullopt; -  assert(isAddrAligned(Align(8), ActionP->Data.data())); -  return ActionP->Data; + +  // Try to fault in from upstream. +  return UnifiedCache->faultInFromUpstreamKV(Key);  }  Expected<std::unique_ptr<OnDiskKeyValueDB>>  OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, -                       StringRef ValueName, size_t ValueSize) { +                       StringRef ValueName, size_t ValueSize, +                       UnifiedOnDiskCache *Cache) {    if (std::error_code EC = sys::fs::create_directories(Path))      return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize,      return std::move(E);    return std::unique_ptr<OnDiskKeyValueDB>( -      new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); +      new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache));  }  Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { +  if (UnifiedCache && UnifiedCache->UpstreamKVDB) { +    if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) +      return E; +  }    return Cache.validate(        [&](FileOffset Offset,            OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000..ae9d818 --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,613 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one +/// directory while also restricting storage growth with a scheme of chaining +/// the two most recent directories (primary & upstream), where the primary +/// "faults-in" data from the upstream one. When the primary (most recent) +/// directory exceeds its intended limit a new empty directory becomes the +/// primary one. +/// +/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open +/// receives) there are directories named like this: +/// +/// 'v<version>.<x>' +/// 'v<version>.<x+1>' +/// 'v<version>.<x+2>' +/// ... +/// +/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and +/// the part after the dot is an increasing integer. The primary directory is +/// the one with the highest integer and the upstream one is the directory +/// before it. For example, if the sub-directories contained are: +/// +/// 'v1.5', 'v1.6', 'v1.7', 'v1.8' +/// +/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are +/// unused directories that can be safely deleted at any time and by any +/// process. +/// +/// Contained within the top-level directory is a file named "lock" which is +/// used for processes to take shared or exclusive locks for the contents of the +/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock +/// for the top-level directory; when it closes, if the primary sub-directory +/// exceeded its limit, it attempts to get an exclusive lock in order to create +/// a new empty primary directory; if it can't get the exclusive lock it gives +/// up and lets the next \p UnifiedOnDiskCache instance that closes to attempt +/// again. +/// +/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +/// directory, by any process, the storage size in that directory will keep +/// growing unrestricted. But the major benefit is that garbage-collection can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers in the same process or other +/// processes. +/// +/// The \c UnifiedOnDiskCache also provides validation and recovery on top of +/// the underlying on-disk storage. The low-level storage is designed to remain +/// coherent across regular process crashes, but may be invalid after power loss +/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows +/// validating the contents once per boot and can recover by marking invalid +/// data for garbage collection. +/// +/// The data recovery described above requires exclusive access to the CAS, and +/// it is an error to attempt recovery if the CAS is open in any process/thread. +/// In order to maximize backwards compatibility with tools that do not perform +/// validation before opening the CAS, we do not attempt to get exclusive access +/// until recovery is actually performed, meaning as long as the data is valid +/// it will not conflict with concurrent use. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "BuiltinCAS.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include <optional> + +#if __has_include(<sys/sysctl.h>) +#include <sys/sysctl.h> +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. +static constexpr StringLiteral DBDirPrefix = "v1."; + +static constexpr StringLiteral ValidationFilename = "v1.validation"; +static constexpr StringLiteral CorruptPrefix = "corrupt."; + +ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) { +  // little endian encoded. +  assert(Value.size() == sizeof(uint64_t)); +  return ObjectID::fromOpaqueData(support::endian::read64le(Value.data())); +} + +UnifiedOnDiskCache::ValueBytes +UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) { +  // little endian encoded. +  UnifiedOnDiskCache::ValueBytes ValBytes; +  static_assert(ValBytes.size() == sizeof(ID.getOpaqueData())); +  support::endian::write64le(ValBytes.data(), ID.getOpaqueData()); +  return ValBytes; +} + +Expected<std::optional<ArrayRef<char>>> +UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) { +  assert(UpstreamGraphDB); +  assert(UpstreamKVDB); + +  std::optional<ArrayRef<char>> UpstreamValue; +  if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue)) +    return std::move(E); +  if (!UpstreamValue) +    return std::nullopt; + +  // The value is the \p ObjectID in the context of the upstream +  // \p OnDiskGraphDB instance. Translate it to the context of the primary +  // \p OnDiskGraphDB instance. +  ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue); +  auto PrimaryID = +      PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID)); +  if (LLVM_UNLIKELY(!PrimaryID)) +    return PrimaryID.takeError(); +  return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID)); +} + +/// \returns all the 'v<version>.<x>' names of sub-directories, sorted with +/// ascending order of the integer after the dot. Corrupt directories, if +/// included, will come first. +static Expected<SmallVector<std::string, 4>> +getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) { +  struct DBDir { +    uint64_t Order; +    std::string Name; +  }; +  SmallVector<DBDir> FoundDBDirs; + +  std::error_code EC; +  for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; +       DirI.increment(EC)) { +    if (DirI->type() != sys::fs::file_type::directory_file) +      continue; +    StringRef SubDir = sys::path::filename(DirI->path()); +    if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) { +      FoundDBDirs.push_back({0, std::string(SubDir)}); +      continue; +    } +    if (!SubDir.starts_with(DBDirPrefix)) +      continue; +    uint64_t Order; +    if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order)) +      return createStringError(inconvertibleErrorCode(), +                               "unexpected directory " + DirI->path()); +    FoundDBDirs.push_back({Order, std::string(SubDir)}); +  } +  if (EC) +    return createFileError(Path, EC); + +  llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool { +    return LHS.Order <= RHS.Order; +  }); + +  SmallVector<std::string, 4> DBDirs; +  for (DBDir &Dir : FoundDBDirs) +    DBDirs.push_back(std::move(Dir.Name)); +  return DBDirs; +} + +static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) { +  auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true); +  if (!DBDirs) +    return DBDirs.takeError(); + +  // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure +  // out how to handle the leftover sub-directories of the previous version. + +  for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) { +    StringRef Back(DBDirs->back()); +    if (Back.starts_with(CorruptPrefix)) +      break; +    DBDirs->pop_back(); +  } +  return *DBDirs; +} + +/// \returns Given a sub-directory named 'v<version>.<x>', it outputs the +/// 'v<version>.<x+1>' name. +static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) { +  assert(DBDir.starts_with(DBDirPrefix)); +  uint64_t Count; +  bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count); +  assert(!Failed); +  (void)Failed; +  OS << DBDirPrefix << Count + 1; +} + +static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath, +                                  bool CheckHash) { +  SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"}; +  if (CheckHash) +    Args.push_back("-check-hash"); + +  llvm::SmallString<128> StdErrPath; +  int StdErrFD = -1; +  if (std::error_code EC = sys::fs::createTemporaryFile( +          "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath, +          llvm::sys::fs::OF_Text)) +    return createStringError(EC, "failed to create temporary file"); +  FileRemover OutputRemover(StdErrPath.c_str()); + +  std::optional<llvm::StringRef> Redirects[] = { +      {""}, // stdin = /dev/null +      {""}, // stdout = /dev/null +      StdErrPath.str(), +  }; + +  std::string ErrMsg; +  int Result = +      sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects, +                          /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg); + +  if (Result == -1) +    return createStringError("failed to exec " + join(Args, " ") + ": " + +                             ErrMsg); +  if (Result != 0) { +    llvm::SmallString<64> Err("cas contents invalid"); +    if (!ErrMsg.empty()) { +      Err += ": "; +      Err += ErrMsg; +    } +    auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str()); +    if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) { +      Err += ": "; +      Err += (*StdErrBuf)->getBuffer(); +    } +    return createStringError(Err); +  } +  return Error::success(); +} + +static Error validateInProcess(StringRef RootPath, StringRef HashName, +                               unsigned HashByteSize, bool CheckHash) { +  std::shared_ptr<UnifiedOnDiskCache> UniDB; +  if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName, +                                         HashByteSize) +                    .moveInto(UniDB)) +    return E; +  auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); +  if (Error E = CAS->validate(CheckHash)) +    return E; +  auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB); +  if (Error E = Cache->validate()) +    return E; +  return Error::success(); +} + +static Expected<uint64_t> getBootTime() { +#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME) +  struct timeval TV; +  size_t TVLen = sizeof(TV); +  int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME}; +  if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0) +    return createStringError(llvm::errnoAsErrorCode(), +                             "failed to get boottime"); +  if (TVLen != sizeof(TV)) +    return createStringError("sysctl kern.boottime unexpected format"); +  return TV.tv_sec; +#elif defined(__linux__) +  // Use the mtime for /proc, which is recreated during system boot. +  // We could also read /proc/stat and search for 'btime'. +  sys::fs::file_status Status; +  if (std::error_code EC = sys::fs::status("/proc", Status)) +    return createFileError("/proc", EC); +  return Status.getLastModificationTime().time_since_epoch().count(); +#else +  llvm::report_fatal_error("getBootTime unimplemented"); +#endif +} + +Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded( +    StringRef RootPath, StringRef HashName, unsigned HashByteSize, +    bool CheckHash, bool AllowRecovery, bool ForceValidation, +    std::optional<StringRef> LLVMCasBinaryPath) { +  if (std::error_code EC = sys::fs::create_directories(RootPath)) +    return createFileError(RootPath, EC); + +  SmallString<256> PathBuf(RootPath); +  sys::path::append(PathBuf, ValidationFilename); +  int FD = -1; +  if (std::error_code EC = sys::fs::openFileForReadWrite( +          PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) +    return createFileError(PathBuf, EC); +  assert(FD != -1); + +  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD); +  auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); }); + +  if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive)) +    return createFileError(PathBuf, EC); +  auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); }); + +  SmallString<8> Bytes; +  if (Error E = sys::fs::readNativeFileToEOF(File, Bytes)) +    return createFileError(PathBuf, std::move(E)); + +  uint64_t ValidationBootTime = 0; +  if (!Bytes.empty() && +      StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime)) +    return createFileError(PathBuf, errc::illegal_byte_sequence, +                           "expected integer"); + +  static uint64_t BootTime = 0; +  if (BootTime == 0) +    if (Error E = getBootTime().moveInto(BootTime)) +      return std::move(E); + +  std::string LogValidationError; + +  if (ValidationBootTime == BootTime && !ForceValidation) +    return ValidationResult::Skipped; + +  // Validate! +  bool NeedsRecovery = false; +  if (Error E = +          LLVMCasBinaryPath +              ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash) +              : validateInProcess(RootPath, HashName, HashByteSize, +                                  CheckHash)) { +    if (AllowRecovery) { +      consumeError(std::move(E)); +      NeedsRecovery = true; +    } else { +      return std::move(E); +    } +  } + +  if (NeedsRecovery) { +    sys::path::remove_filename(PathBuf); +    sys::path::append(PathBuf, "lock"); + +    int LockFD = -1; +    if (std::error_code EC = sys::fs::openFileForReadWrite( +            PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) +      return createFileError(PathBuf, EC); +    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); +    auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); }); +    if (std::error_code EC = tryLockFileThreadSafe(LockFD)) { +      if (EC == std::errc::no_lock_available) +        return createFileError( +            PathBuf, EC, +            "CAS validation requires exclusive access but CAS was in use"); +      return createFileError(PathBuf, EC); +    } +    auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + +    auto DBDirs = getAllDBDirs(RootPath); +    if (!DBDirs) +      return DBDirs.takeError(); + +    for (StringRef DBDir : *DBDirs) { +      sys::path::remove_filename(PathBuf); +      sys::path::append(PathBuf, DBDir); +      std::error_code EC; +      int Attempt = 0, MaxAttempts = 100; +      SmallString<128> GCPath; +      for (; Attempt < MaxAttempts; ++Attempt) { +        GCPath.assign(RootPath); +        sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) + +                                      "." + DBDir); +        EC = sys::fs::rename(PathBuf, GCPath); +        // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST. +        if (EC != errc::directory_not_empty && EC != errc::file_exists) +          break; +      } +      if (Attempt == MaxAttempts) +        return createStringError( +            EC, "rename " + PathBuf + +                    " failed: too many CAS directories awaiting pruning"); +      if (EC) +        return createStringError(EC, "rename " + PathBuf + " to " + GCPath + +                                         " failed: " + EC.message()); +    } +  } + +  if (ValidationBootTime != BootTime) { +    // Fix filename in case we have error to report. +    sys::path::remove_filename(PathBuf); +    sys::path::append(PathBuf, ValidationFilename); +    if (std::error_code EC = sys::fs::resize_file(FD, 0)) +      return createFileError(PathBuf, EC); +    raw_fd_ostream OS(FD, /*shouldClose=*/false); +    OS.seek(0); // resize does not reset position +    OS << BootTime << '\n'; +    if (OS.has_error()) +      return createFileError(PathBuf, OS.error()); +  } + +  return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid; +} + +Expected<std::unique_ptr<UnifiedOnDiskCache>> +UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit, +                         StringRef HashName, unsigned HashByteSize, +                         OnDiskGraphDB::FaultInPolicy FaultInPolicy) { +  if (std::error_code EC = sys::fs::create_directories(RootPath)) +    return createFileError(RootPath, EC); + +  SmallString<256> PathBuf(RootPath); +  sys::path::append(PathBuf, "lock"); +  int LockFD = -1; +  if (std::error_code EC = sys::fs::openFileForReadWrite( +          PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) +    return createFileError(PathBuf, EC); +  assert(LockFD != -1); +  // Locking the directory using shared lock, which will prevent other processes +  // from creating a new chain (essentially while a \p UnifiedOnDiskCache +  // instance holds a shared lock the storage for the primary directory will +  // grow unrestricted). +  if (std::error_code EC = +          lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared)) +    return createFileError(PathBuf, EC); + +  auto DBDirs = getAllDBDirs(RootPath); +  if (!DBDirs) +    return DBDirs.takeError(); +  if (DBDirs->empty()) +    DBDirs->push_back((Twine(DBDirPrefix) + "1").str()); + +  assert(!DBDirs->empty()); + +  /// If there is only one directory open databases on it. If there are 2 or +  /// more directories, get the most recent directories and chain them, with the +  /// most recent being the primary one. The remaining directories are unused +  /// data than can be garbage-collected. +  auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache()); +  std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; +  std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; +  if (DBDirs->size() > 1) { +    StringRef UpstreamDir = *(DBDirs->end() - 2); +    PathBuf = RootPath; +    sys::path::append(PathBuf, UpstreamDir); +    if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, +                                      /*UpstreamDB=*/nullptr, FaultInPolicy) +                      .moveInto(UpstreamGraphDB)) +      return std::move(E); +    if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, +                                         /*ValueName=*/"objectid", +                                         /*ValueSize=*/sizeof(uint64_t)) +                      .moveInto(UpstreamKVDB)) +      return std::move(E); +  } + +  StringRef PrimaryDir = *(DBDirs->end() - 1); +  PathBuf = RootPath; +  sys::path::append(PathBuf, PrimaryDir); +  std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; +  if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, +                                    UpstreamGraphDB.get(), FaultInPolicy) +                    .moveInto(PrimaryGraphDB)) +    return std::move(E); +  std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; +  // \p UnifiedOnDiskCache does manual chaining for key-value requests, +  // including an extra translation step of the value during fault-in. +  if (Error E = +          OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, +                                 /*ValueName=*/"objectid", +                                 /*ValueSize=*/sizeof(uint64_t), UniDB.get()) +              .moveInto(PrimaryKVDB)) +    return std::move(E); + +  UniDB->RootPath = RootPath; +  UniDB->SizeLimit = SizeLimit.value_or(0); +  UniDB->LockFD = LockFD; +  UniDB->NeedsGarbageCollection = DBDirs->size() > 2; +  UniDB->PrimaryDBDir = PrimaryDir; +  UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB); +  UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB); +  UniDB->UpstreamKVDB = std::move(UpstreamKVDB); +  UniDB->PrimaryKVDB = std::move(PrimaryKVDB); + +  return std::move(UniDB); +} + +void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) { +  this->SizeLimit = SizeLimit.value_or(0); +} + +uint64_t UnifiedOnDiskCache::getStorageSize() const { +  uint64_t TotalSize = getPrimaryStorageSize(); +  if (UpstreamGraphDB) +    TotalSize += UpstreamGraphDB->getStorageSize(); +  if (UpstreamKVDB) +    TotalSize += UpstreamKVDB->getStorageSize(); +  return TotalSize; +} + +uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const { +  return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize(); +} + +bool UnifiedOnDiskCache::hasExceededSizeLimit() const { +  uint64_t CurSizeLimit = SizeLimit; +  if (!CurSizeLimit) +    return false; + +  // If the hard limit is beyond 85%, declare above limit and request clean up. +  unsigned CurrentPercent = +      std::max(PrimaryGraphDB->getHardStorageLimitUtilization(), +               PrimaryKVDB->getHardStorageLimitUtilization()); +  if (CurrentPercent > 85) +    return true; + +  // We allow each of the directories in the chain to reach up to half the +  // intended size limit. Check whether the primary directory has exceeded half +  // the limit or not, in order to decide whether we need to start a new chain. +  // +  // We could check the size limit against the sum of sizes of both the primary +  // and upstream directories but then if the upstream is significantly larger +  // than the intended limit, it would trigger a new chain to be created before +  // the primary has reached its own limit. Essentially in such situation we +  // prefer reclaiming the storage later in order to have more consistent cache +  // hits behavior. +  return (CurSizeLimit / 2) < getPrimaryStorageSize(); +} + +Error UnifiedOnDiskCache::close(bool CheckSizeLimit) { +  if (LockFD == -1) +    return Error::success(); // already closed. +  auto CloseLock = make_scope_exit([&]() { +    assert(LockFD >= 0); +    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); +    sys::fs::closeFile(LockFile); +    LockFD = -1; +  }); + +  bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false; +  UpstreamKVDB.reset(); +  PrimaryKVDB.reset(); +  UpstreamGraphDB.reset(); +  PrimaryGraphDB.reset(); +  if (std::error_code EC = unlockFileThreadSafe(LockFD)) +    return createFileError(RootPath, EC); + +  if (!ExceededSizeLimit) +    return Error::success(); + +  // The primary directory exceeded its intended size limit. Try to get an +  // exclusive lock in order to create a new primary directory for next time +  // this \p UnifiedOnDiskCache path is opened. + +  if (std::error_code EC = tryLockFileThreadSafe( +          LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) { +    if (EC == errc::no_lock_available) +      return Error::success(); // couldn't get exclusive lock, give up. +    return createFileError(RootPath, EC); +  } +  auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + +  // Managed to get an exclusive lock which means there are no other open +  // \p UnifiedOnDiskCache instances for the same path, so we can safely start a +  // new primary directory. To start a new primary directory we just have to +  // create a new empty directory with the next consecutive index; since this is +  // an atomic operation we will leave the top-level directory in a consistent +  // state even if the process dies during this code-path. + +  SmallString<256> PathBuf(RootPath); +  raw_svector_ostream OS(PathBuf); +  OS << sys::path::get_separator(); +  getNextDBDirName(PrimaryDBDir, OS); +  if (std::error_code EC = sys::fs::create_directory(PathBuf)) +    return createFileError(PathBuf, EC); + +  NeedsGarbageCollection = true; +  return Error::success(); +} + +UnifiedOnDiskCache::UnifiedOnDiskCache() = default; + +UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); } + +Error UnifiedOnDiskCache::collectGarbage(StringRef Path) { +  auto DBDirs = getAllGarbageDirs(Path); +  if (!DBDirs) +    return DBDirs.takeError(); + +  SmallString<256> PathBuf(Path); +  for (StringRef UnusedSubDir : *DBDirs) { +    sys::path::append(PathBuf, UnusedSubDir); +    if (std::error_code EC = sys::fs::remove_directories(PathBuf)) +      return createFileError(PathBuf, EC); +    sys::path::remove_filename(PathBuf); +  } +  return Error::success(); +} + +Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); }  | 
