//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file Implements OnDiskTrieRawHashMap. /// //===----------------------------------------------------------------------===// #include "llvm/CAS/OnDiskTrieRawHashMap.h" #include "DatabaseFile.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TrieHashIndexGenerator.h" #include "llvm/CAS/MappedFileRegionArena.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::cas; using namespace llvm::cas::ondisk; #if LLVM_ENABLE_ONDISK_CAS //===----------------------------------------------------------------------===// // TrieRawHashMap data structures. //===----------------------------------------------------------------------===// namespace { class SubtrieHandle; class TrieRawHashMapHandle; class TrieVisitor; /// A value stored in the slots inside a SubTrie. A stored value can either be a /// subtrie (encoded after negation) which is the file offset to another /// subtrie, or it can be a fileset to a DataRecord. class SubtrieSlotValue { public: explicit operator bool() const { return !isEmpty(); } bool isEmpty() const { return !Offset; } bool isData() const { return Offset > 0; } bool isSubtrie() const { return Offset < 0; } uint64_t asData() const { assert(isData()); return Offset; } uint64_t asSubtrie() const { assert(isSubtrie()); return -Offset; } FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); } FileOffset asDataFileOffset() const { return FileOffset(asData()); } int64_t getRawOffset() const { return Offset; } static SubtrieSlotValue getDataOffset(int64_t Offset) { return SubtrieSlotValue(Offset); } static SubtrieSlotValue getSubtrieOffset(int64_t Offset) { return SubtrieSlotValue(-Offset); } static SubtrieSlotValue getDataOffset(FileOffset Offset) { return getDataOffset(Offset.get()); } static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) { return getDataOffset(Offset.get()); } static SubtrieSlotValue getFromSlot(std::atomic &Slot) { return SubtrieSlotValue(Slot.load()); } SubtrieSlotValue() = default; private: friend class SubtrieHandle; explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {} int64_t Offset = 0; }; /// Subtrie layout: /// - 2-bytes: StartBit /// - 1-bytes: NumBits=lg(num-slots) /// - 5-bytes: 0-pad /// - class SubtrieHandle { public: struct Header { /// The bit this subtrie starts on. uint16_t StartBit; /// The number of bits this subtrie handles. It has 2^NumBits slots. uint8_t NumBits; /// 0-pad to 8B. uint8_t ZeroPad1B; uint32_t ZeroPad4B; }; /// Slot storage: /// - zero: Empty /// - positive: RecordOffset /// - negative: SubtrieOffset using SlotT = std::atomic; static int64_t getSlotsSize(uint32_t NumBits) { return sizeof(int64_t) * (1u << NumBits); } static int64_t getSize(uint32_t NumBits) { return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits); } int64_t getSize() const { return getSize(H->NumBits); } size_t getNumSlots() const { return Slots.size(); } SubtrieSlotValue load(size_t I) const { return SubtrieSlotValue(Slots[I].load()); } void store(size_t I, SubtrieSlotValue V) { return Slots[I].store(V.getRawOffset()); } void printHash(raw_ostream &OS, ArrayRef Bytes) const; /// Return None on success, or the existing offset on failure. bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected, SubtrieSlotValue New) { return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset); } /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with /// \p NumSubtrieBits. /// /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If a /// new subtrie is created that isn't used because of a lost race, then it If /// it's already valid, it should be used instead of allocating a new one. /// should be returned as an out parameter to be passed back in the future. /// If it's already valid, it should be used instead of allocating a new one. /// /// Returns the subtrie that now lives at \p I. Expected sink(size_t I, SubtrieSlotValue V, MappedFileRegionArena &Alloc, size_t NumSubtrieBits, SubtrieHandle &UnusedSubtrie, size_t NewI); /// Only safe if the subtrie is empty. void reinitialize(uint32_t StartBit, uint32_t NumBits); SubtrieSlotValue getOffset() const { return SubtrieSlotValue::getSubtrieOffset( reinterpret_cast(H) - Region->data()); } FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); } explicit operator bool() const { return H; } Header &getHeader() const { return *H; } uint32_t getStartBit() const { return H->StartBit; } uint32_t getNumBits() const { return H->NumBits; } static Expected create(MappedFileRegionArena &Alloc, uint32_t StartBit, uint32_t NumBits); static SubtrieHandle getFromFileOffset(MappedFileRegion &Region, FileOffset Offset) { return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset)); } SubtrieHandle() = default; SubtrieHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H), Slots(getSlots(H)) {} SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset) : SubtrieHandle(Region, *reinterpret_cast
( Region.data() + Offset.asSubtrie())) {} private: MappedFileRegion *Region = nullptr; Header *H = nullptr; MutableArrayRef Slots; static MutableArrayRef getSlots(Header &H) { return MutableArrayRef(reinterpret_cast(&H + 1), 1u << H.NumBits); } }; /// Handle for a TrieRawHashMap table. /// /// TrieRawHashMap table layout: /// - [8-bytes: Generic table header] /// - 1-byte: NumSubtrieBits /// - 1-byte: Flags (not used yet) /// - 2-bytes: NumHashBits /// - 4-bytes: RecordDataSize (in bytes) /// - 8-bytes: RootTrieOffset /// - 8-bytes: AllocatorOffset (reserved for implementing free lists) /// - '\0' /// /// Record layout: /// - /// - class TrieRawHashMapHandle { public: static constexpr TableHandle::TableKind Kind = TableHandle::TableKind::TrieRawHashMap; struct Header { TableHandle::Header GenericHeader; uint8_t NumSubtrieBits; uint8_t Flags; ///< None used yet. uint16_t NumHashBits; uint32_t RecordDataSize; std::atomic RootTrieOffset; std::atomic AllocatorOffset; }; operator TableHandle() const { if (!H) return TableHandle(); return TableHandle(*Region, H->GenericHeader); } struct RecordData { OnDiskTrieRawHashMap::ValueProxy Proxy; SubtrieSlotValue Offset; FileOffset getFileOffset() const { return Offset.asDataFileOffset(); } }; enum Limits : size_t { /// Seems like 65528 hash bits ought to be enough. MaxNumHashBytes = UINT16_MAX >> 3, MaxNumHashBits = MaxNumHashBytes << 3, /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit /// index. This many slots is suspicously large anyway. MaxNumRootBits = 16, /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously /// large for subtries. MaxNumSubtrieBits = 10, }; static constexpr size_t getNumHashBytes(size_t NumHashBits) { assert(NumHashBits % 8 == 0); return NumHashBits / 8; } static constexpr size_t getRecordSize(size_t RecordDataSize, size_t NumHashBits) { return RecordDataSize + getNumHashBytes(NumHashBits); } RecordData getRecord(SubtrieSlotValue Offset); Expected createRecord(MappedFileRegionArena &Alloc, ArrayRef Hash); explicit operator bool() const { return H; } const Header &getHeader() const { return *H; } SubtrieHandle getRoot() const; Expected getOrCreateRoot(MappedFileRegionArena &Alloc); MappedFileRegion &getRegion() const { return *Region; } size_t getFlags() const { return H->Flags; } size_t getNumSubtrieBits() const { return H->NumSubtrieBits; } size_t getNumHashBits() const { return H->NumHashBits; } size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); } size_t getRecordDataSize() const { return H->RecordDataSize; } size_t getRecordSize() const { return getRecordSize(H->RecordDataSize, H->NumHashBits); } TrieHashIndexGenerator getIndexGen(SubtrieHandle Root, ArrayRef Hash) { assert(Root.getStartBit() == 0); assert(getNumHashBytes() == Hash.size()); assert(getNumHashBits() == Hash.size() * 8); return TrieHashIndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash}; } static Expected create(MappedFileRegionArena &Alloc, StringRef Name, std::optional NumRootBits, uint64_t NumSubtrieBits, uint64_t NumHashBits, uint64_t RecordDataSize); void print(raw_ostream &OS, function_ref)> PrintRecordData = nullptr) const; Error validate( function_ref RecordVerifier) const; TrieRawHashMapHandle() = default; TrieRawHashMapHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {} TrieRawHashMapHandle(MappedFileRegion &Region, intptr_t HeaderOffset) : TrieRawHashMapHandle( Region, *reinterpret_cast
(Region.data() + HeaderOffset)) { } private: MappedFileRegion *Region = nullptr; Header *H = nullptr; }; } // end anonymous namespace struct OnDiskTrieRawHashMap::ImplType { DatabaseFile File; TrieRawHashMapHandle Trie; }; Expected SubtrieHandle::create(MappedFileRegionArena &Alloc, uint32_t StartBit, uint32_t NumBits) { assert(StartBit <= TrieRawHashMapHandle::MaxNumHashBits); assert(NumBits <= UINT8_MAX); assert(NumBits <= TrieRawHashMapHandle::MaxNumRootBits); auto Mem = Alloc.allocate(getSize(NumBits)); if (LLVM_UNLIKELY(!Mem)) return Mem.takeError(); auto *H = new (*Mem) SubtrieHandle::Header{(uint16_t)StartBit, (uint8_t)NumBits, /*ZeroPad1B=*/0, /*ZeroPad4B=*/0}; SubtrieHandle S(Alloc.getRegion(), *H); for (auto I = S.Slots.begin(), E = S.Slots.end(); I != E; ++I) new (I) SlotT(0); return S; } SubtrieHandle TrieRawHashMapHandle::getRoot() const { if (int64_t Root = H->RootTrieOffset) return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Root)); return SubtrieHandle(); } Expected TrieRawHashMapHandle::getOrCreateRoot(MappedFileRegionArena &Alloc) { assert(&Alloc.getRegion() == &getRegion()); if (SubtrieHandle Root = getRoot()) return Root; int64_t Race = 0; auto LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits); if (LLVM_UNLIKELY(!LazyRoot)) return LazyRoot.takeError(); if (H->RootTrieOffset.compare_exchange_strong( Race, LazyRoot->getOffset().asSubtrie())) return *LazyRoot; // There was a race. Return the other root. // // TODO: Avoid leaking the lazy root by storing it in an allocator. return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race)); } Expected TrieRawHashMapHandle::create(MappedFileRegionArena &Alloc, StringRef Name, std::optional NumRootBits, uint64_t NumSubtrieBits, uint64_t NumHashBits, uint64_t RecordDataSize) { // Allocate. auto Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1); if (LLVM_UNLIKELY(!Offset)) return Offset.takeError(); // Construct the header and the name. assert(Name.size() <= UINT16_MAX && "Expected smaller table name"); assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits"); assert(NumHashBits <= UINT16_MAX && "Expected valid hash size"); assert(RecordDataSize <= UINT32_MAX && "Expected smaller table name"); auto *H = new (Alloc.getRegion().data() + *Offset) Header{{TableHandle::TableKind::TrieRawHashMap, (uint16_t)Name.size(), (uint32_t)sizeof(Header)}, (uint8_t)NumSubtrieBits, /*Flags=*/0, (uint16_t)NumHashBits, (uint32_t)RecordDataSize, /*RootTrieOffset=*/{0}, /*AllocatorOffset=*/{0}}; char *NameStorage = reinterpret_cast(H + 1); llvm::copy(Name, NameStorage); NameStorage[Name.size()] = 0; // Construct a root trie, if requested. TrieRawHashMapHandle Trie(Alloc.getRegion(), *H); auto Sub = SubtrieHandle::create(Alloc, 0, *NumRootBits); if (LLVM_UNLIKELY(!Sub)) return Sub.takeError(); if (NumRootBits) H->RootTrieOffset = Sub->getOffset().asSubtrie(); return Trie; } TrieRawHashMapHandle::RecordData TrieRawHashMapHandle::getRecord(SubtrieSlotValue Offset) { char *Begin = Region->data() + Offset.asData(); OnDiskTrieRawHashMap::ValueProxy Proxy; Proxy.Data = MutableArrayRef(Begin, getRecordDataSize()); Proxy.Hash = ArrayRef(reinterpret_cast(Proxy.Data.end()), getNumHashBytes()); return RecordData{Proxy, Offset}; } Expected TrieRawHashMapHandle::createRecord(MappedFileRegionArena &Alloc, ArrayRef Hash) { assert(&Alloc.getRegion() == Region); assert(Hash.size() == getNumHashBytes()); auto Offset = Alloc.allocateOffset(getRecordSize()); if (LLVM_UNLIKELY(!Offset)) return Offset.takeError(); RecordData Record = getRecord(SubtrieSlotValue::getDataOffset(*Offset)); llvm::copy(Hash, const_cast(Record.Proxy.Hash.begin())); return Record; } Expected OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const { // Check alignment. if (!isAligned(MappedFileRegionArena::getAlign(), Offset.get())) return createStringError(make_error_code(std::errc::protocol_error), "unaligned file offset at 0x" + utohexstr(Offset.get(), /*LowerCase=*/true)); // Check bounds. // // Note: There's no potential overflow when using \c uint64_t because Offset // is in valid offset range and the record size is in \c [0,UINT32_MAX]. if (!validOffset(Offset) || Offset.get() + Impl->Trie.getRecordSize() > Impl->File.getAlloc().size()) return createStringError(make_error_code(std::errc::protocol_error), "file offset too large: 0x" + utohexstr(Offset.get(), /*LowerCase=*/true)); // Looks okay... TrieRawHashMapHandle::RecordData D = Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); return const_pointer(D.Proxy, D.getFileOffset()); } OnDiskTrieRawHashMap::const_pointer OnDiskTrieRawHashMap::find(ArrayRef Hash) const { TrieRawHashMapHandle Trie = Impl->Trie; assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); SubtrieHandle S = Trie.getRoot(); if (!S) return const_pointer(); TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash); size_t Index = IndexGen.next(); for (;;) { // Try to set the content. SubtrieSlotValue V = S.load(Index); if (!V) return const_pointer(); // Check for an exact match. if (V.isData()) { TrieRawHashMapHandle::RecordData D = Trie.getRecord(V); return D.Proxy.Hash == Hash ? const_pointer(D.Proxy, D.getFileOffset()) : const_pointer(); } Index = IndexGen.next(); S = SubtrieHandle(Trie.getRegion(), V); } } /// Only safe if the subtrie is empty. void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) { assert(StartBit > H->StartBit); assert(NumBits <= H->NumBits); // Ideally would also assert that all slots are empty, but that's expensive. H->StartBit = StartBit; H->NumBits = NumBits; } Expected OnDiskTrieRawHashMap::insertLazy(ArrayRef Hash, LazyInsertOnConstructCB OnConstruct, LazyInsertOnLeakCB OnLeak) { TrieRawHashMapHandle Trie = Impl->Trie; assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); MappedFileRegionArena &Alloc = Impl->File.getAlloc(); std::optional S; auto Err = Trie.getOrCreateRoot(Alloc).moveInto(S); if (LLVM_UNLIKELY(Err)) return std::move(Err); TrieHashIndexGenerator IndexGen = Trie.getIndexGen(*S, Hash); size_t Index = IndexGen.next(); // Walk through the hash bytes and insert into correct trie position. std::optional NewRecord; SubtrieHandle UnusedSubtrie; for (;;) { SubtrieSlotValue Existing = S->load(Index); // Try to set it, if it's empty. if (!Existing) { if (!NewRecord) { auto Err = Trie.createRecord(Alloc, Hash).moveInto(NewRecord); if (LLVM_UNLIKELY(Err)) return std::move(Err); if (OnConstruct) OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy); } if (S->compare_exchange_strong(Index, Existing, NewRecord->Offset)) return pointer(NewRecord->Proxy, NewRecord->Offset.asDataFileOffset()); // Race means that Existing is no longer empty; fall through... } if (Existing.isSubtrie()) { S = SubtrieHandle(Trie.getRegion(), Existing); Index = IndexGen.next(); continue; } // Check for an exact match. TrieRawHashMapHandle::RecordData ExistingRecord = Trie.getRecord(Existing); if (ExistingRecord.Proxy.Hash == Hash) { if (NewRecord && OnLeak) OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy, ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy); return pointer(ExistingRecord.Proxy, ExistingRecord.Offset.asDataFileOffset()); } // Sink the existing content as long as the indexes match. for (;;) { size_t NextIndex = IndexGen.next(); size_t NewIndexForExistingContent = IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash); auto Err = S->sink(Index, Existing, Alloc, IndexGen.getNumBits(), UnusedSubtrie, NewIndexForExistingContent) .moveInto(S); if (LLVM_UNLIKELY(Err)) return std::move(Err); Index = NextIndex; // Found the difference. if (NextIndex != NewIndexForExistingContent) break; } } } Expected SubtrieHandle::sink(size_t I, SubtrieSlotValue V, MappedFileRegionArena &Alloc, size_t NumSubtrieBits, SubtrieHandle &UnusedSubtrie, size_t NewI) { std::optional NewS; if (UnusedSubtrie) { // Steal UnusedSubtrie and initialize it. NewS.emplace(); std::swap(*NewS, UnusedSubtrie); NewS->reinitialize(getStartBit() + getNumBits(), NumSubtrieBits); } else { // Allocate a new, empty subtrie. auto Err = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(), NumSubtrieBits) .moveInto(NewS); if (LLVM_UNLIKELY(Err)) return std::move(Err); } NewS->store(NewI, V); if (compare_exchange_strong(I, V, NewS->getOffset())) return *NewS; // Success! // Raced. assert(V.isSubtrie() && "Expected racing sink() to add a subtrie"); // Wipe out the new slot so NewS can be reused and set the out parameter. NewS->store(NewI, SubtrieSlotValue()); UnusedSubtrie = *NewS; // Return the subtrie added by the concurrent sink() call. return SubtrieHandle(Alloc.getRegion(), V); } void OnDiskTrieRawHashMap::print( raw_ostream &OS, function_ref)> PrintRecordData) const { Impl->Trie.print(OS, PrintRecordData); } Error OnDiskTrieRawHashMap::validate( function_ref RecordVerifier) const { return Impl->Trie.validate(RecordVerifier); } // Helper function that prints hexdigit and have a sub-byte starting position. static void printHexDigits(raw_ostream &OS, ArrayRef Bytes, size_t StartBit, size_t NumBits) { assert(StartBit % 4 == 0); assert(NumBits % 4 == 0); for (size_t I = StartBit, E = StartBit + NumBits; I != E; I += 4) { uint8_t HexPair = Bytes[I / 8]; uint8_t HexDigit = I % 8 == 0 ? HexPair >> 4 : HexPair & 0xf; OS << hexdigit(HexDigit, /*LowerCase=*/true); } } static void printBits(raw_ostream &OS, ArrayRef Bytes, size_t StartBit, size_t NumBits) { assert(StartBit + NumBits <= Bytes.size() * 8u); for (size_t I = StartBit, E = StartBit + NumBits; I != E; ++I) { uint8_t Byte = Bytes[I / 8]; size_t ByteOffset = I % 8; if (size_t ByteShift = 8 - ByteOffset - 1) Byte >>= ByteShift; OS << (Byte & 0x1 ? '1' : '0'); } } void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef Bytes) const { // afb[1c:00*01110*0]def size_t EndBit = getStartBit() + getNumBits(); size_t HashEndBit = Bytes.size() * 8u; size_t FirstBinaryBit = getStartBit() & ~0x3u; printHexDigits(OS, Bytes, 0, FirstBinaryBit); size_t LastBinaryBit = (EndBit + 3u) & ~0x3u; OS << "["; printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit); OS << "]"; printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit); } static void appendIndexBits(std::string &Prefix, size_t Index, size_t NumSlots) { std::string Bits; for (size_t NumBits = 1u; NumBits < NumSlots; NumBits <<= 1) { Bits.push_back('0' + (Index & 0x1)); Index >>= 1; } for (char Ch : llvm::reverse(Bits)) Prefix += Ch; } static void printPrefix(raw_ostream &OS, StringRef Prefix) { while (Prefix.size() >= 4) { uint8_t Digit; bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Digit); assert(!ErrorParsingBinary); (void)ErrorParsingBinary; OS << hexdigit(Digit, /*LowerCase=*/true); Prefix = Prefix.drop_front(4); } if (!Prefix.empty()) OS << "[" << Prefix << "]"; } LLVM_DUMP_METHOD void OnDiskTrieRawHashMap::dump() const { print(dbgs()); } static Expected checkParameter(StringRef Label, size_t Max, std::optional Value, std::optional Default, StringRef Path, StringRef TableName) { assert(Value || Default); assert(!Default || *Default <= Max); if (!Value) return *Default; if (*Value <= Max) return *Value; return createTableConfigError( std::errc::argument_out_of_domain, Path, TableName, "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) + ")"); } size_t OnDiskTrieRawHashMap::size() const { return Impl->File.size(); } size_t OnDiskTrieRawHashMap::capacity() const { return Impl->File.getRegion().size(); } Expected OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine, size_t NumHashBits, uint64_t DataSize, uint64_t MaxFileSize, std::optional NewFileInitialSize, std::optional NewTableNumRootBits, std::optional NewTableNumSubtrieBits) { SmallString<128> PathStorage; StringRef Path = PathTwine.toStringRef(PathStorage); SmallString<128> TrieNameStorage; StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage); constexpr size_t DefaultNumRootBits = 10; constexpr size_t DefaultNumSubtrieBits = 6; size_t NumRootBits; if (Error E = checkParameter( "root bits", TrieRawHashMapHandle::MaxNumRootBits, NewTableNumRootBits, DefaultNumRootBits, Path, TrieName) .moveInto(NumRootBits)) return std::move(E); size_t NumSubtrieBits; if (Error E = checkParameter("subtrie bits", TrieRawHashMapHandle::MaxNumSubtrieBits, NewTableNumSubtrieBits, DefaultNumSubtrieBits, Path, TrieName) .moveInto(NumSubtrieBits)) return std::move(E); size_t NumHashBytes = NumHashBits >> 3; if (Error E = checkParameter("hash size", TrieRawHashMapHandle::MaxNumHashBits, NumHashBits, std::nullopt, Path, TrieName) .takeError()) return std::move(E); assert(NumHashBits == NumHashBytes << 3 && "Expected hash size to be byte-aligned"); if (NumHashBits != NumHashBytes << 3) return createTableConfigError( std::errc::argument_out_of_domain, Path, TrieName, "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)"); // Constructor for if the file doesn't exist. auto NewDBConstructor = [&](DatabaseFile &DB) -> Error { auto Trie = TrieRawHashMapHandle::create(DB.getAlloc(), TrieName, NumRootBits, NumSubtrieBits, NumHashBits, DataSize); if (LLVM_UNLIKELY(!Trie)) return Trie.takeError(); return DB.addTable(*Trie); }; // Get or create the file. Expected File = DatabaseFile::create(Path, MaxFileSize, NewDBConstructor); if (!File) return File.takeError(); // Find the trie and validate it. std::optional Table = File->findTable(TrieName); if (!Table) return createTableConfigError(std::errc::argument_out_of_domain, Path, TrieName, "table not found"); if (Error E = checkTable("table kind", (size_t)TrieRawHashMapHandle::Kind, (size_t)Table->getHeader().Kind, Path, TrieName)) return std::move(E); auto Trie = Table->cast(); assert(Trie && "Already checked the kind"); // Check the hash and data size. if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(), Path, TrieName)) return std::move(E); if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(), Path, TrieName)) return std::move(E); // No flags supported right now. Either corrupt, or coming from a future // writer. if (size_t Flags = Trie.getFlags()) return createTableConfigError(std::errc::invalid_argument, Path, TrieName, "unsupported flags: " + Twine(Flags)); // Success. OnDiskTrieRawHashMap::ImplType Impl{DatabaseFile(std::move(*File)), Trie}; return OnDiskTrieRawHashMap(std::make_unique(std::move(Impl))); } static Error createInvalidTrieError(uint64_t Offset, const Twine &Msg) { return createStringError(make_error_code(std::errc::protocol_error), "invalid trie at 0x" + utohexstr(Offset, /*LowerCase=*/true) + ": " + Msg); } //===----------------------------------------------------------------------===// // TrieVisitor data structures. //===----------------------------------------------------------------------===// namespace { /// A multi-threaded vistior to traverse the Trie. /// /// TODO: add more sanity checks that isn't just plain data corruption. For /// example, some ill-formed data can be constructed to form a cycle using /// Sub-Tries and it can lead to inifinite loop when visiting (or inserting /// data). class TrieVisitor { public: TrieVisitor(TrieRawHashMapHandle Trie, unsigned ThreadCount = 0, unsigned ErrorLimit = 50) : Trie(Trie), ErrorLimit(ErrorLimit), Threads(hardware_concurrency(ThreadCount)) {} virtual ~TrieVisitor() = default; Error visit(); private: // Virtual method to implement the action when visiting a sub-trie. virtual Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) { return Error::success(); } // Virtual method to implement the action when visiting a slot in a trie node. virtual Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, SubtrieSlotValue Slot) { return Error::success(); } protected: TrieRawHashMapHandle Trie; private: Error traverseTrieNode(SubtrieHandle Node, StringRef Prefix); Error validateSubTrie(SubtrieHandle Node, bool IsRoot); // Helper function to capture errors when visiting the trie nodes. void addError(Error NewError) { assert(NewError && "not an error"); std::lock_guard ErrorLock(Lock); if (NumError >= ErrorLimit) { // Too many errors. consumeError(std::move(NewError)); return; } if (Err) Err = joinErrors(std::move(*Err), std::move(NewError)); else Err = std::move(NewError); NumError++; } bool tooManyErrors() { std::lock_guard ErrorLock(Lock); return (bool)Err && NumError >= ErrorLimit; } const unsigned ErrorLimit; std::optional Err; unsigned NumError = 0; std::mutex Lock; DefaultThreadPool Threads; }; /// A visitor that traverse and print the Trie. class TriePrinter : public TrieVisitor { public: TriePrinter(TrieRawHashMapHandle Trie, raw_ostream &OS, function_ref)> PrintRecordData) : TrieVisitor(Trie, /*ThreadCount=*/1), OS(OS), PrintRecordData(PrintRecordData) {} Error printRecords() { if (Records.empty()) return Error::success(); OS << "records\n"; llvm::sort(Records); for (int64_t Offset : Records) { TrieRawHashMapHandle::RecordData Record = Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); if (auto Err = printRecord(Record)) return Err; } return Error::success(); } Error printRecord(TrieRawHashMapHandle::RecordData &Record) { OS << "- addr=" << (void *)Record.getFileOffset().get() << " "; if (PrintRecordData) { PrintRecordData(Record.Proxy.Data); } else { OS << "bytes="; ArrayRef Data( reinterpret_cast(Record.Proxy.Data.data()), Record.Proxy.Data.size()); printHexDigits(OS, Data, 0, Data.size() * 8); } OS << "\n"; return Error::success(); } Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) override { if (Prefix.empty()) { OS << "root"; } else { OS << "subtrie="; printPrefix(OS, Prefix); } OS << " addr=" << (void *)(reinterpret_cast(&SubTrie.getHeader()) - Trie.getRegion().data()); OS << " num-slots=" << SubTrie.getNumSlots() << "\n"; return Error::success(); } Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, SubtrieSlotValue Slot) override { OS << "- index="; for (size_t Pad : {10, 100, 1000}) if (I < Pad && Subtrie.getNumSlots() >= Pad) OS << "0"; OS << I << " "; if (Slot.isSubtrie()) { OS << "addr=" << (void *)Slot.asSubtrie(); OS << " subtrie="; printPrefix(OS, Prefix); OS << "\n"; return Error::success(); } TrieRawHashMapHandle::RecordData Record = Trie.getRecord(Slot); OS << "addr=" << (void *)Record.getFileOffset().get(); OS << " content="; Subtrie.printHash(OS, Record.Proxy.Hash); OS << "\n"; Records.push_back(Slot.asData()); return Error::success(); } private: raw_ostream &OS; function_ref)> PrintRecordData; SmallVector Records; }; /// TrieVerifier that adds additional verification on top of the basic visitor. class TrieVerifier : public TrieVisitor { public: TrieVerifier( TrieRawHashMapHandle Trie, function_ref RecordVerifier) : TrieVisitor(Trie), RecordVerifier(RecordVerifier) {} private: Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) final { return Error::success(); } Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, SubtrieSlotValue Slot) final { if (RecordVerifier && Slot.isData()) { if (!isAligned(MappedFileRegionArena::getAlign(), Slot.asData())) return createInvalidTrieError(Slot.asData(), "mis-aligned data entry"); TrieRawHashMapHandle::RecordData Record = Trie.getRecord(SubtrieSlotValue::getDataOffset(Slot.asData())); return RecordVerifier(Slot.asDataFileOffset(), OnDiskTrieRawHashMap::ConstValueProxy{ Record.Proxy.Hash, Record.Proxy.Data}); } return Error::success(); } function_ref RecordVerifier; }; } // namespace Error TrieVisitor::visit() { auto Root = Trie.getRoot(); if (!Root) return Error::success(); if (auto Err = validateSubTrie(Root, /*IsRoot=*/true)) return Err; if (auto Err = visitSubTrie("", Root)) return Err; SmallVector Subs; SmallVector Prefixes; const size_t NumSlots = Root.getNumSlots(); for (size_t I = 0, E = NumSlots; I != E; ++I) { SubtrieSlotValue Slot = Root.load(I); if (!Slot) continue; uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData(); if (Offset >= (uint64_t)Trie.getRegion().size()) return createInvalidTrieError(Offset, "slot points out of bound"); std::string SubtriePrefix; appendIndexBits(SubtriePrefix, I, NumSlots); if (Slot.isSubtrie()) { SubtrieHandle S(Trie.getRegion(), Slot); Subs.push_back(S); Prefixes.push_back(SubtriePrefix); } if (auto Err = visitSlot(I, Root, SubtriePrefix, Slot)) return Err; } for (size_t I = 0, E = Subs.size(); I != E; ++I) { Threads.async( [&](unsigned Idx) { // Don't run if there is an error already. if (tooManyErrors()) return; if (auto Err = traverseTrieNode(Subs[Idx], Prefixes[Idx])) addError(std::move(Err)); }, I); } Threads.wait(); if (Err) return std::move(*Err); return Error::success(); } Error TrieVisitor::validateSubTrie(SubtrieHandle Node, bool IsRoot) { char *Addr = reinterpret_cast(&Node.getHeader()); const int64_t Offset = Node.getFileOffset().get(); if (Addr + Node.getSize() >= Trie.getRegion().data() + Trie.getRegion().size()) return createInvalidTrieError(Offset, "subtrie node spans out of bound"); if (!IsRoot && Node.getStartBit() + Node.getNumBits() > Trie.getNumHashBits()) { return createInvalidTrieError(Offset, "subtrie represents too many hash bits"); } if (IsRoot) { if (Node.getStartBit() != 0) return createInvalidTrieError(Offset, "root node doesn't start at 0 index"); return Error::success(); } if (Node.getNumBits() > Trie.getNumSubtrieBits()) return createInvalidTrieError(Offset, "subtrie has wrong number of slots"); return Error::success(); } Error TrieVisitor::traverseTrieNode(SubtrieHandle Node, StringRef Prefix) { if (auto Err = validateSubTrie(Node, /*IsRoot=*/false)) return Err; if (auto Err = visitSubTrie(Prefix, Node)) return Err; SmallVector Subs; SmallVector Prefixes; const size_t NumSlots = Node.getNumSlots(); for (size_t I = 0, E = NumSlots; I != E; ++I) { SubtrieSlotValue Slot = Node.load(I); if (!Slot) continue; uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData(); if (Offset >= (uint64_t)Trie.getRegion().size()) return createInvalidTrieError(Offset, "slot points out of bound"); std::string SubtriePrefix = Prefix.str(); appendIndexBits(SubtriePrefix, I, NumSlots); if (Slot.isSubtrie()) { SubtrieHandle S(Trie.getRegion(), Slot); Subs.push_back(S); Prefixes.push_back(SubtriePrefix); } if (auto Err = visitSlot(I, Node, SubtriePrefix, Slot)) return Err; } for (size_t I = 0, E = Subs.size(); I != E; ++I) if (auto Err = traverseTrieNode(Subs[I], Prefixes[I])) return Err; return Error::success(); } void TrieRawHashMapHandle::print( raw_ostream &OS, function_ref)> PrintRecordData) const { OS << "hash-num-bits=" << getNumHashBits() << " hash-size=" << getNumHashBytes() << " record-data-size=" << getRecordDataSize() << "\n"; TriePrinter Printer(*this, OS, PrintRecordData); if (auto Err = Printer.visit()) OS << "error: " << toString(std::move(Err)) << "\n"; if (auto Err = Printer.printRecords()) OS << "error: " << toString(std::move(Err)) << "\n"; return; } Error TrieRawHashMapHandle::validate( function_ref RecordVerifier) const { // Use the base TrieVisitor to identify the errors inside trie first. TrieVisitor BasicVerifier(*this); if (auto Err = BasicVerifier.visit()) return Err; // If the trie data structure is sound, do a second pass to verify data and // verifier function can assume the index is correct. However, there can be // newly added bad entries that can still produce error. TrieVerifier Verifier(*this, RecordVerifier); return Verifier.visit(); } #else // !LLVM_ENABLE_ONDISK_CAS struct OnDiskTrieRawHashMap::ImplType {}; Expected OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine, size_t NumHashBits, uint64_t DataSize, uint64_t MaxFileSize, std::optional NewFileInitialSize, std::optional NewTableNumRootBits, std::optional NewTableNumSubtrieBits) { return createStringError(make_error_code(std::errc::not_supported), "OnDiskTrieRawHashMap is not supported"); } Expected OnDiskTrieRawHashMap::insertLazy(ArrayRef Hash, LazyInsertOnConstructCB OnConstruct, LazyInsertOnLeakCB OnLeak) { return createStringError(make_error_code(std::errc::not_supported), "OnDiskTrieRawHashMap is not supported"); } Expected OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const { return createStringError(make_error_code(std::errc::not_supported), "OnDiskTrieRawHashMap is not supported"); } OnDiskTrieRawHashMap::const_pointer OnDiskTrieRawHashMap::find(ArrayRef Hash) const { return const_pointer(); } void OnDiskTrieRawHashMap::print( raw_ostream &OS, function_ref)> PrintRecordData) const { } Error OnDiskTrieRawHashMap::validate( function_ref RecordVerifier) const { return createStringError(make_error_code(std::errc::not_supported), "OnDiskTrieRawHashMap is not supported"); } size_t OnDiskTrieRawHashMap::size() const { return 0; } size_t OnDiskTrieRawHashMap::capacity() const { return 0; } #endif // LLVM_ENABLE_ONDISK_CAS OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(std::unique_ptr Impl) : Impl(std::move(Impl)) {} OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS) = default; OnDiskTrieRawHashMap & OnDiskTrieRawHashMap::operator=(OnDiskTrieRawHashMap &&RHS) = default; OnDiskTrieRawHashMap::~OnDiskTrieRawHashMap() = default;