diff options
Diffstat (limited to 'llvm/lib')
168 files changed, 3485 insertions, 1197 deletions
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e27a9b1..5d88e5f 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -806,11 +806,11 @@ public: typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA, + DominatorTree &DT, MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE, SmallPtrSetImpl<MDNode *> &LoopAliasScopes) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), - LoopAliasScopes(LoopAliasScopes) { + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA), + PSE(PSE), LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } @@ -934,6 +934,9 @@ private: /// The LoopInfo of the loop being checked. const LoopInfo *LI; + /// The dominator tree of the function. + DominatorTree &DT; + /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identfies which sets really need a /// dependence check. @@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// informating from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, + const DominatorTree &DT, std::optional<int64_t> Stride = std::nullopt) { // FIXME: This should probably only return true for NUW. if (AR->getNoWrapFlags(SCEV::NoWrapMask)) @@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. 
if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, &DT, GEP](User *U) { + if (getLoadStorePointerOperand(U) != GEP) + return false; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, DT)) return false; } @@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() { /// Check whether the access through \p Ptr has a constant stride. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); @@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride)) return Stride; LLVM_DEBUG( @@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional<int64_t> StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional<int64_t> StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + 
std::optional<int64_t> StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true); + std::optional<int64_t> StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, + true)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076..789a983 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7419,84 +7419,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: 
- case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // If the value doesn't fit an unspecified value is returned (but this - // is not poison). 
- return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { @@ -10405,3 +10341,55 @@ const Value *llvm::stripNullTest(const Value *V) { Value *llvm::stripNullTest(Value *V) { return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V))); } + +bool llvm::collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> Worklist; + auto Push = [&](const Value *V) -> bool { + if (auto *C = dyn_cast<Constant>(V)) { + if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C)) + return false; + // Check existence first to avoid unnecessary allocations. + if (Constants.contains(C)) + return true; + if (Constants.size() == MaxCount) + return false; + Constants.insert(C); + return true; + } + + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (Visited.insert(Inst).second) + Worklist.push_back(Inst); + return true; + } + return false; + }; + if (!Push(V)) + return false; + while (!Worklist.empty()) { + const Instruction *CurInst = Worklist.pop_back_val(); + switch (CurInst->getOpcode()) { + case Instruction::Select: + if (!Push(CurInst->getOperand(1))) + return false; + if (!Push(CurInst->getOperand(2))) + return false; + break; + case Instruction::PHI: + for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) { + // Fast path for recurrence PHI. 
+ if (IncomingValue == CurInst) + continue; + if (!Push(IncomingValue)) + return false; + } + break; + default: + return false; + } + } + return true; +} diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d948..977ed59 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. - int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving( assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); - if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides, + /*Assume=*/false, /*ShouldCheckWrap=*/true) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt index 4b2debb..0c8af1e 100644 --- a/llvm/lib/BinaryFormat/CMakeLists.txt +++ b/llvm/lib/BinaryFormat/CMakeLists.txt @@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat ELF.cpp MachO.cpp Magic.cpp - Minidump.cpp MsgPackDocument.cpp MsgPackDocumentYAML.cpp MsgPackReader.cpp diff --git 
a/llvm/lib/BinaryFormat/Minidump.cpp b/llvm/lib/BinaryFormat/Minidump.cpp deleted file mode 100644 index b618fb1..0000000 --- a/llvm/lib/BinaryFormat/Minidump.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//===-- Minidump.cpp - Minidump constants and structures ---------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/BinaryFormat/Minidump.h" - -using namespace llvm::minidump; - -constexpr uint32_t Header::MagicSignature; -constexpr uint16_t Header::MagicVersion; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb0..8930d64 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 +2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } @@ -8566,16 +8568,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo &LTOInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8583,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return
error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. + return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. @@ -8607,9 +8606,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8635,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c..76494c7 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t 
getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp index 571c5b3..003c850 100644 --- a/llvm/lib/CAS/ActionCaches.cpp +++ b/llvm/lib/CAS/ActionCaches.cpp @@ -13,7 +13,11 @@ #include "BuiltinCAS.h" #include "llvm/ADT/TrieRawHashMap.h" #include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Errc.h" #define DEBUG_TYPE "cas-action-caches" @@ -47,12 +51,54 @@ public: Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, bool CanBeDistributed) const final; + Error validate() const final { + return createStringError("InMemoryActionCache doesn't support validate()"); + } + private: using DataT = CacheEntry<sizeof(HashType)>; using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>; InMemoryCacheT Cache; }; + +/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB. 
+class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + + Error validate() const final; + +private: + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() { } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr<OnDiskActionCache>( + new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> 
+OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef<char> Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. 
+ return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + + auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); + std::optional<ArrayRef<char>> Observed; + if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) + return E; + + auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); + if (*Expected == ObservedID) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { + auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); + auto formatError = [&](Twine Msg) { + return createStringError( + llvm::errc::illegal_byte_sequence, + "bad record at 0x" + + utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + + Msg.str()); + }; + if (ID.getOpaqueData() == 0) + return formatError("zero is not a valid ref"); + return Error::success(); + }; + return UniDB->getKeyValueDB().validate(ValidateRef); +} + 
+Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad..e9bc6d8 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { return Error::success(); } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d..4d2de66 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ public: "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000..f3f6fa0 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49..aad77dc 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git 
a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70d..2d4eedd 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ public: return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { return cast<InMemoryInlineObject>(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected<ObjectRef> InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50b..3110577 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include <optional> +#include <deque> using namespace llvm; using namespace llvm::cas; @@ -21,6 +21,7 @@ void CASContext::anchor() {} void ObjectStore::anchor() {} LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } @@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) { auto [I, Inserted] = ValidatedRefs.insert(Ref); if (!Inserted) continue; // already validated. - if (Error E = validate(getID(Ref))) + if (Error E = validateObject(getID(Ref))) return E; Expected<ObjectHandle> Obj = load(Ref); if (!Obj) @@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) { return Error::success(); } +Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream, + ObjectRef Other) { + // Copy the full CAS tree from upstream with depth-first ordering to ensure + // all the child nodes are available in downstream CAS before inserting + // current object. This uses a similar algorithm as + // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema + // so it can be used to import from any other ObjectStore reguardless of the + // CAS schema. + + // There is no work to do if importing from self. + if (this == &Upstream) + return Other; + + /// Keeps track of the state of visitation for current node and all of its + /// parents. Upstream Cursor holds information only from upstream CAS. + struct UpstreamCursor { + ObjectRef Ref; + ObjectHandle Node; + size_t RefsCount; + std::deque<ObjectRef> Refs; + }; + SmallVector<UpstreamCursor, 16> CursorStack; + /// PrimaryNodeStack holds the ObjectRef of the current CAS, with nodes either + /// just stored in the CAS or nodes already exists in the current CAS. 
+ SmallVector<ObjectRef, 128> PrimaryRefStack; + /// A map from upstream ObjectRef to current ObjectRef. + llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects; + + auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) { + unsigned NumRefs = Upstream.getNumRefs(Node); + std::deque<ObjectRef> Refs; + for (unsigned I = 0; I < NumRefs; ++I) + Refs.push_back(Upstream.readRef(Node, I)); + + CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)}); + }; + + auto UpstreamHandle = Upstream.load(Other); + if (!UpstreamHandle) + return UpstreamHandle.takeError(); + enqueueNode(Other, *UpstreamHandle); + + while (!CursorStack.empty()) { + UpstreamCursor &Cur = CursorStack.back(); + if (Cur.Refs.empty()) { + // Copy the node data into the primary store. + // The bottom of \p PrimaryRefStack contains the ObjectRef for the + // current node. + assert(PrimaryRefStack.size() >= Cur.RefsCount); + auto Refs = ArrayRef(PrimaryRefStack) + .slice(PrimaryRefStack.size() - Cur.RefsCount); + auto NewNode = store(Refs, Upstream.getData(Cur.Node)); + if (!NewNode) + return NewNode.takeError(); + + // Remove the current node and its IDs from the stack. + PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); + CursorStack.pop_back(); + + PrimaryRefStack.push_back(*NewNode); + CreatedObjects.try_emplace(Cur.Ref, *NewNode); + continue; + } + + // Check if the node exists already. + auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If exists already, just need to enqueue the primary node. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load child. 
+ auto PrimaryID = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + + enqueueNode(CurrentID, *PrimaryID); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr<MemoryBuffer> ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000..7d29f44 --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional<ObjectRef> getReference(const CASID &ID) const final; + + Expected<bool> isMaterialized(ObjectRef Ref) const final; + + ArrayRef<char> getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + Error validate(bool CheckHash) const final; + + static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + + OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : 
UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; + Expected<std::optional<uint64_t>> getStorageSize() const final; + Error pruneStorageData() final; + + OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) + : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) {} + + std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; + std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { + auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, + SmallVectorImpl<uint8_t> &Result) { + auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( + Refs, Data); + Result.assign(Hash.begin(), Hash.end()); + }; + + if (auto E = DB->validate(CheckHash, Hasher)) + return E; + + return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef<uint8_t> Hash = 
DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { + std::optional<ondisk::ObjectID> ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + SmallVector<ondisk::ObjectID, 64> IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + auto StoredID = DB->getReference(ComputedHash); + if (LLVM_UNLIKELY(!StoredID)) + return StoredID.takeError(); + if (Error E = DB->store(*StoredID, IDs, Data)) + return std::move(E); + return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { + UnifiedDB->setSizeLimit(SizeLimit); + return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { + return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + 
+Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { + Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + return true; +#else + return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. + SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9d..245b6fb 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const { } Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { + if (UpstreamDB) { + if (auto E = UpstreamDB->validate(Deep, Hasher)) + return E; + } return Index.validate([&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) { return I.takeError(); TrieRecord::Data Object = I->Ref.load(); - if (Object.SK == TrieRecord::StorageKind::Unknown) { - if (!UpstreamDB) - return std::nullopt; + if (Object.SK == TrieRecord::StorageKind::Unknown) return faultInFromUpstream(ExternalRef); - } if (Object.SK == 
TrieRecord::StorageKind::DataPool) return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef, TrieRecord::Data Object = I->Ref.load(); if (Object.SK != TrieRecord::StorageKind::Unknown) return ObjectPresence::InPrimaryDB; + if (!CheckUpstream || !UpstreamDB) return ObjectPresence::Missing; + std::optional<ObjectID> UpstreamID = UpstreamDB->getExistingReference(getDigest(*I)); return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const { return std::max(IndexPercent, DataPercent); } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( - StringRef AbsPath, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, + unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, + FaultInPolicy Policy) { if (std::error_code EC = sys::fs::create_directories(AbsPath)) return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( "unexpected user header in '" + DataPoolPath + "'"); - return std::unique_ptr<OnDiskGraphDB>( - new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), - std::move(UpstreamDB), Policy)); + return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( + AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy)); } OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, - FaultInPolicy Policy) + OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy) : Index(std::move(Index)), DataPool(std::move(DataPool)), - RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), - FIPolicy(Policy) { + RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) { /// Lifetime 
for "big" objects not in DataPool. /// /// NOTE: Could use ThreadSafeTrieRawHashMap here. For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, // against the process dying during importing and leaving the database with an // incomplete tree. Note that if the upstream has missing nodes then the tree // will be copied with missing nodes as well, it won't be considered an error. - struct UpstreamCursor { ObjectHandle Node; size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, // Copy the node data into the primary store. // FIXME: Use hard-link or cloning if the file-system supports it and data is // stored into a separate file. - auto Data = UpstreamDB->getObjectData(UpstreamNode); auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, Expected<std::optional<ObjectHandle>> OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { - assert(UpstreamDB); + if (!UpstreamDB) + return std::nullopt; auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 2186071..15656cb 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@ #include "llvm/CAS/OnDiskKeyValueDB.h" #include "OnDiskCommon.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>> OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) { // Check the result cache. 
OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); - if (!ActionP) + if (ActionP) { + assert(isAddrAligned(Align(8), ActionP->Data.data())); + return ActionP->Data; + } + if (!UnifiedCache || !UnifiedCache->UpstreamKVDB) return std::nullopt; - assert(isAddrAligned(Align(8), ActionP->Data.data())); - return ActionP->Data; + + // Try to fault in from upstream. + return UnifiedCache->faultInFromUpstreamKV(Key); } Expected<std::unique_ptr<OnDiskKeyValueDB>> OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize) { + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *Cache) { if (std::error_code EC = sys::fs::create_directories(Path)) return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, return std::move(E); return std::unique_ptr<OnDiskKeyValueDB>( - new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); + new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache)); } Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { + if (UnifiedCache && UnifiedCache->UpstreamKVDB) { + if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) + return E; + } return Cache.validate( [&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000..ae9d818 --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,613 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one
+/// directory while also restricting storage growth with a scheme of chaining
+/// the two most recent directories (primary & upstream), where the primary
+/// "faults-in" data from the upstream one. When the primary (most recent)
+/// directory exceeds its intended limit, a new empty directory becomes the
+/// primary one.
+///
+/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open
+/// receives) there are directories named like this:
+///
+/// 'v<version>.<x>'
+/// 'v<version>.<x+1>'
+/// 'v<version>.<x+2>'
+/// ...
+///
+/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and
+/// the part after the dot is an increasing integer. The primary directory is
+/// the one with the highest integer and the upstream one is the directory
+/// before it. For example, if the sub-directories contained are:
+///
+/// 'v1.5', 'v1.6', 'v1.7', 'v1.8'
+///
+/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are
+/// unused directories that can be safely deleted at any time and by any
+/// process.
+///
+/// Contained within the top-level directory is a file named "lock" which is
+/// used for processes to take shared or exclusive locks for the contents of the
+/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock
+/// for the top-level directory; when it closes, if the primary sub-directory
+/// exceeded its limit, it attempts to get an exclusive lock in order to create
+/// a new empty primary directory; if it can't get the exclusive lock it gives
+/// up and lets the next \p UnifiedOnDiskCache instance that closes attempt
+/// again.
+/// +/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +/// directory, by any process, the storage size in that directory will keep +/// growing unrestricted. But the major benefit is that garbage-collection can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers in the same process or other +/// processes. +/// +/// The \c UnifiedOnDiskCache also provides validation and recovery on top of +/// the underlying on-disk storage. The low-level storage is designed to remain +/// coherent across regular process crashes, but may be invalid after power loss +/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows +/// validating the contents once per boot and can recover by marking invalid +/// data for garbage collection. +/// +/// The data recovery described above requires exclusive access to the CAS, and +/// it is an error to attempt recovery if the CAS is open in any process/thread. +/// In order to maximize backwards compatibility with tools that do not perform +/// validation before opening the CAS, we do not attempt to get exclusive access +/// until recovery is actually performed, meaning as long as the data is valid +/// it will not conflict with concurrent use. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "BuiltinCAS.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include <optional> + +#if __has_include(<sys/sysctl.h>) +#include <sys/sysctl.h> +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. +static constexpr StringLiteral DBDirPrefix = "v1."; + +static constexpr StringLiteral ValidationFilename = "v1.validation"; +static constexpr StringLiteral CorruptPrefix = "corrupt."; + +ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) { + // little endian encoded. + assert(Value.size() == sizeof(uint64_t)); + return ObjectID::fromOpaqueData(support::endian::read64le(Value.data())); +} + +UnifiedOnDiskCache::ValueBytes +UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) { + // little endian encoded. 
+ UnifiedOnDiskCache::ValueBytes ValBytes; + static_assert(ValBytes.size() == sizeof(ID.getOpaqueData())); + support::endian::write64le(ValBytes.data(), ID.getOpaqueData()); + return ValBytes; +} + +Expected<std::optional<ArrayRef<char>>> +UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) { + assert(UpstreamGraphDB); + assert(UpstreamKVDB); + + std::optional<ArrayRef<char>> UpstreamValue; + if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue)) + return std::move(E); + if (!UpstreamValue) + return std::nullopt; + + // The value is the \p ObjectID in the context of the upstream + // \p OnDiskGraphDB instance. Translate it to the context of the primary + // \p OnDiskGraphDB instance. + ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue); + auto PrimaryID = + PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID)); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID)); +} + +/// \returns all the 'v<version>.<x>' names of sub-directories, sorted with +/// ascending order of the integer after the dot. Corrupt directories, if +/// included, will come first. 
+static Expected<SmallVector<std::string, 4>>
+getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) {
+  struct DBDir {
+    uint64_t Order;
+    std::string Name;
+  };
+  SmallVector<DBDir> FoundDBDirs;
+
+  std::error_code EC;
+  for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+       DirI.increment(EC)) {
+    if (DirI->type() != sys::fs::file_type::directory_file)
+      continue;
+    StringRef SubDir = sys::path::filename(DirI->path());
+    if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) {
+      // Corrupt directories all get order 0 so they sort to the front.
+      FoundDBDirs.push_back({0, std::string(SubDir)});
+      continue;
+    }
+    if (!SubDir.starts_with(DBDirPrefix))
+      continue;
+    uint64_t Order;
+    if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
+      return createStringError(inconvertibleErrorCode(),
+                               "unexpected directory " + DirI->path());
+    FoundDBDirs.push_back({Order, std::string(SubDir)});
+  }
+  if (EC)
+    return createFileError(Path, EC);
+
+  // The comparator must be a strict weak ordering: it has to return false for
+  // equal keys, which do occur here because every corrupt directory has
+  // order 0. Using '<=' would be undefined behaviour for the sort.
+  llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
+    return LHS.Order < RHS.Order;
+  });
+
+  SmallVector<std::string, 4> DBDirs;
+  DBDirs.reserve(FoundDBDirs.size());
+  for (DBDir &Dir : FoundDBDirs)
+    DBDirs.push_back(std::move(Dir.Name));
+  return DBDirs;
+}
+
+/// \returns the names of the sub-directories of \p Path that are garbage:
+/// every corrupt directory, plus every database directory except the (up to)
+/// two with the highest order, which are the primary and upstream ones.
+static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) {
+  auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true);
+  if (!DBDirs)
+    return DBDirs.takeError();
+
+  // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure
+  // out how to handle the leftover sub-directories of the previous version.
+
+  // Drop up to two entries from the back (the live directories), but never a
+  // corrupt one: those sort first, so reaching one means no live directory
+  // is left to keep.
+  for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) {
+    StringRef Back(DBDirs->back());
+    if (Back.starts_with(CorruptPrefix))
+      break;
+    DBDirs->pop_back();
+  }
+  return std::move(*DBDirs);
+}
+
+/// \returns Given a sub-directory named 'v<version>.<x>', it outputs the
+/// 'v<version>.<x+1>' name.
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) {
+  assert(DBDir.starts_with(DBDirPrefix));
+  uint64_t Count;
+  bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count);
+  assert(!Failed);
+  (void)Failed;
+  OS << DBDirPrefix << Count + 1;
+}
+
+/// Validates the CAS at \p RootPath by running
+/// `<LLVMCasBinary> -cas <RootPath> -validate [-check-hash]` as a child
+/// process, waiting at most 120 seconds.
+///
+/// \returns success when the child exits with 0; otherwise an error carrying
+/// the launcher's message and whatever the child wrote to stderr.
+static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath,
+                                  bool CheckHash) {
+  SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"};
+  if (CheckHash)
+    Args.push_back("-check-hash");
+
+  // Only the path of the temporary file is needed (the child's stderr is
+  // redirected to it and it is read back by name), so use the overload that
+  // does not hand back an open descriptor; the descriptor returned by the
+  // other overload was never closed.
+  llvm::SmallString<128> StdErrPath;
+  if (std::error_code EC = sys::fs::createTemporaryFile(
+          "llvm-cas-validate-stderr", "txt", StdErrPath,
+          llvm::sys::fs::OF_Text))
+    return createStringError(EC, "failed to create temporary file");
+  FileRemover OutputRemover(StdErrPath.c_str());
+
+  std::optional<llvm::StringRef> Redirects[] = {
+      {""}, // stdin = /dev/null
+      {""}, // stdout = /dev/null
+      StdErrPath.str(),
+  };
+
+  std::string ErrMsg;
+  int Result =
+      sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects,
+                          /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg);
+
+  if (Result == -1)
+    return createStringError("failed to exec " + join(Args, " ") + ": " +
+                             ErrMsg);
+  if (Result != 0) {
+    llvm::SmallString<64> Err("cas contents invalid");
+    if (!ErrMsg.empty()) {
+      Err += ": ";
+      Err += ErrMsg;
+    }
+    auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str());
+    if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) {
+      Err += ": ";
+      Err += (*StdErrBuf)->getBuffer();
+    }
+    return createStringError(Err);
+  }
+  return Error::success();
+}
+
+/// Validates the CAS at \p RootPath inside this process: opens the unified
+/// cache, then validates the object store and the action cache built on it.
+static Error validateInProcess(StringRef RootPath, StringRef HashName,
+                               unsigned HashByteSize, bool CheckHash) {
+  std::shared_ptr<UnifiedOnDiskCache> UniDB;
+  if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName,
+                                         HashByteSize)
+                    .moveInto(UniDB))
+    return E;
+  auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB);
+  if (Error E = CAS->validate(CheckHash))
+    return E;
+  auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB);
+  if (Error E = Cache->validate())
+    return E;
+  return Error::success();
+}
+
+/// \returns a value identifying the current boot: kern.boottime seconds where
+/// sysctl provides it, the modification time of /proc on Linux. Aborts with a
+/// fatal error on other platforms.
+static Expected<uint64_t> getBootTime() {
+#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME)
+  struct timeval TV;
+  size_t TVLen = sizeof(TV);
+  int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME};
+  if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0)
+    return createStringError(llvm::errnoAsErrorCode(),
+                             "failed to get boottime");
+  if (TVLen != sizeof(TV))
+    return createStringError("sysctl kern.boottime unexpected format");
+  return TV.tv_sec;
+#elif defined(__linux__)
+  // Use the mtime for /proc, which is recreated during system boot.
+  // We could also read /proc/stat and search for 'btime'.
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status("/proc", Status))
+    return createFileError("/proc", EC);
+  return Status.getLastModificationTime().time_since_epoch().count();
+#else
+  llvm::report_fatal_error("getBootTime unimplemented");
+#endif
+}
+
+/// Validates the CAS at \p RootPath unless it was already validated since the
+/// current boot, as recorded in the validation file under \p RootPath.
+///
+/// \param CheckHash forwarded to the validator.
+/// \param AllowRecovery when validation fails, rename every database
+/// directory to a 'corrupt.' name instead of returning the failure. This
+/// needs an exclusive lock on the "lock" file and fails if it is held.
+/// \param ForceValidation validate even if the recorded boot time matches.
+/// \param LLVMCasBinaryPath when set, validate in a child process running
+/// this binary; otherwise validate in-process.
+/// \returns Skipped, Valid or Recovered, or an error.
+Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded(
+    StringRef RootPath, StringRef HashName, unsigned HashByteSize,
+    bool CheckHash, bool AllowRecovery, bool ForceValidation,
+    std::optional<StringRef> LLVMCasBinaryPath) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, ValidationFilename);
+  int FD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(FD != -1);
+
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); });
+
+  // Hold the validation file exclusively for the rest of the function.
+  if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive))
+    return createFileError(PathBuf, EC);
+  auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); });
+
+  SmallString<8> Bytes;
+  if (Error E = sys::fs::readNativeFileToEOF(File, Bytes))
+    return createFileError(PathBuf, std::move(E));
+
+  // An empty file means no validation has been recorded yet.
+  uint64_t ValidationBootTime = 0;
+  if (!Bytes.empty() &&
+      StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime))
+    return createFileError(PathBuf, errc::illegal_byte_sequence,
+                           "expected integer");
+
+  // Cached across calls; 0 means "not computed yet".
+  // NOTE(review): this static is read and written without synchronization;
+  // confirm callers never run this concurrently on different root paths.
+  static uint64_t BootTime = 0;
+  if (BootTime == 0)
+    if (Error E = getBootTime().moveInto(BootTime))
+      return std::move(E);
+
+  if (ValidationBootTime == BootTime && !ForceValidation)
+    return ValidationResult::Skipped;
+
+  // Validate!
+  bool NeedsRecovery = false;
+  if (Error E =
+          LLVMCasBinaryPath
+              ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash)
+              : validateInProcess(RootPath, HashName, HashByteSize,
+                                  CheckHash)) {
+    if (AllowRecovery) {
+      consumeError(std::move(E));
+      NeedsRecovery = true;
+    } else {
+      return std::move(E);
+    }
+  }
+
+  if (NeedsRecovery) {
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, "lock");
+
+    int LockFD = -1;
+    if (std::error_code EC = sys::fs::openFileForReadWrite(
+            PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+      return createFileError(PathBuf, EC);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); });
+    if (std::error_code EC = tryLockFileThreadSafe(LockFD)) {
+      if (EC == std::errc::no_lock_available)
+        return createFileError(
+            PathBuf, EC,
+            "CAS validation requires exclusive access but CAS was in use");
+      return createFileError(PathBuf, EC);
+    }
+    auto UnlockLockFD =
+        make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+    auto DBDirs = getAllDBDirs(RootPath);
+    if (!DBDirs)
+      return DBDirs.takeError();
+
+    // Move every database directory aside under a 'corrupt.<N>.' name,
+    // trying increasing N until a free name is found.
+    for (StringRef DBDir : *DBDirs) {
+      sys::path::remove_filename(PathBuf);
+      sys::path::append(PathBuf, DBDir);
+      std::error_code EC;
+      int Attempt = 0, MaxAttempts = 100;
+      SmallString<128> GCPath;
+      for (; Attempt < MaxAttempts; ++Attempt) {
+        GCPath.assign(RootPath);
+        sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) +
+                                      "." + DBDir);
+        EC = sys::fs::rename(PathBuf, GCPath);
+        // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST.
+        if (EC != errc::directory_not_empty && EC != errc::file_exists)
+          break;
+      }
+      if (Attempt == MaxAttempts)
+        return createStringError(
+            EC, "rename " + PathBuf +
+                    " failed: too many CAS directories awaiting pruning");
+      if (EC)
+        return createStringError(EC, "rename " + PathBuf + " to " + GCPath +
+                                         " failed: " + EC.message());
+    }
+  }
+
+  if (ValidationBootTime != BootTime) {
+    // Fix filename in case we have error to report.
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, ValidationFilename);
+    if (std::error_code EC = sys::fs::resize_file(FD, 0))
+      return createFileError(PathBuf, EC);
+    raw_fd_ostream OS(FD, /*shouldClose=*/false);
+    OS.seek(0); // resize does not reset position
+    OS << BootTime << '\n';
+    if (OS.has_error())
+      return createFileError(PathBuf, OS.error());
+  }
+
+  return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid;
+}
+
+Expected<std::unique_ptr<UnifiedOnDiskCache>> +UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit, + StringRef HashName, unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, "lock"); + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(LockFD != -1); + // Locking the directory using shared lock, which will prevent other processes + // from creating a new chain (essentially while a \p UnifiedOnDiskCache + // instance holds a shared lock the storage for the primary directory will + // grow unrestricted). + if (std::error_code EC = + lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared)) + return createFileError(PathBuf, EC); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + if (DBDirs->empty()) + DBDirs->push_back((Twine(DBDirPrefix) + "1").str()); + + assert(!DBDirs->empty()); + + /// If there is only one directory open databases on it. If there are 2 or + /// more directories, get the most recent directories and chain them, with the + /// most recent being the primary one. The remaining directories are unused + /// data than can be garbage-collected.
+ auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache()); + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + if (DBDirs->size() > 1) { + StringRef UpstreamDir = *(DBDirs->end() - 2); + PathBuf = RootPath; + sys::path::append(PathBuf, UpstreamDir); + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + /*UpstreamDB=*/nullptr, FaultInPolicy) + .moveInto(UpstreamGraphDB)) + return std::move(E); + if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t)) + .moveInto(UpstreamKVDB)) + return std::move(E); + } + + StringRef PrimaryDir = *(DBDirs->end() - 1); + PathBuf = RootPath; + sys::path::append(PathBuf, PrimaryDir); + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + UpstreamGraphDB.get(), FaultInPolicy) + .moveInto(PrimaryGraphDB)) + return std::move(E); + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; + // \p UnifiedOnDiskCache does manual chaining for key-value requests, + // including an extra translation step of the value during fault-in. 
+ if (Error E = + OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t), UniDB.get()) + .moveInto(PrimaryKVDB)) + return std::move(E); + + UniDB->RootPath = RootPath; + UniDB->SizeLimit = SizeLimit.value_or(0); + UniDB->LockFD = LockFD; + UniDB->NeedsGarbageCollection = DBDirs->size() > 2; + UniDB->PrimaryDBDir = PrimaryDir; + UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB); + UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB); + UniDB->UpstreamKVDB = std::move(UpstreamKVDB); + UniDB->PrimaryKVDB = std::move(PrimaryKVDB); + + return std::move(UniDB); +} + +void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) { + this->SizeLimit = SizeLimit.value_or(0); +} + +uint64_t UnifiedOnDiskCache::getStorageSize() const { + uint64_t TotalSize = getPrimaryStorageSize(); + if (UpstreamGraphDB) + TotalSize += UpstreamGraphDB->getStorageSize(); + if (UpstreamKVDB) + TotalSize += UpstreamKVDB->getStorageSize(); + return TotalSize; +} + +uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const { + return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize(); +} + +bool UnifiedOnDiskCache::hasExceededSizeLimit() const { + uint64_t CurSizeLimit = SizeLimit; + if (!CurSizeLimit) + return false; + + // If the hard limit is beyond 85%, declare above limit and request clean up. + unsigned CurrentPercent = + std::max(PrimaryGraphDB->getHardStorageLimitUtilization(), + PrimaryKVDB->getHardStorageLimitUtilization()); + if (CurrentPercent > 85) + return true; + + // We allow each of the directories in the chain to reach up to half the + // intended size limit. Check whether the primary directory has exceeded half + // the limit or not, in order to decide whether we need to start a new chain. 
+ // + // We could check the size limit against the sum of sizes of both the primary + // and upstream directories but then if the upstream is significantly larger + // than the intended limit, it would trigger a new chain to be created before + // the primary has reached its own limit. Essentially in such situation we + // prefer reclaiming the storage later in order to have more consistent cache + // hits behavior. + return (CurSizeLimit / 2) < getPrimaryStorageSize(); +} + +Error UnifiedOnDiskCache::close(bool CheckSizeLimit) { + if (LockFD == -1) + return Error::success(); // already closed. + auto CloseLock = make_scope_exit([&]() { + assert(LockFD >= 0); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + sys::fs::closeFile(LockFile); + LockFD = -1; + }); + + bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false; + UpstreamKVDB.reset(); + PrimaryKVDB.reset(); + UpstreamGraphDB.reset(); + PrimaryGraphDB.reset(); + if (std::error_code EC = unlockFileThreadSafe(LockFD)) + return createFileError(RootPath, EC); + + if (!ExceededSizeLimit) + return Error::success(); + + // The primary directory exceeded its intended size limit. Try to get an + // exclusive lock in order to create a new primary directory for next time + // this \p UnifiedOnDiskCache path is opened. + + if (std::error_code EC = tryLockFileThreadSafe( + LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) { + if (EC == errc::no_lock_available) + return Error::success(); // couldn't get exclusive lock, give up. + return createFileError(RootPath, EC); + } + auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + // Managed to get an exclusive lock which means there are no other open + // \p UnifiedOnDiskCache instances for the same path, so we can safely start a + // new primary directory. 
To start a new primary directory we just have to + // create a new empty directory with the next consecutive index; since this is + // an atomic operation we will leave the top-level directory in a consistent + // state even if the process dies during this code-path. + + SmallString<256> PathBuf(RootPath); + raw_svector_ostream OS(PathBuf); + OS << sys::path::get_separator(); + getNextDBDirName(PrimaryDBDir, OS); + if (std::error_code EC = sys::fs::create_directory(PathBuf)) + return createFileError(PathBuf, EC); + + NeedsGarbageCollection = true; + return Error::success(); +} + +UnifiedOnDiskCache::UnifiedOnDiskCache() = default; + +UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); } + +Error UnifiedOnDiskCache::collectGarbage(StringRef Path) { + auto DBDirs = getAllGarbageDirs(Path); + if (!DBDirs) + return DBDirs.takeError(); + + SmallString<256> PathBuf(Path); + for (StringRef UnusedSubDir : *DBDirs) { + sys::path::append(PathBuf, UnusedSubDir); + if (std::error_code EC = sys::fs::remove_directories(PathBuf)) + return createFileError(PathBuf, EC); + sys::path::remove_filename(PathBuf); + } + return Error::success(); +} + +Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); } diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index e5c85d5..1ea30d8 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -745,11 +745,6 @@ void AppleAccelTableStaticTypeData::emit(AsmPrinter *Asm) const { Asm->emitInt32(QualifiedNameHash); } -constexpr AppleAccelTableData::Atom AppleAccelTableTypeData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableOffsetData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableStaticOffsetData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableStaticTypeData::Atoms[]; - #ifndef NDEBUG void AppleAccelTableWriter::Header::print(raw_ostream &OS) const { OS << "Magic: " << 
format("0x%x", Magic) << "\n" diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fc90d0..4fd2204 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool targetSupportsBF16Type(const MachineFunction *MF) { + return MF->getTarget().getTargetTriple().isSPIRV(); +} + static bool containsBF16Type(const User &U) { // BF16 cannot currently be represented by LLT, to avoid miscompiles we // prevent any instructions using them. FIXME: This can be removed once LLT @@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) { bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; // Get or create a virtual register for each value. 
@@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; Register Op0 = getOrCreateVReg(*U.getOperand(0)); @@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; auto *CI = cast<CmpInst>(&U); @@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; uint32_t Flags = 0; @@ -2688,7 +2692,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(CB)) + if (containsBF16Type(CB) && !targetSupportsBF16Type(MF)) return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering(); @@ -2779,7 +2783,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { - if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const CallInst &CI = cast<CallInst>(U); diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 4b4df98..637acd6 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -109,8 +109,10 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const 
Constant &C, if (auto *CI = dyn_cast<ConstantInt>(NumericConstant)) { if (CI->getBitWidth() > 64) MIB.addCImm(CI); - else + else if (CI->getBitWidth() == 1) MIB.addImm(CI->getZExtValue()); + else + MIB.addImm(CI->getSExtValue()); } else if (auto *CFP = dyn_cast<ConstantFP>(NumericConstant)) { MIB.addFPImm(CFP); } else if (isa<ConstantPointerNull>(NumericConstant)) { diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index c31454a..b5d3092 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB, } bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + // Do not split functions when -basic-block-sections=all is specified. if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index bb10cf6..d84c3fb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -733,6 +733,8 @@ MachineOperand GetMOForConstDbgOp(const SDDbgOperand &Op) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { if (CI->getBitWidth() > 64) return MachineOperand::CreateCImm(CI); + if (CI->getBitWidth() == 1) + return MachineOperand::CreateImm(CI->getZExtValue()); return MachineOperand::CreateImm(CI->getSExtValue()); } if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a810..316aacd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ private: RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + SDValue 
ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, 
TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. + SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = 
RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a522650..fa0c899 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8958,9 +8958,8 @@ bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { // Avoid emitting tail calls in functions with the disable-tail-calls // attribute. const Function *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && - !isMustTailCall) + if (!isMustTailCall && + Caller->getFnAttribute("disable-tail-calls").getValueAsBool()) return false; // We can't tail call inside a function with a swifterror argument. Lowering diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h index 84757ae..970abdc 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h @@ -28,7 +28,7 @@ using MacroOffset2UnitMapTy = DenseMap<uint64_t, DwarfUnit *>; /// Base class for all Dwarf units(Compile unit/Type table unit). 
class DwarfUnit : public OutputSections { public: - virtual ~DwarfUnit() {} + virtual ~DwarfUnit() = default; DwarfUnit(LinkingGlobalData &GlobalData, unsigned ID, StringRef ClangModuleName) : OutputSections(GlobalData), ID(ID), ClangModuleName(ClangModuleName), diff --git a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h index f67536e..8ccb4a5 100644 --- a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h +++ b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h @@ -22,7 +22,7 @@ class StringEntryToDwarfStringPoolEntryMap { public: StringEntryToDwarfStringPoolEntryMap(LinkingGlobalData &GlobalData) : GlobalData(GlobalData) {} - ~StringEntryToDwarfStringPoolEntryMap() {} + ~StringEntryToDwarfStringPoolEntryMap() = default; /// Create DwarfStringPoolEntry for specified StringEntry if necessary. /// Initialize DwarfStringPoolEntry with initial values. diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 6c78ef0..7496c5a 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -704,7 +704,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index 7e606c6a..4e7db82 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -27,7 +27,7 @@ namespace llvm { namespace orc { -MemoryMapper::~MemoryMapper() {} +MemoryMapper::~MemoryMapper() = default; 
InProcessMemoryMapper::InProcessMemoryMapper(size_t PageSize) : PageSize(PageSize) {} diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0e5926f..fff9a81 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -528,7 +528,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION); Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems); auto Int32Ty = Type::getInt32Ty(Builder.getContext()); - constexpr const size_t MaxDim = 3; + constexpr size_t MaxDim = 3; Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim)); Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8..cd39970 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 77af29b..2ce5719 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringTable.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/xxhash.h" #include 
"llvm/TargetParser/ARMTargetParser.h" @@ -72,3 +74,80 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { return false; } } + +std::pair<FunctionType *, AttributeList> +RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT, + const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const { + static constexpr Attribute::AttrKind CommonFnAttrs[] = { + Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync, + Attribute::NoUnwind, Attribute::WillReturn}; + + switch (LibcallImpl) { + case RTLIB::impl___sincos_stret: + case RTLIB::impl___sincosf_stret: { + if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected + return {}; + + Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret + ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + + AttrBuilder FuncAttrBuilder(Ctx); + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + + const bool UseSret = + TT.isX86_32() || ((TT.isARM() || TT.isThumb()) && + ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS); + + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly( + UseSret ? ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + if (UseSret) { + AttrBuilder AttrBuilder(Ctx); + StructType *StructTy = StructType::get(ScalarTy, ScalarTy); + AttrBuilder.addStructRetAttr(StructTy); + AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy)); + FunctionType *FuncTy = FunctionType::get( + Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false); + + return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)}; + } + + Type *RetTy = + LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64() + ? 
static_cast<Type *>(FixedVectorType::get(ScalarTy, 2)) + : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy)); + + return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs}; + } + case RTLIB::impl_sqrtf: + case RTLIB::impl_sqrt: { + AttrBuilder FuncAttrBuilder(Ctx); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false); + + Attrs = Attrs.addRetAttribute( + Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal | + fcNegNormal)); + return {FuncTy, Attrs}; + } + default: + return {}; + } + + return {}; +} diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index b775cbb..95d61a9 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -148,18 +148,10 @@ void Value::destroyValueName() { } bool Value::hasNUses(unsigned N) const { - if (!UseList) - return N == 0; - - // TODO: Disallow for ConstantData and remove !UseList check? return hasNItems(use_begin(), use_end(), N); } bool Value::hasNUsesOrMore(unsigned N) const { - // TODO: Disallow for ConstantData and remove !UseList check? 
- if (!UseList) - return N == 0; - return hasNItemsOrMore(use_begin(), use_end(), N); } diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index b618222..23be42f 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1076,63 +1076,59 @@ Expected<ArrayRef<SymbolResolution>> LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, ArrayRef<SymbolResolution> Res) { llvm::TimeTraceScope timeScope("LTO add thin LTO"); + const auto BMID = BM.getModuleIdentifier(); ArrayRef<SymbolResolution> ResTmp = Res; for (const InputFile::Symbol &Sym : Syms) { assert(!ResTmp.empty()); const SymbolResolution &R = ResTmp.consume_front(); - if (!Sym.getIRName().empty()) { + if (!Sym.getIRName().empty() && R.Prevailing) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (R.Prevailing) - ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()); + ThinLTO.setPrevailingModuleForGUID(GUID, BMID); } } - if (Error Err = - BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(), - [&](GlobalValue::GUID GUID) { - return ThinLTO.isPrevailingModuleForGUID( - GUID, BM.getModuleIdentifier()); - })) + if (Error Err = BM.readSummary( + ThinLTO.CombinedIndex, BMID, [&](GlobalValue::GUID GUID) { + return ThinLTO.isPrevailingModuleForGUID(GUID, BMID); + })) return Err; - LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); + LLVM_DEBUG(dbgs() << "Module " << BMID << "\n"); for (const InputFile::Symbol &Sym : Syms) { assert(!Res.empty()); const SymbolResolution &R = Res.consume_front(); - if (!Sym.getIRName().empty()) { + if (!Sym.getIRName().empty() && + (R.Prevailing || R.FinalDefinitionInLinkageUnit)) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); if (R.Prevailing) { - assert( - ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier())); + 
assert(ThinLTO.isPrevailingModuleForGUID(GUID, BMID)); // For linker redefined symbols (via --wrap or --defsym) we want to // switch the linkage to `weak` to prevent IPOs from happening. // Find the summary in the module for this very GV and record the new // linkage so that we can switch it when we import the GV. if (R.LinkerRedefined) - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) S->setLinkage(GlobalValue::WeakAnyLinkage); } // If the linker resolved the symbol to a local definition then mark it // as local in the summary for the module we are adding. if (R.FinalDefinitionInLinkageUnit) { - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) { + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) { S->setDSOLocal(true); } } } } - if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second) + if (!ThinLTO.ModuleMap.insert({BMID, BM}).second) return make_error<StringError>( "Expected at most one ThinLTO module per bitcode file", inconvertibleErrorCode()); @@ -1143,10 +1139,10 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // This is a fuzzy name matching where only modules with name containing the // specified switch values are going to be compiled. 
for (const std::string &Name : Conf.ThinLTOModulesToCompile) { - if (BM.getModuleIdentifier().contains(Name)) { - ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM}); - LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier() - << " to compile\n"); + if (BMID.contains(Name)) { + ThinLTO.ModulesToCompile->insert({BMID, BM}); + LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BMID << " to compile\n"); + break; } } } diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 71bd397..a3eaaa7 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -520,7 +520,7 @@ GOFFObjectWriter::GOFFObjectWriter( std::unique_ptr<MCGOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS) : TargetObjectWriter(std::move(MOTW)), OS(OS) {} -GOFFObjectWriter::~GOFFObjectWriter() {} +GOFFObjectWriter::~GOFFObjectWriter() = default; uint64_t GOFFObjectWriter::writeObject() { uint64_t Size = GOFFWriter(OS, *Asm).writeObject(); diff --git a/llvm/lib/MC/MCDXContainerWriter.cpp b/llvm/lib/MC/MCDXContainerWriter.cpp index 5eda039..ebed411 100644 --- a/llvm/lib/MC/MCDXContainerWriter.cpp +++ b/llvm/lib/MC/MCDXContainerWriter.cpp @@ -16,7 +16,7 @@ using namespace llvm; -MCDXContainerTargetWriter::~MCDXContainerTargetWriter() {} +MCDXContainerTargetWriter::~MCDXContainerTargetWriter() = default; uint64_t DXContainerObjectWriter::writeObject() { auto &Asm = *this->Asm; diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp index 8b228db..ad6397b 100644 --- a/llvm/lib/MC/MCGOFFStreamer.cpp +++ b/llvm/lib/MC/MCGOFFStreamer.cpp @@ -20,7 +20,7 @@ using namespace llvm; -MCGOFFStreamer::~MCGOFFStreamer() {} +MCGOFFStreamer::~MCGOFFStreamer() = default; GOFFObjectWriter &MCGOFFStreamer::getWriter() { return static_cast<GOFFObjectWriter &>(getAssembler().getWriter()); diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp index 5e37137..d693ea3 100644 --- 
a/llvm/lib/MC/SPIRVObjectWriter.cpp +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" @@ -17,8 +18,10 @@ using namespace llvm; void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { constexpr uint32_t MagicNumber = 0x07230203; constexpr uint32_t GeneratorID = 43; - constexpr uint32_t GeneratorMagicNumber = - (GeneratorID << 16) | (LLVM_VERSION_MAJOR); + const uint32_t GeneratorMagicNumber = + Asm.getContext().getTargetTriple().getVendor() == Triple::AMD + ? UINT16_MAX + : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR)); constexpr uint32_t Schema = 0; W.write<uint32_t>(MagicNumber); diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h index 66d7f01..3ee0e06 100644 --- a/llvm/lib/ObjCopy/COFF/COFFWriter.h +++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h @@ -50,7 +50,7 @@ class COFFWriter { Expected<uint32_t> virtualAddressToFileAddress(uint32_t RVA); public: - virtual ~COFFWriter() {} + virtual ~COFFWriter() = default; Error write(); COFFWriter(Object &Obj, raw_ostream &Out) diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index 4f6473f..2783ef27 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -134,7 +134,7 @@ private: using Elf_Sym = typename ELFT::Sym; public: - ~ELFSectionWriter() override {} + ~ELFSectionWriter() override = default; Error visit(const SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; Error visit(const GnuDebugLinkSection &Sec) override; @@ -180,7 +180,7 @@ public: class BinarySectionWriter : public SectionWriter { public: - ~BinarySectionWriter() override {} + ~BinarySectionWriter() override = default; Error visit(const SymbolTableSection &Sec) override; Error 
visit(const RelocationSection &Sec) override; @@ -346,7 +346,7 @@ private: size_t totalSize() const; public: - ~ELFWriter() override {} + ~ELFWriter() override = default; bool WriteSectionHeaders; // For --only-keep-debug, select an alternative section/segment layout @@ -367,7 +367,7 @@ private: uint64_t TotalSize = 0; public: - ~BinaryWriter() override {} + ~BinaryWriter() override = default; Error finalize() override; Error write() override; BinaryWriter(Object &Obj, raw_ostream &Out, const CommonConfig &Config) @@ -784,7 +784,7 @@ private: SymbolTableSection *Symbols = nullptr; public: - ~SectionIndexSection() override {} + ~SectionIndexSection() override = default; void addIndex(uint32_t Index) { assert(Size > 0); Indexes.push_back(Index); diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.h b/llvm/lib/ObjCopy/MachO/MachOReader.h index e315e6fd..940ba4c 100644 --- a/llvm/lib/ObjCopy/MachO/MachOReader.h +++ b/llvm/lib/ObjCopy/MachO/MachOReader.h @@ -23,7 +23,7 @@ namespace macho { // raw binaries and regular MachO object files. 
class Reader { public: - virtual ~Reader(){}; + virtual ~Reader() = default; virtual Expected<std::unique_ptr<Object>> create() const = 0; }; diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h index 8620548..47639ad 100644 --- a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h +++ b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h @@ -20,7 +20,7 @@ namespace xcoff { class XCOFFWriter { public: - virtual ~XCOFFWriter() {} + virtual ~XCOFFWriter() = default; XCOFFWriter(Object &Obj, raw_ostream &Out) : Obj(Obj), Out(Out) {} Error write(); diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index e09dc94..c2f4560 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } -ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset, +ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset, uint64_t Size) const { return arrayRefFromStringRef(getData().substr(Offset, Size)); } Expected<ArrayRef<uint8_t>> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { - uint32_t Offset; + uint64_t Offset; uint64_t Size; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Offset = Sect.offset; Size = Sect.size; + // Check for large mach-o files where the section contents might exceed + // 4GB. MachO::section_64 objects only have 32 bit file offsets to the + // section contents and can overflow in dSYM files. We can track this and + // adjust the section offset to be 64 bit safe. If sections overflow then + // section ordering is enforced. If sections are not ordered, then an error + // will be returned stopping invalid section data from being returned. 
+ uint64_t PrevTrueOffset = 0; + uint64_t SectOffsetAdjust = 0; + for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) { + MachO::section_64 CurrSect = + getStruct<MachO::section_64>(*this, Sections[SectIdx]); + uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust; + if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset)) + return malformedError("section data exceeds 4GB and section file " + "offsets are not ordered"); + const uint64_t EndSectFileOffset = + (uint64_t)CurrSect.offset + CurrSect.size; + if (EndSectFileOffset > UINT32_MAX) + SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull; + PrevTrueOffset = CurrTrueOffset; + } + Offset += SectOffsetAdjust; } else { MachO::section Sect = getSection(Sec); Offset = Sect.offset; diff --git a/llvm/lib/ObjectYAML/GOFFYAML.cpp b/llvm/lib/ObjectYAML/GOFFYAML.cpp index 60bc1f7..ecd7fb6 100644 --- a/llvm/lib/ObjectYAML/GOFFYAML.cpp +++ b/llvm/lib/ObjectYAML/GOFFYAML.cpp @@ -15,7 +15,7 @@ namespace llvm { namespace GOFFYAML { -Object::Object() {} +Object::Object() = default; } // namespace GOFFYAML diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac0..3f41618 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication( static cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableHotColdSplit("hot-cold-split", diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 7290a86..6b7e980 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -537,7 +537,7 @@ void IRChangedPrinter::handleAfter(StringRef PassID, std::string &Name, Out << "*** IR Dump After " << PassID << " on " << 
Name << " ***\n" << After; } -IRChangedTester::~IRChangedTester() {} +IRChangedTester::~IRChangedTester() = default; void IRChangedTester::registerCallbacks(PassInstrumentationCallbacks &PIC) { if (TestChanged != "") @@ -1566,7 +1566,7 @@ void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) { TextChangeReporter<IRDataT<EmptyData>>::registerRequiredCallbacks(PIC); } -TimeProfilingPassesHandler::TimeProfilingPassesHandler() {} +TimeProfilingPassesHandler::TimeProfilingPassesHandler() = default; void TimeProfilingPassesHandler::registerCallbacks( PassInstrumentationCallbacks &PIC) { diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index fb6ff62..6f5d072 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -637,7 +637,7 @@ Context::Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx), IRTracker(*this), LLVMIRBuilder(LLVMCtx, ConstantFolder()) {} -Context::~Context() {} +Context::~Context() = default; void Context::clear() { // TODO: Ideally we should clear only function-scope objects, and keep global diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index 1914f4c..d859abd 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -231,7 +231,7 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, } // Compute move gains - typedef std::pair<float, BPFunctionNode *> GainPair; + using GainPair = std::pair<float, BPFunctionNode *>; std::vector<GainPair> Gains; for (auto &N : Nodes) { bool FromLeftToRight = (N.Bucket == LeftBucket); diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp index ea42f34..143e58a 100644 --- a/llvm/lib/Support/BranchProbability.cpp +++ b/llvm/lib/Support/BranchProbability.cpp @@ -20,8 +20,6 @@ using namespace llvm; -constexpr uint32_t BranchProbability::D; - raw_ostream &BranchProbability::print(raw_ostream 
&OS) const { if (isUnknown()) return OS << "?%"; diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index de5bd79..dab8bee 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -2343,10 +2343,10 @@ namespace { class HelpPrinter { protected: const bool ShowHidden; - typedef SmallVector<std::pair<const char *, Option *>, 128> - StrOptionPairVector; - typedef SmallVector<std::pair<const char *, SubCommand *>, 128> - StrSubCommandPairVector; + using StrOptionPairVector = + SmallVector<std::pair<const char *, Option *>, 128>; + using StrSubCommandPairVector = + SmallVector<std::pair<const char *, SubCommand *>, 128>; // Print the options. Opts is assumed to be alphabetically sorted. virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) { for (const auto &Opt : Opts) diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index 98153647..3bfae14 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -47,16 +47,16 @@ class DAGDeltaAlgorithmImpl { friend class DeltaActiveSetHelper; public: - typedef DAGDeltaAlgorithm::change_ty change_ty; - typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; - typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; - typedef DAGDeltaAlgorithm::edge_ty edge_ty; + using change_ty = DAGDeltaAlgorithm::change_ty; + using changeset_ty = DAGDeltaAlgorithm::changeset_ty; + using changesetlist_ty = DAGDeltaAlgorithm::changesetlist_ty; + using edge_ty = DAGDeltaAlgorithm::edge_ty; private: - typedef std::vector<change_ty>::iterator pred_iterator_ty; - typedef std::vector<change_ty>::iterator succ_iterator_ty; - typedef std::set<change_ty>::iterator pred_closure_iterator_ty; - typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + using pred_iterator_ty = std::vector<change_ty>::iterator; + using succ_iterator_ty = std::vector<change_ty>::iterator; + using 
pred_closure_iterator_ty = std::set<change_ty>::iterator; + using succ_closure_iterator_ty = std::set<change_ty>::iterator; DAGDeltaAlgorithm &DDA; diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index f1c15c0..61566d3 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -23,7 +23,7 @@ using namespace llvm::sys; // All methods for HandleSet should be used holding SymbolsMutex. class DynamicLibrary::HandleSet { - typedef std::vector<void *> HandleList; + using HandleList = std::vector<void *>; HandleList Handles; void *Process = &Invalid; diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 708e79d..012e1ff 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -51,7 +51,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { std::tie(Part, Str) = Str.split('.'); // Each part of the accessor needs to be saved to the arena // to ensure it has a stable address. - Tokens.push_back(Ctx.Saver.save(Part.trim())); + Tokens.push_back(Part.trim()); } } // Now, allocate memory for the array of StringRefs in the arena. @@ -368,141 +368,99 @@ struct Tag { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. 
- - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? 
' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first - << ", NewClose: " << Ret.second << "\n"); - return Ret; -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open - << "', Close:'" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. 
+ ++Cursor; + continue; } - // Add the text before the tag. - if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Found a tag, first add the preceding text. + if (Cursor > TextStart) + Tokens.emplace_back(Template.slice(TextStart, Cursor)); + + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. + Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. 
- Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. - // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) + Tokens.emplace_back(Template.substr(TextStart)); + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. 
- // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 9d45096..b08f508 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -207,7 +207,7 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { namespace { -typedef StringMap<Timer> Name2TimerMap; +using Name2TimerMap = StringMap<Timer>; class Name2PairMap { StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map; diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp index 6f8e091..8f0d24e 100644 --- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp +++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp @@ -251,10 +251,10 @@ constexpr const char *const HangulSyllables[][3] = { // Unicode 15.0 // 3.12 Conjoining Jamo Behavior Common constants -constexpr const char32_t SBase = 0xAC00; -constexpr const uint32_t LCount = 19; -constexpr const uint32_t VCount = 21; -constexpr const uint32_t TCount = 28; +constexpr char32_t SBase = 0xAC00; +constexpr uint32_t LCount = 19; +constexpr uint32_t VCount = 21; +constexpr uint32_t TCount = 28; static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column) { diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index afce803..8ad20b4 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -46,12 +46,11 @@ using namespace llvm; // Context //===----------------------------------------------------------------------===// -namespace llvm::detail { /// This class represents the internal implementation of the RecordKeeper. /// It contains all of the contextual static state of the Record classes. It is /// kept out-of-line to simplify dependencies, and also make it easier for /// internal classes to access the uniquer state of the keeper. 
-struct RecordKeeperImpl { +struct detail::RecordKeeperImpl { RecordKeeperImpl(RecordKeeper &RK) : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK), SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK), @@ -99,7 +98,6 @@ struct RecordKeeperImpl { void dumpAllocationStats(raw_ostream &OS) const; }; -} // namespace llvm::detail void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const { // Dump memory allocation related stats. diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 1169f26..97298f9 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -655,16 +655,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit); IRBuilder<> B(BB); - // Load the global symbol as a pointer to the check function. - Value *GuardFn; - if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf")) - GuardFn = GuardFnCFGlobal; - else - GuardFn = GuardFnGlobal; - LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFn); - - // Create new call instruction. The CFGuard check should always be a call, - // even if the original CallBase is an Invoke or CallBr instruction. + // Create new call instruction. The call check should always be a call, + // even if the original CallBase is an Invoke or CallBr instructio. + // This is treated as a direct call, so do not use GuardFnCFGlobal. 
+ LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFnGlobal); Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes()); CallInst *GuardCheck = B.CreateCall( GuardFnType, GuardCheckLoad, {F, Thunk}); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 1b5a713..34c85d5 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError def CSR_Win_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>; +def CSR_Win_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +def CSR_Win_AArch64_RT_AllRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>; + // The Control Flow Guard check call uses a custom calling convention that also // preserves X0-X8 and Q0-Q7. def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cf34498..18e246e 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -81,10 +81,7 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - using BaseKind = enum { - RegBase, - FrameIndexBase - }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 0f7b34c..3ee4d58 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2380,13 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot( return; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) 
{ - LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " - "sized objects or realignment\n"); - return; - } - // If another calling convention is explicitly set FPRs can't be promoted to // ZPR callee-saves. if (!is_contained({CallingConv::C, CallingConv::Fast, @@ -2402,6 +2395,7 @@ void AArch64FrameLowering::determineStackHazardSlot( assert(Subtarget.isSVEorStreamingSVEAvailable() && "Expected SVE to be available for PPRs"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); // With SplitSVEObjects the CS hazard padding is placed between the // PPRs and ZPRs. If there are any FPR CS there would be a hazard between // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e..d08f9b9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. 
@@ -5346,35 +5340,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -7723,8 +7688,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed2..70bfae7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ private: SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue 
LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e69fa32..2ab7bf1 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (MOP.isReg() && MOP.isKill()) DefinedInBB.addReg(MOP.getReg()); + // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but + // only copies implicit defs and makes sure that each operand is only added + // once in case of duplicates. + auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1, + MachineBasicBlock::iterator MI2) { + SmallSetVector<Register, 4> Ops; + for (const MachineOperand &MO : + llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (const MachineOperand &MO : + llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (auto Op : Ops) + MIB.addDef(Op, RegState::Implicit); + }; + CopyImplicitOps(I, Paired); + // Erase the old instructions. 
I->eraseFromParent(); Paired->eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 7e03b97..4df4d54 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -370,6 +370,22 @@ SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const { {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}}; } +SVEStackAllocations AArch64PrologueEpilogueCommon::getSVEStackAllocations( + SVEFrameSizes const &SVE) { + StackOffset AfterZPRs = SVE.ZPR.LocalsSize; + StackOffset BeforePPRs = SVE.ZPR.CalleeSavesSize + SVE.PPR.CalleeSavesSize; + StackOffset AfterPPRs = {}; + if (SVELayout == SVEStackLayout::Split) { + BeforePPRs = SVE.PPR.CalleeSavesSize; + // If there are no ZPR CSRs, place all local allocations after the ZPRs. + if (SVE.ZPR.CalleeSavesSize) + AfterPPRs += SVE.PPR.LocalsSize + SVE.ZPR.CalleeSavesSize; + else + AfterZPRs += SVE.PPR.LocalsSize; // Group allocation of locals. + } + return {BeforePPRs, AfterPPRs, AfterZPRs}; +} + struct SVEPartitions { struct { MachineBasicBlock::iterator Begin, End; @@ -687,16 +703,19 @@ void AArch64PrologueEmitter::emitPrologue() { // All of the remaining stack allocations are for locals. determineLocalsStackSize(NumBytes, PrologueSaveSize); + auto [PPR, ZPR] = getSVEStackFrameSizes(); + SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); + MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI; if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + assert(!SVEAllocs.AfterPPRs && + "unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord"); // If we're doing SVE saves first, we need to immediately allocate space // for fixed objects, then space for the SVE callee saves. // // Windows unwind requires that the scalable size is a multiple of 16; // that's handled when the callee-saved size is computed. 
- auto SaveSize = - StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) + - StackOffset::getFixed(FixedObject); + auto SaveSize = SVEAllocs.BeforePPRs + StackOffset::getFixed(FixedObject); allocateStackSpace(PrologueBeginI, 0, SaveSize, false, StackOffset{}, /*FollowupAllocs=*/true); NumBytes -= FixedObject; @@ -764,12 +783,11 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - auto [PPR, ZPR] = getSVEStackFrameSizes(); - StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes); + SVEAllocs.AfterZPRs += NonSVELocalsSize; + StackOffset CFAOffset = StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize; - MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; // Allocate space for the callee saves and PPR locals (if any). if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -780,31 +798,23 @@ void AArch64PrologueEmitter::emitPrologue() { if (EmitAsyncCFI) emitCalleeSavedSVELocations(AfterSVESavesI); - StackOffset AllocateBeforePPRs = SVECalleeSavesSize; - StackOffset AllocateAfterPPRs = PPR.LocalsSize; - if (SVELayout == SVEStackLayout::Split) { - AllocateBeforePPRs = PPR.CalleeSavesSize; - AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize; - } - allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs, + allocateStackSpace(PPRRange.Begin, 0, SVEAllocs.BeforePPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || AllocateAfterPPRs || - ZPR.LocalsSize || NonSVELocalsSize); - CFAOffset += AllocateBeforePPRs; + MFI.hasVarSizedObjects() || SVEAllocs.AfterPPRs || + SVEAllocs.AfterZPRs); + CFAOffset += SVEAllocs.BeforePPRs; assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs, + 
allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPR.LocalsSize || - NonSVELocalsSize); - CFAOffset += AllocateAfterPPRs; + MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs); + CFAOffset += SVEAllocs.AfterPPRs; } else { assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord); - // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been - // allocated (and separate PPR locals are not supported, all SVE locals, - // both PPR and ZPR, are within the ZPR locals area). - assert(!PPR.LocalsSize && "Unexpected PPR locals!"); - CFAOffset += SVECalleeSavesSize; + // Note: With CalleeSavesAboveFrameRecord, the SVE CS (BeforePPRs) have + // already been allocated. PPR locals (included in AfterPPRs) are not + // supported (note: this is asserted above). + CFAOffset += SVEAllocs.BeforePPRs; } // Allocate space for the rest of the frame including ZPR locals. Align the @@ -815,9 +825,9 @@ void AArch64PrologueEmitter::emitPrologue() { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the // correct value here, as NumBytes also includes padding bytes, which // shouldn't be counted here. - allocateStackSpace( - AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize, - EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); + allocateStackSpace(AfterSVESavesI, RealignmentPadding, SVEAllocs.AfterZPRs, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects()); } // If we need a base pointer, set it up here. It's whatever the value of the @@ -1308,6 +1318,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF, SEHEpilogueStartI = MBB.end(); } +void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI, + StackOffset Offset) { + // Other combinations could be supported, but are not currently needed. 
+ assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 && + "expected negative offset (with optional fixed portion)"); + Register Base = AArch64::FP; + if (int64_t FixedOffset = Offset.getFixed()) { + // If we have a negative fixed offset, we need to first subtract it in a + // temporary register first (to avoid briefly deallocating the scalable + // portion of the offset). + Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP, + StackOffset::getFixed(FixedOffset), TII, + MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base, + StackOffset::getScalable(Offset.getScalable()), TII, + MachineInstr::FrameDestroy); +} + void AArch64EpilogueEmitter::emitEpilogue() { MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); if (MBB.end() != EpilogueEndI) { @@ -1408,6 +1438,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { AfterCSRPopSize += ProloguePopSize; } } + // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1472,27 +1503,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { assert(NumBytes >= 0 && "Negative stack allocation size!?"); StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; - StackOffset SVEStackSize = - SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; - MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; - MachineBasicBlock::iterator RestoreEnd = PPRRange.End; + SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); // Deallocate the SVE area. 
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { - StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize; + assert(!SVEAllocs.AfterPPRs && + "unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord"); // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate them + // deallocates non-callee-save SVE allocations. Otherwise, deallocate them // explicitly. if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - SVELocalsSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } else if (AFI->hasSVEStackSize()) { // If we have stack realignment or variable-sized objects we must use the FP // to restore SVE callee saves (as there is an unknown amount of @@ -1501,69 +1530,53 @@ void AArch64EpilogueEmitter::emitEpilogue() { (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP : AArch64::SP; if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { - // TODO: Support stack realigment and variable-sized objects. 
- assert( - SVELayout != SVEStackLayout::Split && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); + if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the ZPR saves. + StackOffset FPOffsetZPR = + -SVECalleeSavesSize - PPR.LocalsSize - + StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset()); + // Deallocate the stack space space by moving the SP to the start of the + // ZPR/PPR callee-save area. + moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR); } - // The code below will deallocate the stack space space by moving the SP - // to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - auto CFAOffset = - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); - - if (SVECalleeSavesSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. 
- auto NonSVELocals = StackOffset::getFixed(NumBytes); - emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, - NonSVELocals, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= NonSVELocals; - NumBytes = 0; + // With split SVE, the predicates are stored in a separate area above the + // ZPR saves, so we must adjust the stack to the start of the PPRs. + if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the PPR saves. + StackOffset FPOffsetPPR = -PPR.CalleeSavesSize; + // Move to the start of the PPR area. + assert(!FPOffsetPPR.getFixed() && "expected only scalable offset"); + emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP, + FPOffsetPPR, TII, MachineInstr::FrameDestroy); } - - if (ZPR.LocalsSize) { - emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, - ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= ZPR.LocalsSize; - } - - StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize; - if (SVELayout == SVEStackLayout::Split && - (PPR.LocalsSize || ZPR.CalleeSavesSize)) { - assert(PPRRange.Begin == ZPRRange.End && - "Expected PPR restores after ZPR"); - emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, - PPR.LocalsSize + ZPR.CalleeSavesSize, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize; - SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize; + } else if (BaseForSVEDealloc == AArch64::SP) { + auto NonSVELocals = StackOffset::getFixed(NumBytes); + auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) + + SVEAllocs.totalSize(); + + if (SVECalleeSavesSize || SVELayout == SVEStackLayout::Split) { + // Deallocate non-SVE locals now. 
This is needed to reach the SVE callee + // saves, but may also allow combining stack hazard bumps for split SVE. + SVEAllocs.AfterZPRs += NonSVELocals; + NumBytes -= NonSVELocals.getFixed(); } - - // If split SVE is on, this dealloc PPRs, otherwise, deallocs ZPRs + PPRs: - if (SVECalleeSavesToDealloc) - emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, - SVECalleeSavesToDealloc, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + // To deallocate the SVE stack adjust by the allocations in reverse. + emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); + CFAOffset -= SVEAllocs.AfterZPRs; + assert(PPRRange.Begin == ZPRRange.End && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, + SVEAllocs.AfterPPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); + CFAOffset -= SVEAllocs.AfterPPRs; + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); } if (EmitCFI) diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index bccadda..7f297b5 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -33,6 +33,11 @@ struct SVEFrameSizes { } PPR, ZPR; }; +struct SVEStackAllocations { + StackOffset BeforePPRs, AfterPPRs, AfterZPRs; + StackOffset totalSize() const { return BeforePPRs + AfterPPRs + AfterZPRs; } +}; + class AArch64PrologueEpilogueCommon { public: AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB, @@ -66,6 +71,7 @@ protected: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) 
const; SVEFrameSizes getSVEStackFrameSizes() const; + SVEStackAllocations getSVEStackAllocations(SVEFrameSizes const &); MachineFunction &MF; MachineBasicBlock &MBB; @@ -174,6 +180,10 @@ public: private: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + /// A helper for moving the SP to a negative offset from the FP, without + /// deallocating any stack in the range FP to FP + Offset. + void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset); + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5bfb19d9..a5048b9 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) return getDarwinCalleeSavedRegs(MF); + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? CSR_Win_AArch64_RT_MostRegs_SaveList + : CSR_AArch64_RT_MostRegs_SaveList; + + if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? 
CSR_Win_AArch64_RT_AllRegs_SaveList + : CSR_AArch64_RT_AllRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) return CSR_Win_AArch64_CFGuard_Check_SaveList; if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) { @@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail) return CSR_AArch64_AAPCS_SwiftTail_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) - return CSR_AArch64_RT_MostRegs_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) - return CSR_AArch64_RT_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) // This is for OSes other than Windows; Windows is a separate case further // above. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e8352be..197aae6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3007,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { llvm_unreachable("Unsupported register kind"); } -bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy) const { +bool AArch64TTIImpl::isSingleExtWideningInstruction( + unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { // A helper that returns a vector type from the given type. The number of // elements in type Ty determines the vector width. auto toVectorTy = [&](Type *ArgTy) { @@ -3027,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) return false; - // Determine if the operation has a widening variant. 
We consider both the - // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the - // instructions. - // - // TODO: Add additional widening operations (e.g., shl, etc.) once we - // verify that their extending operands are eliminated during code - // generation. Type *SrcTy = SrcOverrideTy; switch (Opcode) { - case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). - case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Add: // UADDW(2), SADDW(2). + case Instruction::Sub: { // USUBW(2), SSUBW(2). // The second operand needs to be an extend if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType()); - } else + break; + } + + if (Opcode == Instruction::Sub) return false; - break; - case Instruction::Mul: { // SMULL(2), UMULL(2) - // Both operands need to be extends of the same type. - if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || - (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + + // UADDW(2), SADDW(2) can be commutted. + if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType()); - } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) { - // If one of the operands is a Zext and the other has enough zero bits to - // be treated as unsigned, we can still general a umull, meaning the zext - // is free. - KnownBits Known = - computeKnownBits(isa<ZExtInst>(Args[0]) ? 
Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3099,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext. 
+ unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? 
Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3159,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required, the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). @@ -3168,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. + } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. 
return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4148,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4381,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28. 
+ if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + @@ -6129,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } static bool containsDecreasingPointers(Loop *TheLoop, - PredicatedScalarEvolution *PSE) { + PredicatedScalarEvolution *PSE, + const DominatorTree &DT) { const auto &Strides = DenseMap<Value *, const SCEV *>(); for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for addresses that are @@ -6138,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop, if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, - /*ShouldCheckWrap=*/false) + if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) .value_or(0) < 0) return true; } @@ -6184,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { // negative strides. This will require extra work to reverse the loop // predicate, which may be expensive. if (containsDecreasingPointers(TFI->LVL->getLoop(), - TFI->LVL->getPredicatedScalarEvolution())) + TFI->LVL->getPredicatedScalarEvolution(), + *TFI->LVL->getDominatorTree())) Required |= TailFoldingOpts::Reverse; if (Required == TailFoldingOpts::Disabled) Required |= TailFoldingOpts::Simple; @@ -6657,10 +6737,15 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( Ops.push_back(&Ext->getOperandUse(0)); Ops.push_back(&Op); - if (isa<SExtInst>(Ext)) + if (isa<SExtInst>(Ext)) { NumSExts++; - else + } else { NumZExts++; + // A zext(a) is also a sext(zext(a)), if we take more than 2 steps. 
+ if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 < + I->getType()->getScalarSizeInBits()) + NumSExts++; + } continue; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index b39546a..e3b0a1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given a add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given a add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'. 
// diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index cd8b249..67042b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { - AMDGPUSimplifyLibCallsPass() {} + AMDGPUSimplifyLibCallsPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -371,13 +371,13 @@ public: class AMDGPUAnnotateUniformValuesPass : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> { public: - AMDGPUAnnotateUniformValuesPass() {} + AMDGPUAnnotateUniformValuesPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { public: - SIModeRegisterPass() {} + SIModeRegisterPass() = default; PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 1064e57..dad94b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -96,7 +96,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { } struct KernArgPreloadDescriptor : public ArgDescriptor { - KernArgPreloadDescriptor() {} + KernArgPreloadDescriptor() = default; SmallVector<MCRegister> Regs; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9907c88f..8669978 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1555,7 +1555,7 @@ private: AMDGPU::ClusterDimsAttr Attr; - static constexpr const char AttrName[] = "amdgpu-cluster-dims"; + static constexpr char AttrName[] = 
"amdgpu-cluster-dims"; }; AAAMDGPUClusterDims & diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index cf2ab825..a3be0f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -48,7 +48,7 @@ private: FuncInfoMap FIM; public: - AMDGPUPerfHintAnalysis() {} + AMDGPUPerfHintAnalysis() = default; // OldPM bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 103cdec..1e5885a2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -202,7 +202,7 @@ bool PredicateMapping::match(const MachineInstr &MI, return true; } -SetOfRulesForOpcode::SetOfRulesForOpcode() {} +SetOfRulesForOpcode::SetOfRulesForOpcode() = default; SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) : FastTypes(FastTypes) {} @@ -913,6 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) .Uni(S64, {{Sgpr64}, {}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d5..fe81a5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + 
Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. 
+ BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. 
- BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. - BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index 61c5dcd..ded2f5a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -54,7 +54,7 @@ public: bool CullSGPRHazardsAtMemWait; unsigned CullSGPRHazardsMemWaitThreshold; - AMDGPUWaitSGPRHazards() {} + AMDGPUWaitSGPRHazards() = default; // Return the numeric ID 0-127 for a given SGPR. 
static std::optional<unsigned> sgprNumber(Register Reg, diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca..1a14629 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce69..1682abb 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ public: bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 975781f..f357981 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -183,7 +183,7 @@ class ScheduleMetrics { unsigned BubbleCycles; public: - ScheduleMetrics() {} + ScheduleMetrics() = default; ScheduleMetrics(unsigned L, unsigned BC) : ScheduleLength(L), BubbleCycles(BC) {} unsigned getLength() const { return ScheduleLength; } @@ -217,7 +217,7 @@ class RegionPressureMap { bool IsLiveOut; public: - RegionPressureMap() {} + RegionPressureMap() = default; RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) : DAG(GCNDAG), IsLiveOut(LiveOut) {} // Build the Instr->LiveReg and 
RegionIdx->Instr maps diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..31eca04 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? - SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a..aa5ea77 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d9f76c9..45f5919 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6153,7 +6153,7 @@ bool 
SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, // information. if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { - constexpr const AMDGPU::OpName OpNames[] = { + constexpr AMDGPU::OpName OpNames[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; for (auto [I, OpName] : enumerate(OpNames)) { @@ -6215,8 +6215,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO) const { - constexpr const unsigned NumOps = 3; - constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + constexpr unsigned NumOps = 3; + constexpr AMDGPU::OpName OpNames[NumOps * 2] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e11..abefa32 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4ae2c1e..31d8bce4 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber 
$vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 14e1160..88d3b6f 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -86,7 +86,7 @@ namespace { // All possible address modes, plus some. 
class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b06534..92fae71 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. 
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. 
- SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b..bc2fec3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9b250e6..24f58a6 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { // static bool canTailPredicateLoop(Loop *L, 
LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, - const LoopAccessInfo *LAI) { + const LoopAccessInfo *LAI, + const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); // If there are live-out values, it is probably a reduction. We can predicate @@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); + int64_t NextStride = + getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0); if (NextStride == 1) { // TODO: for now only allow consecutive strides of 1. We could support // other strides as long as it is uniform, but let's keep it simple @@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } - return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); + return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), + *LVL->getDominatorTree()); } TailFoldingStyle diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 77dc4a7..378a72a 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -88,6 +88,16 @@ bool BPFAsmPrinter::doFinalization(Module &M) { } } + for (GlobalObject &GO : M.global_objects()) { + if (!GO.hasExternalWeakLinkage()) + continue; + + if (!SawTrapCall && GO.getName() == BPF_TRAP) { + GO.eraseFromParent(); + break; + } + } + return AsmPrinter::doFinalization(M); } @@ -160,6 +170,20 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + if (MI->isCall()) { + for (const MachineOperand &Op : MI->operands()) { + if (Op.isGlobal()) { + if (const GlobalValue *GV = Op.getGlobal()) + if (GV->getName() == BPF_TRAP) + SawTrapCall = true; + } else if 
(Op.isSymbol()) { + if (const MCSymbol *Sym = Op.getMCSymbol()) + if (Sym->getName() == BPF_TRAP) + SawTrapCall = true; + } + } + } + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); @@ -195,6 +219,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const Function &F = MF->getFunction(); + + MCSection *Sec = OutStreamer->getCurrentSectionOnly(); + MCSymbol *SecStart = Sec->getBeginSymbol(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); @@ -207,8 +235,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { MCSymbol *JTStart = getJTPublicSymbol(JTI); OutStreamer->emitLabel(JTStart); for (const MachineBasicBlock *MBB : JTBBs) { - const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - OutStreamer->emitValue(LHS, EntrySize); + const MCExpr *Diff = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB->getSymbol(), OutContext), + MCSymbolRefExpr::create(SecStart, OutContext), OutContext); + OutStreamer->emitValue(Diff, EntrySize); } const MCExpr *JTSize = MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 90ef207..75a1d7e 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -39,6 +39,7 @@ public: private: BTFDebug *BTF; TargetMachine &TM; + bool SawTrapCall = false; const BPFTargetMachine &getBTM() const; }; diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 8ace2d2..eb4c884 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -194,9 +194,10 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { dxbc::PSV::v2::ResourceBindInfo 
BindInfo; BindInfo.Type = Type; BindInfo.LowerBound = Binding.LowerBound; - assert(Binding.Size == UINT32_MAX || - (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && - "Resource range is too large"); + assert( + (Binding.Size == UINT32_MAX || + (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) && + "Resource range is too large"); BindInfo.UpperBound = (Binding.Size == UINT32_MAX) ? UINT32_MAX : Binding.LowerBound + Binding.Size - 1; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 7ae500a..67437f6 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { let attributes = [Attributes<DXIL1_0, []>]; } +def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> { + let Doc = "returns the float16 stored in the low-half of the uint converted " + "to a float"; + let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>]; + let arguments = [Int32Ty]; + let result = FloatTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def WaveAllBitCount : DXILOp<135, waveAllOp> { let Doc = "returns the count of bits set to 1 across the wave"; let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>]; diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 60dfd96..6cacbf6 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const { switch (ID) { case Intrinsic::dx_asdouble: - case Intrinsic::dx_isinf: - case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: - case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: + case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: return OpdIdx == 0; 
default: return OpdIdx == -1; @@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_frac: case Intrinsic::dx_isinf: case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 7ee280d..eadf020 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1815,7 +1815,7 @@ struct WeightedLeaf { int Weight; int InsertionOrder; - WeightedLeaf() {} + WeightedLeaf() = default; WeightedLeaf(SDValue Value, int Weight, int InsertionOrder) : Value(Value), Weight(Weight), InsertionOrder(InsertionOrder) { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 904aabed..fe700e1 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -375,6 +375,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); } setOperationAction(ISD::CTPOP, GRLenVT, Legal); setOperationAction(ISD::FCEIL, {MVT::f32, MVT::f64}, Legal); @@ -461,6 +463,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); } } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 610ba05..b502b056 100644 --- 
a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1558,6 +1558,10 @@ defm : PatXrXrF<fmul, "XVFMUL">; // XVFDIV_{S/D} defm : PatXrXrF<fdiv, "XVFDIV">; +// XVFMAX_{S/D}, XVFMIN_{S/D} +defm : PatXrXrF<fmaxnum, "XVFMAX">; +defm : PatXrXrF<fminnum, "XVFMIN">; + // XVFMADD_{S/D} def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), (XVFMADD_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 6470842..6b74a4b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1760,6 +1760,10 @@ defm : PatVrVrF<fmul, "VFMUL">; // VFDIV_{S/D} defm : PatVrVrF<fdiv, "VFDIV">; +// VFMAX_{S/D}, VFMIN_{S/D} +defm : PatVrVrF<fmaxnum, "VFMAX">; +defm : PatVrVrF<fminnum, "VFMIN">; + // VFMADD_{S/D} def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), (VFMADD_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 7d54565..6d69af5 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -39,7 +39,7 @@ LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, /*HasRelocationAddend=*/true) {} -LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {} +LoongArchELFObjectWriter::~LoongArchELFObjectWriter() = default; unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index f0e2bc4..08fa51d 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp 
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -38,7 +38,7 @@ public: LoongArchMCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII) : Ctx(ctx), MCII(MCII) {} - ~LoongArchMCCodeEmitter() override {} + ~LoongArchMCCodeEmitter() override = default; void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB, SmallVectorImpl<MCFixup> &Fixups, diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index df0c8c1..06210b6 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h index caef8fe7..b832b82 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h @@ -20,7 +20,7 @@ class MemoryLocation; class NVPTXAAResult : public AAResultBase { public: - NVPTXAAResult() {} + NVPTXAAResult() = default; NVPTXAAResult(NVPTXAAResult &&Arg) : AAResultBase(std::move(Arg)) {} /// Handle invalidation events from the new pass manager. 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c667a09..996d653 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1836,7 +1836,7 @@ bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { return true; } -NVPTXScopes::NVPTXScopes(LLVMContext &C) { +NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) { Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread; Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System; Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block; @@ -1851,11 +1851,21 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { auto S = Scopes.find(ID); if (S == Scopes.end()) { - // TODO: - // - Add API to LLVMContext to get the name of a single scope. - // - Use that API here to print an error containing the name - // of this Unknown ID. - report_fatal_error(formatv("Could not find scope ID={}.", int(ID))); + auto scopeName = Context->getSyncScopeName(ID); + assert(scopeName.has_value() && "Scope name must exist."); + + // Build list of supported syncscopes programmatically + SmallVector<StringRef> supportedScopes; + for (const auto &Entry : Scopes) { + if (auto name = Context->getSyncScopeName(Entry.first)) + supportedScopes.push_back(name->empty() ? 
"<empty string>" : *name); + } + + reportFatalUsageError( + formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n" + "Supported syncscopes are: {2}.", + scopeName.value(), int(ID), + make_range(supportedScopes.begin(), supportedScopes.end()))); } return S->second; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 1cb579b..d525531 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -35,6 +35,7 @@ struct NVPTXScopes { private: SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{}; + LLVMContext *Context = nullptr; }; class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b260221..f0bdf47 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2267,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>; def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index da3efdc..0c2e44e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -360,6 +360,10 @@ let Predicates = [HasVSX, IsISAFuture] in { def LXVPRLL : XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp), (ins (memr $RA):$addr, g8rc:$RB), "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def LXVPB32X + : XForm_XTp5_RAB5<31, 
877, (outs vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { @@ -376,6 +380,10 @@ let Predicates = [HasVSX, IsISAFuture] in { : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPB32X + : XForm_XTp5_RAB5<31, 1005, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index b38dd4a..fc3cde3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, RegConstraint<"@earlyclobber $AT">; def PM#NAME#WPP : MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), @@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 282cf5d..3d5a55c 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -95,7 +95,8 @@ private: void addVectorLoadStoreOperands(MachineInstr 
&I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, bool IsMasked, - bool IsStrided) const; + bool IsStridedOrIndexed, + LLT *IndexVT = nullptr) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -722,15 +723,17 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { void RISCVInstructionSelector::addVectorLoadStoreOperands( MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, - bool IsMasked, bool IsStrided) const { + bool IsMasked, bool IsStridedOrIndexed, LLT *IndexVT) const { // Base Pointer auto PtrReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(PtrReg); - // Stride - if (IsStrided) { + // Stride or Index + if (IsStridedOrIndexed) { auto StrideReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(StrideReg); + if (IndexVT) + *IndexVT = MRI->getType(StrideReg); } // Mask @@ -805,6 +808,70 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vluxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vloxei_mask || + IntrinID == Intrinsic::riscv_vluxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vloxei || + IntrinID == Intrinsic::riscv_vloxei_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + // Use NoRegister if there is no specified passthru. 
+ SrcOps.push_back(Register()); + } + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } case Intrinsic::riscv_vsm: case Intrinsic::riscv_vse: case Intrinsic::riscv_vse_mask: @@ -847,6 +914,56 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vsuxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vsoxei_mask || + IntrinID == Intrinsic::riscv_vsuxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vsoxei || + IntrinID == Intrinsic::riscv_vsoxei_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. 
+ + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index e75dfe3..5b8cfb2 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -407,7 +407,6 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, - OPERAND_SIMM8, OPERAND_SIMM8_UNSIGNED, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 526675a..b0453fc 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -131,6 +131,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCMAXU: case RISCV::PseudoCCMIN: case RISCV::PseudoCCMINU: + case 
RISCV::PseudoCCMUL: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -237,6 +238,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cfee6ab..5b72334 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1856,6 +1856,11 @@ def TuneShortForwardBranchIMinMax "true", "Enable short forward branch optimization for min,max instructions in Zbb", [TuneShortForwardBranchOpt]>; +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. 
def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index b25a054..9078335 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -371,8 +371,8 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -444,8 +444,8 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2223,8 +2223,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( 
IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2457,8 +2457,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( IsMasked, IsOrdered, IndexLog2EEW, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e0cf739..b860562 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9186,7 +9186,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, unsigned ShAmount = Log2_64(TrueM1); if (Subtarget.hasShlAdd(ShAmount)) return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, - DAG.getConstant(ShAmount, DL, VT), CondV); + DAG.getTargetConstant(ShAmount, DL, VT), CondV); } } // (select c, y, 0) -> -c & y @@ -15463,7 +15463,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? 
N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, - DAG.getConstant(Diff, DL, VT), NS); + DAG.getTargetConstant(Diff, DL, VT), NS); return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT)); } @@ -15501,7 +15501,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, int64_t AddConst = AddVal.getSExtValue(); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), - DAG.getConstant(ShlConst, DL, VT), Other); + DAG.getTargetConstant(ShlConst, DL, VT), Other); return DAG.getNode(ISD::ADD, DL, VT, SHADD, DAG.getSignedConstant(AddConst, DL, VT)); } @@ -16495,6 +16495,35 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getTargetConstant(ShX, DL, VT), Mul359); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt) { + switch (MulAmt) { + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3); + default: + return SDValue(); + } +} + // Try to expand a scalar multiply to a faster sequence. 
static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,18 +16553,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. - SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { + SDValue X = N->getOperand(0); int Shift; if (int ShXAmount = isShifted359(MulAmt, Shift)) { // 3/5/9 * 2^N -> shl (shXadd X, X), N SDLoc DL(N); - SDValue X = N->getOperand(0); // Put the shift first if we can fold a zext into the shift forming // a slli.uw. if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -16543,49 +16571,19 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(ShXAmount, DL, VT), Shl); + DAG.getTargetConstant(ShXAmount, DL, VT), Shl); } // Otherwise, put the shl second so that it can fold with following // instructions (e.g. sext or add). 
SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(ISD::SHL, DL, VT, Mul359, DAG.getConstant(Shift, DL, VT)); } // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); - } + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt)) + return V; // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. First check if this a sum of two power of 2s because that's @@ -16598,7 +16596,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); + DAG.getTargetConstant(ScaleShift, DL, VT), Shift1); } } @@ -16611,10 +16609,11 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, assert(Shift != 0 && "MulAmt=4,6,10 handled before"); if (Shift <= 3) { SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT), X); + DAG.getTargetConstant(Shift, DL, VT), X); } } @@ -16626,9 +16625,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, 
VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ScaleShift, DL, VT), X)); } } @@ -16643,28 +16643,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X); return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. 
+ Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { + SDLoc DL(N); + return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); } } diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp index a1c8e23..c58a5c0 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp @@ -48,7 +48,7 @@ class VXRMInfo { } State = Uninitialized; public: - VXRMInfo() {} + VXRMInfo() = default; static VXRMInfo getUnknown() { VXRMInfo Info; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index c9df787..b8ab70b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1703,6 +1703,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::MAXU: return RISCV::PseudoCCMAXU; case RISCV::MIN: return RISCV::PseudoCCMIN; case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1754,6 +1755,9 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) return nullptr; + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. 
if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 5a67a5a..494b1c9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -110,6 +110,7 @@ def PseudoCCMAX : SFBALU_rr; def PseudoCCMIN : SFBALU_rr; def PseudoCCMAXU : SFBALU_rr; def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index b37ceaae..c2b25c6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -60,6 +60,8 @@ def immfour : RISCVOp { let DecoderMethod = "decodeImmFourOperand"; } +def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction, let Predicates = [HasVendorXTHeadBa] in { def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)), (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; -def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)), - (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; +def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)), + (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>; // Reuse complex patterns from StdExtZba def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 4537bfe..8376da5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -53,6 +53,8 @@ def uimm5gt3 
: RISCVOp<XLenVT>, ImmLeaf<XLenVT, let OperandType = "OPERAND_UIMM5_GT3"; } +def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>; + def UImm5Plus1AsmOperand : AsmOperandClass { let Name = "UImm5Plus1"; let RenderMethod = "addImmOperands"; @@ -1419,8 +1421,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; -def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)), - (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)), + (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] /// Simple arithmetic operations diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index c31713e..1c6a5af 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,6 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { +let canFoldAsLoad = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp index 0a318e0..ed6d355 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp @@ -15,4 +15,4 @@ using namespace llvm; SPIRVTargetStreamer::SPIRVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} -SPIRVTargetStreamer::~SPIRVTargetStreamer() {} 
+SPIRVTargetStreamer::~SPIRVTargetStreamer() = default; diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 640b014..0175f2f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::SubgroupSize, 0, 0); + if (MDNode *Node = F.getMetadata("max_work_group_size")) { + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes)) + outputExecutionModeFromMDNode( + FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1); + } if (MDNode *Node = F.getMetadata("vec_type_hint")) { MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 9e11c3a..dd57b74 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -149,23 +149,23 @@ static FunctionType *getOriginalFunctionType(const Function &F) { return isa<MDString>(N->getOperand(0)) && cast<MDString>(N->getOperand(0))->getString() == F.getName(); }); - // TODO: probably one function can have numerous type mutations, - // so we should support this. if (ThisFuncMDIt != NamedMD->op_end()) { auto *ThisFuncMD = *ThisFuncMDIt; - MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); - assert(MD && "MDNode operand is expected"); - ConstantInt *Const = getConstInt(MD, 0); - if (Const) { - auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); - assert(CMeta && "ConstantAsMetadata operand is expected"); - assert(Const->getSExtValue() >= -1); - // Currently -1 indicates return value, greater values mean - // argument numbers. 
- if (Const->getSExtValue() == -1) - RetTy = CMeta->getType(); - else - ArgTypes[Const->getSExtValue()] = CMeta->getType(); + for (unsigned I = 1; I != ThisFuncMD->getNumOperands(); ++I) { + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(I)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. + if (Const->getSExtValue() == -1) + RetTy = CMeta->getType(); + else + ArgTypes[Const->getSExtValue()] = CMeta->getType(); + } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 96f5dee..43b2869 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -107,6 +107,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_inline_assembly}, {"SPV_INTEL_bindless_images", SPIRV::Extension::Extension::SPV_INTEL_bindless_images}, + {"SPV_INTEL_bfloat16_arithmetic", + SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic}, {"SPV_INTEL_bfloat16_conversion", SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion}, {"SPV_KHR_subgroup_rotate", @@ -155,7 +157,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_INTEL_predicated_io", SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, {"SPV_KHR_maximal_reconvergence", - SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, + {"SPV_INTEL_kernel_attributes", + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git 
a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3f0424f..245e5a2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3516,6 +3516,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_nonuniformindex: { return selectResourceNonUniformIndex(ResVReg, ResType, I); } + case Intrinsic::spv_unpackhalf2x16: { + return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16); + } + default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index db036a5..e5ac76c4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1435,6 +1435,8 @@ void addInstrRequirements(const MachineInstr &MI, addPrintfRequirements(MI, Reqs, ST); break; } + // TODO: handle bfloat16 extended instructions when + // SPV_INTEL_bfloat16_arithmetic is enabled. 
break; } case SPIRV::OpAliasDomainDeclINTEL: @@ -2060,7 +2062,64 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL); break; } - + case SPIRV::OpFAddS: + case SPIRV::OpFSubS: + case SPIRV::OpFMulS: + case SPIRV::OpFDivS: + case SPIRV::OpFRemS: + case SPIRV::OpFMod: + case SPIRV::OpFNegate: + case SPIRV::OpFAddV: + case SPIRV::OpFSubV: + case SPIRV::OpFMulV: + case SPIRV::OpFDivV: + case SPIRV::OpFRemV: + case SPIRV::OpFNegateV: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Arithmetic instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg()); + SPIRVType *TypeDef = MRI.getVRegDef(OperandDef->getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if 
(!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Relational instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } default: break; } @@ -2180,6 +2239,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::SubgroupSize, ST); + if (F.getMetadata("max_work_group_size")) + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST); if (F.getMetadata("vec_type_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 2d19f6de..44b6c66 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -81,7 +81,7 @@ private: void initAvailableCapabilitiesForVulkan(const SPIRVSubtarget &ST); public: - RequirementHandler() {} + RequirementHandler() = default; void clear() { MinimalCaps.clear(); AllCaps.clear(); diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index ba09692..ad6c9cd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, SPIRVVersion = VersionTuple(1, 3); break; case Triple::SPIRVSubArch_v14: - default: SPIRVVersion = VersionTuple(1, 4); break; case Triple::SPIRVSubArch_v15: @@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, case Triple::SPIRVSubArch_v16: SPIRVVersion = VersionTuple(1, 6); break; + 
default: + if (TT.getVendor() == Triple::AMD) + SPIRVVersion = VersionTuple(1, 6); + else + SPIRVVersion = VersionTuple(1, 4); } OpenCLVersion = VersionTuple(2, 2); // Set the environment based on the target triple. if (TargetTriple.getOS() == Triple::Vulkan) Env = Shader; - else if (TargetTriple.getEnvironment() == Triple::OpenCL) + else if (TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getVendor() == Triple::AMD) Env = Kernel; else Env = Unknown; @@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, // Set the default extensions based on the target triple. if (TargetTriple.getVendor() == Triple::Intel) Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers); + if (TargetTriple.getVendor() == Triple::AMD) + Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple); // The order of initialization is important. initAvailableExtensions(Extensions); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 7d08b29..1b4b29b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -387,6 +387,8 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; +defm SPV_INTEL_bfloat16_arithmetic + : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -570,6 +572,7 @@ defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atom defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, 
[SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; +defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; @@ -587,6 +590,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0, defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>; defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>; defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>; +defm KernelAttributesINTEL : CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// TODO-SPIRV: add these once they are used / tested. 
+// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// END TODO-SPIRV defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>; defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>; defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; @@ -805,6 +813,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; +defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>; +// TODO-SPIRV: Add the following once they are used / tested. 
+// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>; +// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>; +// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>; +// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>; +// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>; +// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>; +// END TODO-SPIRV defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; @@ -1919,7 +1936,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; -// Arithmetic +// Arithmetic defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; defm Not : SpecConstantOpOperandsOperand<200, [], []>; defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 7dd0b95..2951a4b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -69,7 +69,7 @@ static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { } // Pin SPIRVTargetObjectFile's vtables to this file. 
-SPIRVTargetObjectFile::~SPIRVTargetObjectFile() {} +SPIRVTargetObjectFile::~SPIRVTargetObjectFile() = default; SPIRVTargetMachine::SPIRVTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -244,7 +244,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI( cl::Optional, cl::init(false)); void SPIRVPassConfig::addPreEmitPass() { - if (SPVEnableNonSemanticDI) { + if (SPVEnableNonSemanticDI || + getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) { addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>())); } } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h index 9d0adbb..87ec256 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h @@ -16,7 +16,7 @@ namespace llvm { /// This implementation is used for SystemZ ELF targets. class SystemZELFTargetObjectFile : public TargetLoweringObjectFileELF { public: - SystemZELFTargetObjectFile() {} + SystemZELFTargetObjectFile() = default; /// Describe a TLS variable address within debug info. const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2666342..66ed8b0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. 
class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a..fa23656 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -158,7 +158,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. -FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. FunctionPass *createX86WinEHUnwindV2Pass(); @@ -179,7 +188,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. 
-FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, @@ -220,7 +240,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index c0c7f5a..ddbd10d 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, const MachineOperand &Src2 = MI.getOperand(2); bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND; const MCInstrDesc &NewDesc = - ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r); + ST.getInstrInfo()->get(Is32BitReg ? 
X86::LEA64_32r : X86::LEA64r); if (Is32BitReg) Src1 = getX86SubSuperRegister(Src1, 64); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 133406b..06b8f7614 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. + // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. + MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. 
We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. 
+ SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { @@ -33004,60 +33061,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. 
- return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33662,7 +33665,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); @@ -53347,6 +53349,103 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Look for a RMW operation that only touches one bit of a larger than legal +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. +static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); + + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || + VT.getSizeInBits() <= (Subtarget.is64Bit() ? 
64 : 32)) + return SDValue(); + + // BTR: X & ~(1 << ShAmt) + // BTS: X | (1 << ShAmt) + // BTC: X ^ (1 << ShAmt) + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue SrcVal, InsertBit, ShAmt; + if (!StoredVal.hasOneUse() || + !(sd_match(StoredVal, m_And(m_Value(SrcVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || + sd_match(StoredVal, + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(SrcVal); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) + return SDValue(); + + // Ensure the shift amount is in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) + return SDValue(); + + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + + // Split the shift into an alignment shift that moves the active i32 block to + // the bottom bits for truncation and a modulo shift that can act on the i32. + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); + + // Compute the byte offset for the i32 block that is changed by the RMW. 
+ // combineTruncate will adjust the load for us in a similar way. + EVT PtrVT = St->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT); + SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL, + SDNodeFlags::NoUnsignedWrap); + + // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); + + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } + + return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); +} + static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53573,6 +53672,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } + if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + return R; + // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC) if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54505,8 +54607,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, // truncation, see if we can convert the shift into a pointer offset instead. // Limit this to normal (non-ext) scalar integer loads. 
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && - Src.hasOneUse() && Src.getOperand(0).hasOneUse() && - ISD::isNormalLoad(Src.getOperand(0).getNode())) { + Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && + (Src.getOperand(0).hasOneUse() || + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) { auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); if (Ld->isSimple() && VT.isByteSized() && isPowerOf2_64(VT.getSizeInBits())) { @@ -54529,8 +54632,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewLoad = DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), Align(), Ld->getMemOperand()->getFlags()); - DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), - NewLoad.getValue(1)); + DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; } } @@ -56306,6 +56408,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -56364,6 +56467,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); + // If we're performing a bit test on a larger than legal type, attempt + // to (aligned) shift down the value to the bottom 32-bits and then + // perform the bittest on the i32 value. + // ICMP_ZERO(AND(X,SHL(1,IDX))) + // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) + if (isNullConstant(RHS) && + OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 
64 : 32)) { + SDValue X, ShAmt; + if (sd_match(LHS, m_OneUse(m_And(m_Value(X), + m_Shl(m_One(), m_Value(ShAmt)))))) { + // Only attempt this if the shift amount is known to be in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getConstant(31, DL, AmtVT)); + SDValue Mask = DAG.getNode( + ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); + return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), + CC); + } + } + } + // cmpeq(trunc(x),C) --> cmpeq(x,C) // cmpne(trunc(x),C) --> cmpne(x,C) // iff x upper bits are zero. 
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index b655183..3b96e70 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -227,9 +227,6 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { } // end namespace llvm -constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel; -constexpr int MachineGadgetGraph::GadgetEdgeSentinel; - char X86LoadValueInjectionLoadHardeningPass::ID = 0; void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f33939..662aec2 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -627,6 +630,37 @@ bool X86LowerAMXIntrinsics::visit() { } namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + +bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, 
DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + +namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: static char ID; @@ -634,21 +668,15 @@ public: X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? 
&LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index a25e4e0..898c83c 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -16,10 +16,12 @@ #include "X86TargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -30,39 +32,44 @@ using namespace llvm; namespace { -class X86PartialReduction : public FunctionPass { +class X86PartialReduction { + const X86TargetMachine *TM; const DataLayout *DL = nullptr; const X86Subtarget *ST = nullptr; public: + X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {} + bool run(Function &F); + +private: + bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); + bool trySADReplacement(Instruction *Op); +}; + +class X86PartialReductionLegacy : public FunctionPass { +public: static char ID; // Pass identification, replacement for typeid. 
- X86PartialReduction() : FunctionPass(ID) { } + X86PartialReductionLegacy() : FunctionPass(ID) {} - bool runOnFunction(Function &Fn) override; + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - StringRef getPassName() const override { - return "X86 Partial Reduction"; - } - -private: - bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); - bool trySADReplacement(Instruction *Op); + StringRef getPassName() const override { return "X86 Partial Reduction"; } }; } -FunctionPass *llvm::createX86PartialReductionPass() { - return new X86PartialReduction(); +FunctionPass *llvm::createX86PartialReductionLegacyPass() { + return new X86PartialReductionLegacy(); } -char X86PartialReduction::ID = 0; +char X86PartialReductionLegacy::ID = 0; -INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, - "X86 Partial Reduction", false, false) +INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction", + false, false) // This function should be aligned with detectExtMul() in X86ISelLowering.cpp. 
static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul, @@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { } } -bool X86PartialReduction::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - auto &TM = TPC->getTM<X86TargetMachine>(); - ST = TM.getSubtargetImpl(F); - +bool X86PartialReduction::run(Function &F) { + ST = TM->getSubtargetImpl(F); DL = &F.getDataLayout(); bool MadeChange = false; @@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) { return MadeChange; } + +bool X86PartialReductionLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F); +} + +PreservedAnalyses X86PartialReductionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + bool Changed = X86PartialReduction(TM).run(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index fc25d55..db25594 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -15,14 +15,14 @@ #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) +FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) -DUMMY_FUNCTION_PASS("x86-partial-reduction", 
X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a76abc..5f0bcab 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -97,7 +97,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86LoadValueInjectionLoadHardeningPassPass(PR); initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); - initializeX86PartialReductionPass(PR); + initializeX86PartialReductionLegacyPass(PR); initializePseudoProbeInserterPass(PR); initializeX86ReturnThunksPass(PR); initializeX86DAGToDAGISelLegacyPass(PR); @@ -422,14 +422,14 @@ void X86PassConfig::addIRPasses() { // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. - addPass(createX86LowerAMXIntrinsicsPass()); + addPass(createX86LowerAMXIntrinsicsLegacyPass()); addPass(createX86LowerAMXTypeLegacyPass()); TargetPassConfig::addIRPasses(); if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createInterleavedAccessPass()); - addPass(createX86PartialReductionPass()); + addPass(createX86PartialReductionLegacyPass()); } // Add passes that handle indirect branch removal and insertion of a retpoline diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp index f6f7e92..2f28ab3 100644 --- a/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -66,7 +66,7 @@ namespace { MachineBasicBlock &MBB); void addDirtySuccessor(MachineBasicBlock &MBB); - using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; + enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; static const char* getBlockExitStateName(BlockExitState ST); diff --git a/llvm/lib/TextAPI/RecordVisitor.cpp b/llvm/lib/TextAPI/RecordVisitor.cpp index 
d333b33..24971a7 100644 --- a/llvm/lib/TextAPI/RecordVisitor.cpp +++ b/llvm/lib/TextAPI/RecordVisitor.cpp @@ -15,7 +15,7 @@ using namespace llvm; using namespace llvm::MachO; -RecordVisitor::~RecordVisitor() {} +RecordVisitor::~RecordVisitor() = default; void RecordVisitor::visitObjCInterface(const ObjCInterfaceRecord &) {} void RecordVisitor::visitObjCCategory(const ObjCCategoryRecord &) {} diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5ed47ae..a6ac761 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? 
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA && AlignAA->isValidState()) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void 
trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index d7eb745..2a87a0f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -208,7 +208,7 @@ namespace KernelInfo { // }; #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_IDX(Configuration, 0) KERNEL_ENVIRONMENT_IDX(Ident, 1) @@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1) #undef KERNEL_ENVIRONMENT_IDX #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \ - constexpr const unsigned MEMBER##Idx = IDX; + constexpr unsigned MEMBER##Idx = IDX; KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0) KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1) @@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams) GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) { - constexpr const int InitKernelEnvironmentArgNo = 0; + constexpr int InitKernelEnvironmentArgNo = 0; return cast<GlobalVariable>( KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo) ->stripPointerCasts()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 3ddf182..cbaff29 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3997,6 +3997,27 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I, return nullptr; } +/// Fold select(X >s 0, 0, -X) | smax(X, 0) --> abs(X) +/// select(X <s 0, -X, 0) | smax(X, 0) --> abs(X) +static Value *FoldOrOfSelectSmaxToAbs(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X; + Value *Sel; + if (match(&I, + m_c_Or(m_Value(Sel), m_OneUse(m_SMax(m_Value(X), m_ZeroInt()))))) { + auto NegX = m_Neg(m_Specific(X)); + if 
(match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(X), + m_ZeroInt()), + m_ZeroInt(), NegX)) || + match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(X), + m_ZeroInt()), + NegX, m_ZeroInt()))) + return Builder.CreateBinaryIntrinsic(Intrinsic::abs, X, + Builder.getFalse()); + } + return nullptr; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. @@ -4545,6 +4566,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V); + if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder)) + return replaceInstUsesWith(I, Res); + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da..9572f9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if 
(MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 80e77e09..a2fad02 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -161,7 +161,7 @@ template <char NsanTypeId> class ShadowTypeConfigImpl : public ShadowTypeConfig { public: char getNsanTypeId() const override { return NsanTypeId; } - static constexpr const char kNsanTypeId = NsanTypeId; + static constexpr char kNsanTypeId = NsanTypeId; }; // `double` (`d`) shadow type. diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index 89980d5..a577f51 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -122,7 +122,8 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { Value *Cond = Assume->getArgOperand(0); // Don't drop type tests, which have special semantics. 
- if (match(Cond, m_Intrinsic<Intrinsic::type_test>())) + if (match(Cond, m_Intrinsic<Intrinsic::type_test>()) || + match(Cond, m_Intrinsic<Intrinsic::public_type_test>())) continue; SmallVector<Value *> Affected; diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 19eccb9..9ffa602 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1796,14 +1796,16 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { @@ -2092,14 +2094,16 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. 
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a883998..1b770be 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ public: // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. 
if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ public: // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d8..802ae4e 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index bb6c879..239526e 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -337,7 +337,7 @@ static void buildPartialUnswitchConditionalBranch( static void buildPartialInvariantUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) { ValueToValueMapTy VMap; for (auto *Val : reverse(ToDuplicate)) { Instruction *Inst = cast<Instruction>(Val); @@ -377,8 +377,19 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRBuilder<> IRB(&BB); 
IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + // The expectation is that ToDuplicate[0] is the condition used by the + // OriginalBranch, case in which we can clone the profile metadata from there. + auto *ProfData = + !ProfcheckDisableMetadataFixes && + ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition()) + ? OriginalBranch.getMetadata(LLVMContext::MD_prof) + : nullptr; + auto *BR = + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, ProfData); + if (!ProfData) + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), + DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2515,7 +2526,7 @@ static void unswitchNontrivialInvariants( // the branch in the split block. if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 5f6f66a..0a8f5ea 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ 
-594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding direct target blocks are for now ignored by + // this pass. This is not a limitation for the currently intended uses cases + // of callbr in the AMDGPU backend. + // Parent and child regions are not affected by this (current) restriction. + // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details. + if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f..6086615 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. 
case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51..6d4436b 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { if (Impl == RTLIB::Unsupported) continue; - // TODO: Declare with correct type, calling convention, and attributes. 
+ auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. + if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f2903..a03cf6e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, // Create integer constant expression. auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * { const APInt &API = cast<ConstantInt>(&CV)->getValue(); - std::optional<int64_t> InitIntOpt = API.trySExtValue(); + std::optional<int64_t> InitIntOpt; + if (API.getBitWidth() == 1) + InitIntOpt = API.tryZExtValue(); + else + InitIntOpt = API.trySExtValue(); return InitIntOpt ? DIB.createConstantValueExpression( static_cast<uint64_t>(*InitIntOpt)) : nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 1e8f6cc..6c9467b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// probability of executing at least one more iteration? 
static BranchProbability probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. 
BranchProbability ProbOne = OriginalLoopProb; // 1 <= m diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8be471b..6e60b94 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -992,9 +992,12 @@ BranchProbability llvm::getBranchProbability(BranchInst *B, uint64_t Weight0, Weight1; if (!extractBranchWeights(*B, Weight0, Weight1)) return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); if (!ForFirstTarget) std::swap(Weight0, Weight1); - return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); } bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e..3a3e3ad 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -778,8 +778,10 @@ private: return false; // Add all values from the range to the set - for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) + APInt Tmp = Span.getLower(); + do Vals.push_back(ConstantInt::get(I->getContext(), Tmp)); + while (++Tmp != Span.getUpper()); UsedICmps++; return true; @@ -6020,6 +6022,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6039,15 +6043,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, 
UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6058,33 +6065,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. - // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. 
- if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. 
+ if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } @@ -7570,6 +7585,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. 
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -8037,6 +8127,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c170..e86ab13 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. 
ControlFlowHub CHub; + bool Changed = false; for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool UpdatedLI = false; BasicBlock *NewSucc = SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // SplitCallBrEdge modifies the CFG because it creates an intermediate + // block. So we need to set the changed flag no matter what the + // ControlFlowHub is going to do later. + Changed = true; // Even if CallBr and Succ do not have a common parent loop, we need to // add the new target block to the parent loop of the current loop. if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); + ChangedCFG |= Changed; if (!ChangedCFG) return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16..03112c6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 34b405c..bf3f52c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ 
-20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, if (isa<PHINode>(S.getMainOp()) || isVectorLikeInstWithConstOps(S.getMainOp())) return nullptr; + // If the parent node is non-schedulable and the current node is copyable, and + // any of parent instructions are used outside several basic blocks or in + // bin-op node - cancel scheduling, it may cause wrong def-use deps in + // analysis, leading to a crash. + // Non-scheduled nodes may not have related ScheduleData model, which may lead + // to a skipped dep analysis. + if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->doesNotNeedToSchedule() && + EI.UserTE->getOpcode() != Instruction::PHI && + any_of(EI.UserTE->Scalars, [](Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I || I->hasOneUser()) + return false; + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (isa<BinaryOperator>(UI)) + return true; + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 9c869dd..d354933 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -92,7 +92,7 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { DGNode::print(OS, false); if (PrintDeps) { // Print memory preds. 
- static constexpr const unsigned Indent = 4; + static constexpr unsigned Indent = 4; for (auto *Pred : MemPreds) OS.indent(Indent) << "<-" << *Pred->getInstruction() << "\n"; } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 86dbd21..5534da9 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -25,14 +25,14 @@ static cl::opt<bool> "emit new instructions (*very* expensive).")); #endif // NDEBUG -static constexpr const unsigned long StopAtDisabled = +static constexpr unsigned long StopAtDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopAt("sbvec-stop-at", cl::init(StopAtDisabled), cl::Hidden, cl::desc("Vectorize if the invocation count is < than this. 0 " "disables vectorization.")); -static constexpr const unsigned long StopBundleDisabled = +static constexpr unsigned long StopBundleDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopBundle("sbvec-stop-bndl", cl::init(StopBundleDisabled), cl::Hidden, diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ed2f80b..2de6921 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -43,7 +43,7 @@ cl::opt<std::string> AllowFiles( "sbvec-allow-files", cl::init(".*"), cl::Hidden, cl::desc("Run the vectorizer only on file paths that match any in the " "list of comma-separated regex's.")); -static constexpr const char AllowFilesDelim = ','; +static constexpr char AllowFilesDelim = ','; SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { diff --git 
a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index aba6d35..cfe1f1e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1110,9 +1110,8 @@ public: VP_CLASSOF_IMPL(VPDef::VPInstructionSC) VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); - auto *New = - new VPInstruction(Opcode, Operands, *this, *this, getDebugLoc(), Name); + auto *New = new VPInstruction(Opcode, operands(), *this, *this, + getDebugLoc(), Name); if (getUnderlyingValue()) New->setUnderlyingValue(getUnderlyingInstr()); return New; @@ -1226,10 +1225,9 @@ public: } VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); auto *New = - new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this, - getDebugLoc(), getName()); + new VPInstructionWithType(getOpcode(), operands(), getResultType(), + *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -3211,6 +3209,9 @@ protected: : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I), Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); + assert(isa<VPVectorEndPointerRecipe>(getAddr()) || + !Reverse && + "Reversed acccess without VPVectorEndPointerRecipe address?"); } public: diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b5b98c6..b57c448 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -313,7 +313,8 @@ private: // Check for recipes that do not have opcodes. 
if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> || std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> || - std::is_same_v<RecipeTy, VPDerivedIVRecipe>) + std::is_same_v<RecipeTy, VPDerivedIVRecipe> || + std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -686,6 +687,64 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2}); } +template <typename Addr_t, typename Mask_t> struct Load_match { + Addr_t Addr; + Mask_t Mask; + + Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Load = dyn_cast<VPWidenLoadRecipe>(V); + if (!Load || !Addr.match(Load->getAddr()) || !Load->isMasked() || + !Mask.match(Load->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked load. +template <typename Addr_t, typename Mask_t> +inline Load_match<Addr_t, Mask_t> m_MaskedLoad(const Addr_t &Addr, + const Mask_t &Mask) { + return Load_match<Addr_t, Mask_t>(Addr, Mask); +} + +template <typename Addr_t, typename Val_t, typename Mask_t> struct Store_match { + Addr_t Addr; + Val_t Val; + Mask_t Mask; + + Store_match(Addr_t Addr, Val_t Val, Mask_t Mask) + : Addr(Addr), Val(Val), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Store = dyn_cast<VPWidenStoreRecipe>(V); + if (!Store || !Addr.match(Store->getAddr()) || + !Val.match(Store->getStoredValue()) || !Store->isMasked() || + !Mask.match(Store->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked store. 
+template <typename Addr_t, typename Val_t, typename Mask_t> +inline Store_match<Addr_t, Val_t, Mask_t> +m_MaskedStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) { + return Store_match<Addr_t, Val_t, Mask_t>(Addr, Val, Mask); +} + +template <typename Op0_t, typename Op1_t> +using VectorEndPointerRecipe_match = + Recipe_match<std::tuple<Op0_t, Op1_t>, 0, + /*Commutative*/ false, VPVectorEndPointerRecipe>; + +template <typename Op0_t, typename Op1_t> +VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0, + const Op1_t &Op1) { + return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1); +} + /// Match a call argument at a given argument index. template <typename Opnd_t> struct Argument_match { /// Call argument index to match. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3757a59..2588c87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -151,59 +151,65 @@ static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) { static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); + bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; + + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. 
+ if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) + return; + + if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || + Candidate->mayReadOrWriteMemory()) + return; + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) + if (!ScalarVFOnly && RepR->isSingleScalar()) + return; + + WorkList.insert({SinkTo, Candidate}); + }; + // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) continue; - VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]); - if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) + VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); + if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { + for (auto &Recipe : *VPBB) for (VPValue *Op : Recipe.operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({VPBB, Def}); - } + InsertIfValidSinkCandidate(VPBB, Op); } - bool ScalarVFOnly = Plan.hasScalarVFOnly(); // Try to sink each replicate or scalar IV steps recipe in the worklist. 
for (unsigned I = 0; I != WorkList.size(); ++I) { VPBasicBlock *SinkTo; VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - if (SinkCandidate->getParent() == SinkTo || - SinkCandidate->mayHaveSideEffects() || - SinkCandidate->mayReadOrWriteMemory()) - continue; - if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) { - if (!ScalarVFOnly && RepR->isSingleScalar()) - continue; - } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate)) - continue; - bool NeedsDuplicating = false; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must be uniform-after-vectorization ( - // i.e., only first lane is used) . In the latter case, we need to duplicate - // SinkCandidate. - auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, - SinkCandidate](VPUser *U) { - auto *UI = cast<VPRecipeBase>(U); - if (UI->getParent() == SinkTo) - return true; - NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. - return NeedsDuplicating && - isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate); - }; - if (!all_of(SinkCandidate->users(), CanSinkWithUser)) + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In + // the latter case, we need to duplicate SinkCandidate. 
+ auto UsersOutsideSinkTo = + make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { + return cast<VPRecipeBase>(U)->getParent() != SinkTo; + }); + if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { + return !U->onlyFirstLaneUsed(SinkCandidate); + })) continue; + bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); if (NeedsDuplicating) { if (ScalarVFOnly) @@ -228,9 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; @@ -1056,13 +1060,9 @@ static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, return nullptr; } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - VPlan *Plan = R.getParent()->getPlan(); - - auto *Def = dyn_cast<VPSingleDefRecipe>(&R); - if (!Def) - return; +/// Try to simplify VPSingleDefRecipe \p Def. +static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { + VPlan *Plan = Def->getParent()->getPlan(); // Simplification of live-in IR values for SingleDef recipes using // InstSimplifyFolder. @@ -1072,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(V); // Fold PredPHI LiveIn -> LiveIn. 
- if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { + if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) { VPValue *Op = PredPHI->getOperand(0); if (Op->isLiveIn()) PredPHI->replaceAllUsesWith(Op); @@ -1091,12 +1091,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue())) ? Instruction::SExt : Instruction::ZExt; auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) { // UnderlyingExt has distinct return type, used to retain legacy cost. Ext->setUnderlyingValue(UnderlyingExt); } @@ -1159,7 +1159,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); // x && !x -> 0 - if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) return Def->replaceAllUsesWith(Plan->getFalse()); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) @@ -1187,8 +1187,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(A); if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt()))) - return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) - : R.getOperand(0)); + return Def->replaceAllUsesWith( + Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0)); if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) @@ -1217,8 +1217,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // If Cmp doesn't have a debug location, use the one from the negation, // to preserve the location. 
- if (!Cmp->getDebugLoc() && R.getDebugLoc()) - Cmp->setDebugLoc(R.getDebugLoc()); + if (!Cmp->getDebugLoc() && Def->getDebugLoc()) + Cmp->setDebugLoc(Def->getDebugLoc()); } } } @@ -1244,7 +1244,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A), m_VPValue(X), m_VPValue())) && match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) && - TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) { + TypeInfo.inferScalarType(Def)->isIntegerTy(1)) { Def->setOperand(1, Def->getOperand(0)); Def->setOperand(0, Y); return; @@ -1252,36 +1252,36 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) { if (Phi->getOperand(0) == Phi->getOperand(1)) - Def->replaceAllUsesWith(Phi->getOperand(0)); + Phi->replaceAllUsesWith(Phi->getOperand(0)); return; } // Look through ExtractLastElement (BuildVector ....). - if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()), - m_ExtractLastLanePerPart(m_BuildVector())))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()), + m_ExtractLastLanePerPart(m_BuildVector())))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 1)); return; } // Look through ExtractPenultimateElement (BuildVector ....). 
- if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( - m_BuildVector()))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( + m_BuildVector()))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith( BuildVector->getOperand(BuildVector->getNumOperands() - 2)); return; } uint64_t Idx; - if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { - auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); + if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) { + auto *BuildVector = cast<VPInstruction>(Def->getOperand(0)); Def->replaceAllUsesWith(BuildVector->getOperand(Idx)); return; } - if (match(Def, m_BuildVector()) && all_equal(R.operands())) { + if (match(Def, m_BuildVector()) && all_equal(Def->operands())) { Def->replaceAllUsesWith( Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0))); return; @@ -1303,7 +1303,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { isa<VPPhi>(X)) { auto *Phi = cast<VPPhi>(X); if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) && - Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) { + Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) { Phi->setOperand(0, Y); Def->replaceAllUsesWith(Phi); return; @@ -1311,7 +1311,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // VPVectorPointer for part 0 can be replaced by their start pointer. 
- if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) { + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) { if (VecPtr->isFirstPart()) { VecPtr->replaceAllUsesWith(VecPtr->getOperand(0)); return; @@ -1366,9 +1366,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) { Plan.getEntry()); VPTypeAnalysis TypeInfo(Plan); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R)) + simplifyRecipe(Def, TypeInfo); } } @@ -2521,90 +2521,102 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->eraseFromParent(); } +template <typename Op0_t, typename Op1_t> struct RemoveMask_match { + Op0_t In; + Op1_t &Out; + + RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {} + + template <typename OpTy> bool match(OpTy *V) const { + if (m_Specific(In).match(V)) { + Out = nullptr; + return true; + } + if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V)) + return true; + return false; + } +}; + +/// Match a specific mask \p In, or a combination of it (logical-and In, Out). +/// Returns the remaining part \p Out if so, or nullptr otherwise. +template <typename Op0_t, typename Op1_t> +static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In, + Op1_t &Out) { + return RemoveMask_match<Op0_t, Op1_t>(In, Out); +} + /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based /// recipe could be created. /// \p HeaderMask Header Mask. /// \p CurRecipe Recipe to be transform. /// \p TypeInfo VPlan-based type analysis. -/// \p AllOneMask The vector mask parameter of vector-predication intrinsics. /// \p EVL The explicit vector length parameter of vector-predication /// intrinsics. 
static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, - VPTypeAnalysis &TypeInfo, - VPValue &AllOneMask, VPValue &EVL) { - // FIXME: Don't transform recipes to EVL recipes if they're not masked by the - // header mask. - auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { - assert(OrigMask && "Unmasked recipe when folding tail"); - // HeaderMask will be handled using EVL. - VPValue *Mask; - if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) - return Mask; - return HeaderMask == OrigMask ? nullptr : OrigMask; - }; + VPTypeAnalysis &TypeInfo, VPValue &EVL) { + VPlan *Plan = CurRecipe.getParent()->getPlan(); + VPValue *Addr, *Mask, *EndPtr; /// Adjust any end pointers so that they point to the end of EVL lanes not VF. - auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { - auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); - if (!EndPtr) - return Addr; - assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && - "VPVectorEndPointerRecipe with non-VF VF operand?"); - assert( - all_of(EndPtr->users(), - [](VPUser *U) { - return cast<VPWidenMemoryRecipe>(U)->isReverse(); - }) && - "VPVectorEndPointRecipe not used by reversed widened memory recipe?"); - VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); - EVLAddr->insertBefore(&CurRecipe); - EVLAddr->setOperand(1, &EVL); - return EVLAddr; + auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) { + auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone(); + EVLEndPtr->insertBefore(&CurRecipe); + EVLEndPtr->setOperand(1, &EVL); + return EVLEndPtr; }; - return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) - .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { - VPValue *NewMask = GetNewMask(L->getMask()); - VPValue *NewAddr = GetNewAddr(L->getAddr()); - return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); - }) - .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { - VPValue *NewMask = 
GetNewMask(S->getMask()); - VPValue *NewAddr = GetNewAddr(S->getAddr()); - return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); - }) - .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { - VPValue *NewMask = GetNewMask(IR->getMask()); - return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); - }) - .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { - VPValue *NewMask = GetNewMask(Red->getCondOp()); - return new VPReductionEVLRecipe(*Red, EVL, NewMask); - }) - .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { - VPValue *LHS, *RHS; - // Transform select with a header mask condition - // select(header_mask, LHS, RHS) - // into vector predication merge. - // vp.merge(all-true, LHS, RHS, EVL) - if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), - m_VPValue(RHS)))) - return nullptr; - // Use all true as the condition because this transformation is - // limited to selects whose condition is a header mask. - return new VPWidenIntrinsicRecipe( - Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); - }) - .Default([&](VPRecipeBase *R) { return nullptr; }); + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, 
m_MaskedStore(m_VPValue(EndPtr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe)) + if (Rdx->isConditional() && + match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask))) + return new VPReductionEVLRecipe(*Rdx, EVL, Mask); + + if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe)) + if (Interleave->getMask() && + match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask))) + return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask); + + VPValue *LHS, *RHS; + if (match(&CurRecipe, + m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + + return nullptr; } /// Replace recipes with their EVL variants. 
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPTypeAnalysis TypeInfo(Plan); - VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2664,7 +2676,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, - {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, + {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL}, TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); VPSplice->insertBefore(&R); R.getVPSingleValue()->replaceAllUsesWith(VPSplice); @@ -2698,7 +2710,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { for (VPUser *U : collectUsersRecursively(EVLMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL); if (!EVLRecipe) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 8c23e78..c6380d3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -32,22 +32,17 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) { } VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { - VPValue *Expanded = nullptr; if (auto *E = dyn_cast<SCEVConstant>(Expr)) - Expanded = Plan.getOrAddLiveIn(E->getValue()); - else { - auto *U = dyn_cast<SCEVUnknown>(Expr); - // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction - // value. Otherwise the value may be defined in a loop and using it directly - // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA - // form. 
- if (U && !isa<Instruction>(U->getValue())) { - Expanded = Plan.getOrAddLiveIn(U->getValue()); - } else { - Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); - } - } + return Plan.getOrAddLiveIn(E->getValue()); + // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction + // value. Otherwise the value may be defined in a loop and using it directly + // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA + // form. + auto *U = dyn_cast<SCEVUnknown>(Expr); + if (U && !isa<Instruction>(U->getValue())) + return Plan.getOrAddLiveIn(U->getValue()); + auto *Expanded = new VPExpandSCEVRecipe(Expr); + Plan.getEntry()->appendRecipe(Expanded); return Expanded; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a1..34754a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00d..27a8bbd 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + 
&DT)) { + // Check whether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze. + SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); |
