Diffstat (limited to 'llvm/lib')
184 files changed, 3018 insertions, 1439 deletions
diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index 9f5daf3..aaac2cf 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -63,6 +63,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeRegionPrinterPass(Registry); initializeRegionOnlyViewerPass(Registry); initializeRegionOnlyPrinterPass(Registry); + initializeRuntimeLibraryInfoWrapperPass(Registry); initializeSCEVAAWrapperPassPass(Registry); initializeScalarEvolutionWrapperPassPass(Registry); initializeStackSafetyGlobalInfoWrapperPassPass(Registry); diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 16dd6f8..88ebd65 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -137,6 +137,7 @@ add_llvm_component_library(LLVMAnalysis RegionPass.cpp RegionPrinter.cpp ReplayInlineAdvisor.cpp + RuntimeLibcallInfo.cpp ScalarEvolution.cpp ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionDivision.cpp diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp index 23f1aa8..bd77cba 100644 --- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp +++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp @@ -66,6 +66,22 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) { Success = llvm::to_integer(NumThreadsVec[2], EFP.NumThreadsZ, 10); assert(Success && "Failed to parse Z component of numthreads"); } + // Get wavesize attribute value, if one exists + StringRef WaveSizeStr = + F.getFnAttribute("hlsl.wavesize").getValueAsString(); + if (!WaveSizeStr.empty()) { + SmallVector<StringRef> WaveSizeVec; + WaveSizeStr.split(WaveSizeVec, ','); + assert(WaveSizeVec.size() == 3 && "Invalid wavesize specified"); + // Read in the three component values of wavesize + [[maybe_unused]] bool Success = + llvm::to_integer(WaveSizeVec[0], EFP.WaveSizeMin, 10); + assert(Success && "Failed to parse Min component of wavesize"); + Success = llvm::to_integer(WaveSizeVec[1], EFP.WaveSizeMax, 10); + assert(Success && "Failed to parse Max component of wavesize"); + Success = llvm::to_integer(WaveSizeVec[2], EFP.WaveSizePref, 10); + assert(Success && "Failed to parse Preferred component of wavesize"); + } MMDAI.EntryPropertyVec.push_back(EFP); } return MMDAI; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e27a9b1..5d88e5f 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -806,11 +806,11 @@ public: typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA, + DominatorTree &DT, MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE, SmallPtrSetImpl<MDNode *> &LoopAliasScopes) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), - LoopAliasScopes(LoopAliasScopes) { + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA), + PSE(PSE), LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } @@ -934,6 +934,9 @@ private: /// The LoopInfo of the loop being checked. const LoopInfo *LI; + /// The dominator tree of the function. + DominatorTree &DT; + /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identifies which sets really need a /// dependence check. 
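The DominatorTree threaded through AccessAnalysis above is consumed by the isNoWrap() change in the hunk that follows: a nusw GEP only proves no-wrap when the pointer is dereferenced on every loop iteration, since only then would a wrapping (poison) GEP be immediate UB. A minimal illustrative loop (editorial sketch in C++, not code from the patch) of the kind the new predication check has to reject:

// The access is guarded, so it does not execute on every iteration; a
// wrapping GEP on a skipped iteration would yield poison that is never
// dereferenced, hence no immediate UB for the analysis to rely on.
void scale(float *p, int n) {
  for (int i = 0; i < n; ++i)
    if (i & 1) // predicated block: blockNeedsPredication() is true here
      p[i] *= 2.0f;
}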
@@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// information from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, + const DominatorTree &DT, std::optional<int64_t> Stride = std::nullopt) { // FIXME: This should probably only return true for NUW. if (AR->getNoWrapFlags(SCEV::NoWrapMask)) @@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, &DT, GEP](User *U) { + if (getLoadStorePointerOperand(U) != GEP) + return false; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, DT)) return false; } @@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() { /// Check whether the access through \p Ptr has a constant stride. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); @@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride)) return Stride; LLVM_DEBUG( @@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional<int64_t> StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional<int64_t> StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + std::optional<int64_t> StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true); + std::optional<int64_t> StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once @@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, + true)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/RuntimeLibcallInfo.cpp b/llvm/lib/Analysis/RuntimeLibcallInfo.cpp new file mode 100644 index 0000000..6fb4119 --- /dev/null +++ b/llvm/lib/Analysis/RuntimeLibcallInfo.cpp @@ -0,0 +1,43 @@ +//===- RuntimeLibcallInfo.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/RuntimeLibcallInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +AnalysisKey RuntimeLibraryAnalysis::Key; + +RTLIB::RuntimeLibcallsInfo +RuntimeLibraryAnalysis::run(const Module &M, ModuleAnalysisManager &) { + return RTLIB::RuntimeLibcallsInfo(M); +} + +INITIALIZE_PASS(RuntimeLibraryInfoWrapper, "runtime-library-info", + "Runtime Library Function Analysis", false, true) + +RuntimeLibraryInfoWrapper::RuntimeLibraryInfoWrapper() + : ImmutablePass(ID), RTLA(RTLIB::RuntimeLibcallsInfo(Triple())) {} + +char RuntimeLibraryInfoWrapper::ID = 0; + +ModulePass *llvm::createRuntimeLibraryInfoWrapperPass() { + return new RuntimeLibraryInfoWrapper(); +} + +void RuntimeLibraryInfoWrapper::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +// Assume this is stable unless explicitly invalidated. 
+bool RTLIB::RuntimeLibcallsInfo::invalidate( + Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &) { + auto PAC = PA.getChecker<RuntimeLibraryAnalysis>(); + return !PAC.preservedWhenStateless(); +} diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 813632c..74f3a7d 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -388,6 +388,10 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setAvailableWithName(LibFunc_logbf, "_logbf"); else TLI.setUnavailable(LibFunc_logbf); + TLI.setUnavailable(LibFunc_nextafter); + TLI.setUnavailable(LibFunc_nextafterf); + TLI.setUnavailable(LibFunc_nexttoward); + TLI.setUnavailable(LibFunc_nexttowardf); TLI.setUnavailable(LibFunc_rint); TLI.setUnavailable(LibFunc_rintf); TLI.setUnavailable(LibFunc_round); @@ -418,6 +422,8 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_logbl); TLI.setUnavailable(LibFunc_ilogbl); TLI.setUnavailable(LibFunc_nearbyintl); + TLI.setUnavailable(LibFunc_nextafterl); + TLI.setUnavailable(LibFunc_nexttowardl); TLI.setUnavailable(LibFunc_rintl); TLI.setUnavailable(LibFunc_roundl); TLI.setUnavailable(LibFunc_scalblnl); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076..789a983 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7419,84 +7419,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // 
If the value doesn't fit an unspecified value is returned (but this - // is not poison). - return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { @@ -10405,3 +10341,55 @@ const Value *llvm::stripNullTest(const Value *V) { Value *llvm::stripNullTest(Value *V) { return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V))); } + +bool llvm::collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> Worklist; + auto Push = [&](const Value *V) -> bool { + if (auto *C = dyn_cast<Constant>(V)) { + if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C)) + return false; + // Check existence first to avoid unnecessary allocations. + if (Constants.contains(C)) + return true; + if (Constants.size() == MaxCount) + return false; + Constants.insert(C); + return true; + } + + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (Visited.insert(Inst).second) + Worklist.push_back(Inst); + return true; + } + return false; + }; + if (!Push(V)) + return false; + while (!Worklist.empty()) { + const Instruction *CurInst = Worklist.pop_back_val(); + switch (CurInst->getOpcode()) { + case Instruction::Select: + if (!Push(CurInst->getOperand(1))) + return false; + if (!Push(CurInst->getOperand(2))) + return false; + break; + case Instruction::PHI: + for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) { + // Fast path for recurrence PHI. + if (IncomingValue == CurInst) + continue; + if (!Push(IncomingValue)) + return false; + } + break; + default: + return false; + } + } + return true; +} diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d948..977ed59 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. 
- int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving( assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); - if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides, + /*Assume=*/false, /*ShouldCheckWrap=*/true) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt index 4b2debb..0c8af1e 100644 --- a/llvm/lib/BinaryFormat/CMakeLists.txt +++ b/llvm/lib/BinaryFormat/CMakeLists.txt @@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat ELF.cpp MachO.cpp Magic.cpp - Minidump.cpp MsgPackDocument.cpp MsgPackDocumentYAML.cpp MsgPackReader.cpp diff --git a/llvm/lib/BinaryFormat/Minidump.cpp b/llvm/lib/BinaryFormat/Minidump.cpp deleted file mode 100644 index b618fb1..0000000 --- a/llvm/lib/BinaryFormat/Minidump.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//===-- Minidump.cpp - Minidump constants and structures ---------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/BinaryFormat/Minidump.h" - -using namespace llvm::minidump; - -constexpr uint32_t Header::MagicSignature; -constexpr uint16_t Header::MagicVersion; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb0..8930d64 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 +2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } @@ -8566,16 +8568,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo &LTOInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8583,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. 
+ return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. @@ -8607,9 +8606,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8635,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c..76494c7 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index f65d88a..713277d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1708,7 +1708,6 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, OutStreamer->pushSection(); OutStreamer->switchSection(FuncCGSection); - const MCSymbol *FunctionSymbol = getFunctionBegin(); const Function &F = MF.getFunction(); // If this function has external linkage or has its address taken and // it is not a callback, then anything could call it. @@ -1747,7 +1746,7 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, // 8) Each unique indirect target type id. 
OutStreamer->emitInt8(CallGraphSectionFormatVersion::V_0); OutStreamer->emitInt8(static_cast<uint8_t>(CGFlags)); - OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); + OutStreamer->emitSymbolValue(getSymbol(&F), TM.getProgramPointerSize()); const auto *TypeId = extractNumericCGTypeId(F); if (IsIndirectTarget && TypeId) OutStreamer->emitInt64(TypeId->getZExtValue()); diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 53f1cfe2..d9bc042 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); - assert(AddrAlign >= - F->getDataLayout().getTypeStoreSize(ResultTy) && + assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) && "Expected at least natural alignment at this point."); // Given: atomicrmw some_op iN* %addr, iN %incr ordering @@ -1295,7 +1295,13 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); Value *TryAgain = Builder.CreateICmpNE( StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); - Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + Instruction *CondBr = Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + // Atomic RMW expands to a load-linked / store-conditional loop; because it + // is hard to predict precise branch weights, we mark the branch as "unknown" + // (50/50) to prevent misleading optimizations. + setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Loaded; @@ -1680,7 +1686,12 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop( Loaded->addIncoming(NewLoaded, LoopBB); - Builder.CreateCondBr(Success, ExitBB, LoopBB); + Instruction *CondBr = Builder.CreateCondBr(Success, ExitBB, LoopBB); + + // Atomic RMW expands to a cmpxchg loop. Since precise branch weights + // cannot be easily determined here, we mark the branch as "unknown" (50/50) + // to prevent misleading optimizations. 
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return NewLoaded; diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 4373c53..1cf0b49 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -88,6 +88,7 @@ add_llvm_component_library(LLVMCodeGen LatencyPriorityQueue.cpp LazyMachineBlockFrequencyInfo.cpp LexicalScopes.cpp + LibcallLoweringInfo.cpp LiveDebugVariables.cpp LiveIntervals.cpp LiveInterval.cpp diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index d6f23b6..c1fb8b6 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -643,6 +643,38 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, Known.Zero.setBitsFrom(LowBits); break; } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: { + GExtractVectorElement &Extract = cast<GExtractVectorElement>(MI); + Register InVec = Extract.getVectorReg(); + Register EltNo = Extract.getIndexReg(); + + auto ConstEltNo = getIConstantVRegVal(EltNo, MRI); + + LLT VecVT = MRI.getType(InVec); + // computeKnownBits not yet implemented for scalable vectors. + if (VecVT.isScalableVector()) + break; + + const unsigned EltBitWidth = VecVT.getScalarSizeInBits(); + const unsigned NumSrcElts = VecVT.getNumElements(); + // A return type different from the vector's element type may lead to + // issues with pattern selection. Bail out to avoid that. + if (BitWidth > EltBitWidth) + break; + + Known.Zero.setAllBits(); + Known.One.setAllBits(); + + // If we know the element index, just demand that vector element, else for + // an unknown element index, ignore DemandedElts and demand them all. + APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts); + if (ConstEltNo && ConstEltNo->ult(NumSrcElts)) + DemandedSrcElts = + APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue()); + + computeKnownBitsImpl(InVec, Known, DemandedSrcElts, Depth + 1); + break; + } case TargetOpcode::G_SHUFFLE_VECTOR: { APInt DemandedLHS, DemandedRHS; // Collect the known bits that are shared by every vector element referenced diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fc90d0..be1b51f 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool targetSupportsBF16Type(const MachineFunction *MF) { + return MF->getTarget().getTargetTriple().isSPIRV(); +} + static bool containsBF16Type(const User &U) { // BF16 cannot currently be represented by LLT, to avoid miscompiles we // prevent any instructions using them. FIXME: This can be removed once LLT @@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) { bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; // Get or create a virtual register for each value. 
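As a worked illustration of the demanded-elements logic in the G_EXTRACT_VECTOR_ELT known-bits hunk above, here is a standalone sketch (hypothetical helper name and element count, not code from the patch):

#include "llvm/ADT/APInt.h"
#include <optional>
using llvm::APInt;

// Demanded source elements when extracting from a <4 x s32> vector.
APInt demandedElts(std::optional<unsigned> ConstIdx) {
  const unsigned NumSrcElts = 4;
  if (ConstIdx && *ConstIdx < NumSrcElts)              // known, in-range index
    return APInt::getOneBitSet(NumSrcElts, *ConstIdx); // e.g. index 2 -> 0b0100
  return APInt::getAllOnes(NumSrcElts);                // unknown index -> 0b1111
}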
@@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; Register Op0 = getOrCreateVReg(*U.getOperand(0)); @@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; auto *CI = cast<CmpInst>(&U); @@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; uint32_t Flags = 0; @@ -2688,7 +2692,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(CB)) + if (containsBF16Type(CB) && !targetSupportsBF16Type(MF)) return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering(); @@ -2779,7 +2783,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { - if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const CallInst &CI = cast<CallInst>(U); @@ -2817,20 +2821,34 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { if (translateKnownIntrinsic(CI, ID, MIRBuilder)) return true; + TargetLowering::IntrinsicInfo Info; + bool IsTgtMemIntrinsic = TLI->getTgtMemIntrinsic(Info, CI, *MF, ID); + + return translateIntrinsic(CI, ID, MIRBuilder, + IsTgtMemIntrinsic ? &Info : nullptr); +} + +/// Translate a call to an intrinsic. +/// Depending on whether TLI->getTgtMemIntrinsic() is true, TgtMemIntrinsicInfo +/// is a pointer to the correspondingly populated IntrinsicInfo object. +/// Otherwise, this pointer is null. +bool IRTranslator::translateIntrinsic( + const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder, + const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) { ArrayRef<Register> ResultRegs; - if (!CI.getType()->isVoidTy()) - ResultRegs = getOrCreateVRegs(CI); + if (!CB.getType()->isVoidTy()) + ResultRegs = getOrCreateVRegs(CB); // Ignore the callsite attributes. Backend code is most likely not expecting // an intrinsic to sometimes have side effects and sometimes not. MachineInstrBuilder MIB = MIRBuilder.buildIntrinsic(ID, ResultRegs); - if (isa<FPMathOperator>(CI)) - MIB->copyIRFlags(CI); + if (isa<FPMathOperator>(CB)) + MIB->copyIRFlags(CB); - for (const auto &Arg : enumerate(CI.args())) { + for (const auto &Arg : enumerate(CB.args())) { // If this is required to be an immediate, don't materialize it in a // register. - if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) { + if (CB.paramHasAttr(Arg.index(), Attribute::ImmArg)) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) { // imm arguments are more convenient than cimm (and realistically // probably sufficient), so use them. @@ -2859,29 +2877,33 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { } // Add a MachineMemOperand if it is a target mem intrinsic. 
- TargetLowering::IntrinsicInfo Info; - // TODO: Add a GlobalISel version of getTgtMemIntrinsic. - if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) { - Align Alignment = Info.align.value_or( - DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext()))); - LLT MemTy = Info.memVT.isSimple() - ? getLLTForMVT(Info.memVT.getSimpleVT()) - : LLT::scalar(Info.memVT.getStoreSizeInBits()); + if (TgtMemIntrinsicInfo) { + const Function *F = CB.getCalledFunction(); + + Align Alignment = TgtMemIntrinsicInfo->align.value_or(DL->getABITypeAlign( + TgtMemIntrinsicInfo->memVT.getTypeForEVT(F->getContext()))); + LLT MemTy = + TgtMemIntrinsicInfo->memVT.isSimple() + ? getLLTForMVT(TgtMemIntrinsicInfo->memVT.getSimpleVT()) + : LLT::scalar(TgtMemIntrinsicInfo->memVT.getStoreSizeInBits()); // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic // didn't yield anything useful. MachinePointerInfo MPI; - if (Info.ptrVal) - MPI = MachinePointerInfo(Info.ptrVal, Info.offset); - else if (Info.fallbackAddressSpace) - MPI = MachinePointerInfo(*Info.fallbackAddressSpace); + if (TgtMemIntrinsicInfo->ptrVal) { + MPI = MachinePointerInfo(TgtMemIntrinsicInfo->ptrVal, + TgtMemIntrinsicInfo->offset); + } else if (TgtMemIntrinsicInfo->fallbackAddressSpace) { + MPI = MachinePointerInfo(*TgtMemIntrinsicInfo->fallbackAddressSpace); + } MIB.addMemOperand(MF->getMachineMemOperand( - MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(), - /*Ranges=*/nullptr, Info.ssid, Info.order, Info.failureOrder)); + MPI, TgtMemIntrinsicInfo->flags, MemTy, Alignment, CB.getAAMetadata(), + /*Ranges=*/nullptr, TgtMemIntrinsicInfo->ssid, + TgtMemIntrinsicInfo->order, TgtMemIntrinsicInfo->failureOrder)); } - if (CI.isConvergent()) { - if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) { + if (CB.isConvergent()) { + if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) { auto *Token = Bundle->Inputs[0].get(); Register TokenReg = getOrCreateVReg(*Token); MIB.addUse(TokenReg, RegState::Implicit); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 52c43a4..d02f097 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -776,7 +776,7 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, break; case TargetOpcode::G_MEMCPY: RTLibcall = RTLIB::MEMCPY; - Name = TLI.getMemcpyName(); + Name = TLI.getLibcallImplName(TLI.getMemcpyImpl()).data(); Args[0].Flags[0].setReturned(); break; case TargetOpcode::G_MEMMOVE: diff --git a/llvm/lib/CodeGen/LibcallLoweringInfo.cpp b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp new file mode 100644 index 0000000..5c1698c --- /dev/null +++ b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp @@ -0,0 +1,26 @@ +//===- LibcallLoweringInfo.cpp - Interface for runtime libcalls -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LibcallLoweringInfo.h" + +using namespace llvm; + +LibcallLoweringInfo::LibcallLoweringInfo( + const RTLIB::RuntimeLibcallsInfo &RTLCI) + : RTLCI(RTLCI) { + // TODO: This should be generated with lowering predicates, and assert the + // call is available. 
+ for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (RTLCI.isAvailable(Impl)) { + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + // FIXME: Hack, assume the first available libcall wins. + if (LibcallImpls[LC] == RTLIB::Unsupported) + LibcallImpls[LC] = Impl; + } + } +} diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index c31454a..b5d3092 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB, } bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + // Do not split functions when -basic-block-sections=all is specified. if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) return false; diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index da29ffc..f4c1a8b 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -83,15 +83,21 @@ llvm::createUnpackMachineBundles( return new UnpackMachineBundles(std::move(Ftor)); } -/// Return the first found DebugLoc that has a DILocation, given a range of -/// instructions. The search range is from FirstMI to LastMI (exclusive). If no -/// DILocation is found, then an empty location is returned. +/// Return the first DebugLoc that has line number information, given a +/// range of instructions. The search range is from FirstMI to LastMI +/// (exclusive). Otherwise return the first DILocation or an empty location if +/// there are none. static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI) { - for (auto MII = FirstMI; MII != LastMI; ++MII) - if (MII->getDebugLoc()) - return MII->getDebugLoc(); - return DebugLoc(); + DebugLoc DL; + for (auto MII = FirstMI; MII != LastMI; ++MII) { + if (DebugLoc MIIDL = MII->getDebugLoc()) { + if (MIIDL.getLine() != 0) + return MIIDL; + DL = MIIDL.get(); + } + } + return DL; } /// Check if target reg is contained in given lists, which are: @@ -136,6 +142,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSetVector<Register, 8> ExternUses; SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; + SmallVector<std::pair<Register, Register>> TiedOperands; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) @@ -161,6 +168,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, // External def is now killed. KilledUseSet.insert(Reg); } + if (MO.isTied() && Reg.isVirtual()) { + // Record tied operand constraints that involve virtual registers so + // that bundles that are formed pre-register allocation reflect the + // relevant constraints. 
+ unsigned TiedIdx = MII->findTiedOperandIdx(MO.getOperandNo()); + MachineOperand &TiedMO = MII->getOperand(TiedIdx); + Register DefReg = TiedMO.getReg(); + TiedOperands.emplace_back(DefReg, Reg); + } } } @@ -203,7 +219,17 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, bool isKill = KilledUseSet.contains(Reg); bool isUndef = UndefUseSet.contains(Reg); MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) | - getImplRegState(true)); + getImplRegState(true)); + } + + for (auto [DefReg, UseReg] : TiedOperands) { + unsigned DefIdx = + std::distance(LocalDefs.begin(), llvm::find(LocalDefs, DefReg)); + unsigned UseIdx = + std::distance(ExternUses.begin(), llvm::find(ExternUses, UseReg)); + assert(DefIdx < LocalDefs.size()); + assert(UseIdx < ExternUses.size()); + MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index f18c051..7399370 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -2559,7 +2559,7 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { for (unsigned i = 0; i < ResourceCount; ++i) { ReservedCyclesIndex[i] = NumUnits; NumUnits += SchedModel->getProcResource(i)->NumUnits; - if (isUnbufferedGroup(i)) { + if (isReservedGroup(i)) { auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin; for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits; U != UE; ++U) @@ -2631,7 +2631,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, assert(NumberOfInstances > 0 && "Cannot have zero instances of a ProcResource"); - if (isUnbufferedGroup(PIdx)) { + if (isReservedGroup(PIdx)) { // If any subunits are used by the instruction, report that the // subunits of the resource group are available at the first cycle // in which the unit is available, effectively removing the group diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index c0710c4..fdf1048 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2584,6 +2584,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Extra explicit operand on non-variadic instruction", MO, MONum); } + // Verify earlyClobber def operand + if (MCID.getOperandConstraint(MONum, MCOI::EARLY_CLOBBER) != -1) { + if (!MO->isReg()) + report("Early clobber must be a register", MI); + if (!MO->isEarlyClobber()) + report("Missing earlyClobber flag", MI); + } + switch (MO->getType()) { case MachineOperand::MO_Register: { // Verify debug flag on debug instructions. 
Check this first because reg0 diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 620d3d3..d738dc4 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -244,7 +244,7 @@ static bool canEmitMemcpy(const TargetMachine *TM, Function *F) { if (!TM) return true; const TargetLowering *TLI = TM->getSubtargetImpl(*F)->getTargetLowering(); - return TLI->getMemcpyName() != nullptr; + return TLI->getMemcpyImpl() != RTLIB::Unsupported; } // Return a value appropriate for use with the memset_pattern16 libcall, if diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 40a8907..61706e1 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -193,7 +193,6 @@ void ReachingDefInfo::processDefs(MachineInstr *MI) { for (auto &MO : MI->operands()) { if (MO.isFI()) { int FrameIndex = MO.getIndex(); - assert(FrameIndex >= 0 && "Can't handle negative frame indicies yet!"); if (!isFIDef(*MI, FrameIndex, TII)) continue; MBBFrameObjsReachingDefs[{MBBNumber, FrameIndex}].push_back(CurInstr); @@ -302,8 +301,6 @@ void ReachingDefInfo::print(raw_ostream &OS) { Register Reg; if (MO.isFI()) { int FrameIndex = MO.getIndex(); - assert(FrameIndex >= 0 && - "Can't handle negative frame indicies yet!"); Reg = Register::index2StackSlot(FrameIndex); } else if (MO.isReg()) { if (MO.isDef()) diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 38f6deb..99f7693 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1600,6 +1600,22 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, SlotIndex DefIndex = CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber()); VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); + + // Refine the subranges that are now defined by the remat. + // This will split existing subranges if necessary. + DstInt.refineSubRanges( + Alloc, DstMask, + [&DefIndex, &Alloc](LiveInterval::SubRange &SR) { + // We know that this lane is defined by this instruction, + // but at this point it might not be live because it was not defined + // by the original instruction. This happens when the + // rematerialization widens the defined register. Assign that lane a + // dead def so that the interferences are properly modeled. + if (!SR.liveAt(DefIndex)) + SR.createDeadDef(DefIndex, Alloc); + }, + *LIS->getSlotIndexes(), *TRI); + for (LiveInterval::SubRange &SR : DstInt.subranges()) { if ((SR.LaneMask & DstMask).none()) { LLVM_DEBUG(dbgs() @@ -1617,14 +1633,6 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, // updateRegDefUses. The original subrange def may have only undefed // some lanes. UpdatedSubRanges = true; - } else { - // We know that this lane is defined by this instruction, - // but at this point it might not be live because it was not defined - // by the original instruction. This happens when the - // rematerialization widens the defined register. Assign that lane a - // dead def so that the interferences are properly modeled. 
- if (!SR.liveAt(DefIndex)) - SR.createDeadDef(DefIndex, Alloc); } } if (UpdatedSubRanges) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 46c4bb8..816b7ba 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4046,6 +4046,8 @@ static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) { m_ConstInt(AndMask)))) { // Type Legalisation Pattern: // (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff) + if (BitWidthDiff.getZExtValue() >= BitWidth) + return SDValue(); unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue(); if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth)) return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a810..316aacd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ private: RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + SDValue ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + 
.setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. + SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? 
RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 379242e..ff6a7b2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -9257,21 +9257,22 @@ SDValue SelectionDAG::getMemcpy( // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); bool IsTailCall = false; - const char *MemCpyName = TLI->getMemcpyName(); + RTLIB::LibcallImpl MemCpyImpl = TLI->getMemcpyImpl(); if (OverrideTailCall.has_value()) { IsTailCall = *OverrideTailCall; } else { - bool LowersToMemcpy = StringRef(MemCpyName) == StringRef("memcpy"); + bool LowersToMemcpy = MemCpyImpl == RTLIB::impl_memcpy; IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemcpy); } CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee( - TLI->getLibcallCallingConv(RTLIB::MEMCPY), + TLI->getLibcallImplCallingConv(MemCpyImpl), Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(MemCpyName, TLI->getPointerTy(getDataLayout())), + getExternalSymbol(TLI->getLibcallImplName(MemCpyImpl).data(), + TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(IsTailCall); @@ -9361,22 +9362,24 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); + RTLIB::LibcallImpl MemmoveImpl = TLI->getLibcallImpl(RTLIB::MEMMOVE); + bool IsTailCall = false; if (OverrideTailCall.has_value()) { IsTailCall = *OverrideTailCall; } else { - bool LowersToMemmove = - TLI->getLibcallName(RTLIB::MEMMOVE) == StringRef("memmove"); + bool LowersToMemmove = MemmoveImpl == RTLIB::impl_memmove; IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemmove); } CLI.setDebugLoc(dl) .setChain(Chain) - .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee( + TLI->getLibcallImplCallingConv(MemmoveImpl), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallImplName(MemmoveImpl).data(), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(IsTailCall); @@ -9492,8 +9495,10 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, TLI->getPointerTy(DL)), std::move(Args)); } - bool LowersToMemset = - TLI->getLibcallName(RTLIB::MEMSET) == StringRef("memset"); + + RTLIB::LibcallImpl MemsetImpl = TLI->getLibcallImpl(RTLIB::MEMSET); + bool LowersToMemset = MemsetImpl == RTLIB::impl_memset; + // If we're going to use bzero, make sure not to tail call unless the // subsequent return doesn't need a value, as bzero doesn't return the first // arg unlike memset. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index fa0c899..9961c98 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3526,8 +3526,7 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) { // Update successor info. 
addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne()); - for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) { - BasicBlock *Dest = I.getIndirectDest(i); + for (BasicBlock *Dest : I.getIndirectDests()) { MachineBasicBlock *Target = FuncInfo.getMBB(Dest); Target->setIsInlineAsmBrIndirectTarget(); // If we introduce a type of asm goto statement that is permitted to use an @@ -5313,18 +5312,26 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { DAG.setRoot(OutChain); } -/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC -/// node. -void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, - unsigned Intrinsic) { - // Ignore the callsite's attributes. A specific call site may be marked with - // readnone, but the lowering code will expect the chain based on the - // definition. +/// Check if this intrinsic call depends on the chain (1st return value) +/// and if it only *loads* memory. +/// Ignore the callsite's attributes. A specific call site may be marked with +/// readnone, but the lowering code will expect the chain based on the +/// definition. +std::pair<bool, bool> +SelectionDAGBuilder::getTargetIntrinsicCallProperties(const CallBase &I) { const Function *F = I.getCalledFunction(); bool HasChain = !F->doesNotAccessMemory(); bool OnlyLoad = HasChain && F->onlyReadsMemory() && F->willReturn() && F->doesNotThrow(); + return {HasChain, OnlyLoad}; +} + +SmallVector<SDValue, 8> SelectionDAGBuilder::getTargetIntrinsicOperands( + const CallBase &I, bool HasChain, bool OnlyLoad, + TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Build the operand list. SmallVector<SDValue, 8> Ops; if (HasChain) { // If this intrinsic has side-effects, chainify it. @@ -5336,17 +5343,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } - // Info is set by getTgtMemIntrinsic - TargetLowering::IntrinsicInfo Info; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, - DAG.getMachineFunction(), - Intrinsic); - // Add the intrinsic ID as an integer operand if it's not a target intrinsic. - if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID || - Info.opc == ISD::INTRINSIC_W_CHAIN) - Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(), + if (!TgtMemIntrinsicInfo || TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_VOID || + TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_W_CHAIN) + Ops.push_back(DAG.getTargetConstant(I.getIntrinsicID(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout()))); // Add all operands of the call to the operand list. 
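The two flags computed by getTargetIntrinsicCallProperties() above drive all of the later chain handling; a condensed sketch of that mapping (hypothetical helper name, logic condensed from the surrounding code):

#include "llvm/IR/Function.h"
#include <utility>

// readnone                         -> HasChain = false (no ordering needed)
// readonly + willreturn + nounwind -> HasChain = true, OnlyLoad = true
//                                     (chain is parked in PendingLoads)
// any other memory effects         -> HasChain = true, OnlyLoad = false
//                                     (chain becomes the new DAG root)
std::pair<bool, bool> callProps(const llvm::Function *F) {
  bool HasChain = !F->doesNotAccessMemory();
  bool OnlyLoad = HasChain && F->onlyReadsMemory() && F->willReturn() &&
                  F->doesNotThrow();
  return {HasChain, OnlyLoad};
}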
@@ -5369,13 +5369,85 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } + if (std::optional<OperandBundleUse> Bundle = + I.getOperandBundle(LLVMContext::OB_convergencectrl)) { + Value *Token = Bundle->Inputs[0].get(); + SDValue ConvControlToken = getValue(Token); + assert(Ops.back().getValueType() != MVT::Glue && + "Did not expect another glue node here."); + ConvControlToken = + DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); + Ops.push_back(ConvControlToken); + } + + return Ops; +} + +SDVTList SelectionDAGBuilder::getTargetIntrinsicVTList(const CallBase &I, + bool HasChain) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); if (HasChain) ValueVTs.push_back(MVT::Other); - SDVTList VTs = DAG.getVTList(ValueVTs); + return DAG.getVTList(ValueVTs); +} + +/// Get an INTRINSIC node for a target intrinsic which does not touch memory. +SDValue SelectionDAGBuilder::getTargetNonMemIntrinsicNode( + const Type &IntrinsicVT, bool HasChain, ArrayRef<SDValue> Ops, + const SDVTList &VTs) { + if (!HasChain) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); + if (!IntrinsicVT.isVoidTy()) + return DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); + return DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); +} + +/// Set root, convert return type if necessary and check alignment. +SDValue SelectionDAGBuilder::handleTargetIntrinsicRet(const CallBase &I, + bool HasChain, + bool OnlyLoad, + SDValue Result) { + if (HasChain) { + SDValue Chain = Result.getValue(Result.getNode()->getNumValues() - 1); + if (OnlyLoad) + PendingLoads.push_back(Chain); + else + DAG.setRoot(Chain); + } + + if (I.getType()->isVoidTy()) + return Result; + + if (MaybeAlign Alignment = I.getRetAlign(); InsertAssertAlign && Alignment) { + // Insert `assertalign` node if there's an alignment. + Result = DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne()); + } else if (!isa<VectorType>(I.getType())) { + Result = lowerRangeToAssertZExt(DAG, I, Result); + } + + return Result; +} + +/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC +/// node. +void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, + unsigned Intrinsic) { + auto [HasChain, OnlyLoad] = getTargetIntrinsicCallProperties(I); + + // Info is set by getTgtMemIntrinsic + TargetLowering::IntrinsicInfo Info; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool IsTgtMemIntrinsic = + TLI.getTgtMemIntrinsic(Info, I, DAG.getMachineFunction(), Intrinsic); + + SmallVector<SDValue, 8> Ops = getTargetIntrinsicOperands( + I, HasChain, OnlyLoad, IsTgtMemIntrinsic ? &Info : nullptr); + SDVTList VTs = getTargetIntrinsicVTList(I, HasChain); // Propagate fast-math-flags from IR to node(s). SDNodeFlags Flags; @@ -5386,19 +5458,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Create the node. SDValue Result; - if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) { - auto *Token = Bundle->Inputs[0].get(); - SDValue ConvControlToken = getValue(Token); - assert(Ops.back().getValueType() != MVT::Glue && - "Did not expected another glue node here."); - ConvControlToken = - DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); - Ops.push_back(ConvControlToken); - } - // In some cases, custom collection of operands from CallInst I may be needed. 
TLI.CollectTargetIntrinsicOperands(I, Ops, DAG); - if (IsTgtIntrinsic) { + if (IsTgtMemIntrinsic) { // This is target intrinsic that touches memory // // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic @@ -5418,34 +5480,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, Info.ssid, Info.order, Info.failureOrder); Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, MemVT, MMO); - } else if (!HasChain) { - Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); - } else if (!I.getType()->isVoidTy()) { - Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); } else { - Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); + Result = getTargetNonMemIntrinsicNode(*I.getType(), HasChain, Ops, VTs); } - if (HasChain) { - SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1); - if (OnlyLoad) - PendingLoads.push_back(Chain); - else - DAG.setRoot(Chain); - } - - if (!I.getType()->isVoidTy()) { - if (!isa<VectorType>(I.getType())) - Result = lowerRangeToAssertZExt(DAG, I, Result); - - MaybeAlign Alignment = I.getRetAlign(); - - // Insert `assertalign` node if there's an alignment. - if (InsertAssertAlign && Alignment) { - Result = - DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne()); - } - } + Result = handleTargetIntrinsicRet(I, HasChain, OnlyLoad, Result); setValue(&I, Result); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 47e19f7..ed63bee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -727,6 +727,17 @@ private: MCSymbol *&BeginLabel); SDValue lowerEndEH(SDValue Chain, const InvokeInst *II, const BasicBlock *EHPadBB, MCSymbol *BeginLabel); + + std::pair<bool, bool> getTargetIntrinsicCallProperties(const CallBase &I); + SmallVector<SDValue, 8> getTargetIntrinsicOperands( + const CallBase &I, bool HasChain, bool OnlyLoad, + TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr); + SDVTList getTargetIntrinsicVTList(const CallBase &I, bool HasChain); + SDValue getTargetNonMemIntrinsicNode(const Type &IntrinsicVT, bool HasChain, + ArrayRef<SDValue> Ops, + const SDVTList &VTs); + SDValue handleTargetIntrinsicRet(const CallBase &I, bool HasChain, + bool OnlyLoad, SDValue Result); }; /// This struct represents the registers (physical or virtual) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index b3535eac..1cc591c 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -697,9 +697,11 @@ ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate( /// NOTE: The TargetMachine owns TLOF. TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) - : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.ExceptionModel, - TM.Options.FloatABIType, TM.Options.EABIVersion, - TM.Options.MCOptions.getABIName()) { + : TM(tm), + RuntimeLibcallInfo(TM.getTargetTriple(), TM.Options.ExceptionModel, + TM.Options.FloatABIType, TM.Options.EABIVersion, + TM.Options.MCOptions.getABIName()), + Libcalls(RuntimeLibcallInfo) { initActions(); // Perform these initializations only once. 
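One C++ subtlety in the TargetLoweringBase constructor change above: members are initialized in declaration order, not in the order they appear in the init list, so Libcalls(RuntimeLibcallInfo) is only well-defined if RuntimeLibcallInfo is declared before Libcalls in the class. A minimal sketch of the assumed header layout (the TargetLowering.h hunk is not part of this diff, and the member types shown are hypothetical):

class TargetLoweringBase {
  const TargetMachine &TM;
  // Declared (and therefore constructed) before Libcalls.
  RTLIB::RuntimeLibcallsInfo RuntimeLibcallInfo;
  RTLIB::RuntimeLibcallsInfo &Libcalls; // hypothetical: now a view of the member above
};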
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 6c78ef0..7496c5a 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -704,7 +704,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp index b22928b..0aefe6e 100644 --- a/llvm/lib/Demangle/MicrosoftDemangle.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp @@ -277,6 +277,18 @@ demanglePointerCVQualifiers(std::string_view &MangledName) { DEMANGLE_UNREACHABLE; } +static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head, + size_t Count) { + NodeArrayNode *N = Arena.alloc<NodeArrayNode>(); + N->Count = Count; + N->Nodes = Arena.allocArray<Node *>(Count); + for (size_t I = 0; I < Count; ++I) { + N->Nodes[I] = Head->N; + Head = Head->Next; + } + return N; +} + std::string_view Demangler::copyString(std::string_view Borrowed) { char *Stable = Arena.allocUnalignedBuffer(Borrowed.size()); // This is not a micro-optimization, it avoids UB, should Borrowed be an null @@ -323,8 +335,30 @@ Demangler::demangleSpecialTableSymbolNode(std::string_view &MangledName, } std::tie(STSN->Quals, IsMember) = demangleQualifiers(MangledName); - if (!consumeFront(MangledName, '@')) - STSN->TargetName = demangleFullyQualifiedTypeName(MangledName); + + NodeList *TargetCurrent = nullptr; + NodeList *TargetHead = nullptr; + size_t Count = 0; + while (!consumeFront(MangledName, '@')) { + ++Count; + + NodeList *Next = Arena.alloc<NodeList>(); + if (TargetCurrent) + TargetCurrent->Next = Next; + else + TargetHead = Next; + + TargetCurrent = Next; + QualifiedNameNode *QN = demangleFullyQualifiedTypeName(MangledName); + if (Error) + return nullptr; + assert(QN); + TargetCurrent->N = QN; + } + + if (Count > 0) + STSN->TargetNames = nodeListToNodeArray(Arena, TargetHead, Count); + return STSN; } @@ -1605,18 +1639,6 @@ Demangler::demangleNameScopePiece(std::string_view &MangledName) { return demangleSimpleName(MangledName, /*Memorize=*/true); } -static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head, - size_t Count) { - NodeArrayNode *N = Arena.alloc<NodeArrayNode>(); - N->Count = Count; - N->Nodes = Arena.allocArray<Node *>(Count); - for (size_t I = 0; I < Count; ++I) { - N->Nodes[I] = Head->N; - Head = Head->Next; - } - return N; -} - QualifiedNameNode * Demangler::demangleNameScopeChain(std::string_view &MangledName, IdentifierNode *UnqualifiedName) { diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp index 61e4961..17c6aab 100644 --- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp @@ -662,9 +662,9 @@ void VcallThunkIdentifierNode::output(OutputBuffer &OB, void SpecialTableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const { outputQualifiers(OB, Quals, false, true); Name->output(OB, Flags); - if (TargetName) { + if (TargetNames) { OB << "{for `"; - TargetName->output(OB, Flags); + TargetNames->output(OB, Flags, "'s `"); OB << "'}"; } } diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp 
b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp index 35da82a..7e1d528 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp @@ -184,9 +184,9 @@ class SymbolSearchContext { public: SymbolSearchContext(SymbolQuery &Q) : Q(Q) {} - bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); } + bool hasSearched(const LibraryInfo *Lib) const { return Searched.count(Lib); } - void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); } + void markSearched(const LibraryInfo *Lib) { Searched.insert(Lib); } inline bool allResolved() const { return Q.allResolved(); } @@ -194,7 +194,7 @@ public: private: SymbolQuery &Q; - DenseSet<LibraryInfo *> Searched; + DenseSet<const LibraryInfo *> Searched; }; void LibraryResolver::resolveSymbolsInLibrary( @@ -226,19 +226,18 @@ void LibraryResolver::resolveSymbolsInLibrary( return EnumerateResult::Continue; }, Opts); + }; + if (!Lib.hasFilter()) { + LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath() + << "\n";); + enumerateSymbolsIfNeeded(); if (DiscoveredSymbols.empty()) { LLVM_DEBUG(dbgs() << " No symbols and remove library : " << Lib.getFullPath() << "\n";); LibMgr.removeLibrary(Lib.getFullPath()); return; } - }; - - if (!Lib.hasFilter()) { - LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath() - << "\n";); - enumerateSymbolsIfNeeded(); SmallVector<StringRef> SymbolVec; SymbolVec.reserve(DiscoveredSymbols.size()); for (const auto &KV : DiscoveredSymbols) @@ -288,11 +287,15 @@ void LibraryResolver::searchSymbolsInLibraries( SymbolSearchContext Ctx(Q); while (!Ctx.allResolved()) { + std::vector<std::shared_ptr<LibraryInfo>> Libs; + LibMgr.getLibraries(S, K, Libs, [&](const LibraryInfo &Lib) { + return !Ctx.hasSearched(&Lib); + }); - for (auto &Lib : LibMgr.getView(S, K)) { - if (Ctx.hasSearched(Lib.get())) - continue; + if (Libs.empty() && !scanLibrariesIfNeeded(K, scanBatchSize)) + break; // no more new libs to scan + for (auto &Lib : Libs) { // can use Async here? 
resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options); Ctx.markSearched(Lib.get()); @@ -300,12 +303,6 @@ void LibraryResolver::searchSymbolsInLibraries( if (Ctx.allResolved()) return; } - - if (Ctx.allResolved()) - return; - - if (!scanLibrariesIfNeeded(K, scanBatchSize)) - break; // no more new libs to scan } }; diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp index d93f686..32f6dbe 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp @@ -50,7 +50,7 @@ void handleError(Error Err, StringRef context = "") { } bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) { - Triple HostTriple(sys::getDefaultTargetTriple()); + Triple HostTriple(sys::getProcessTriple()); Triple ObjTriple = Obj.makeTriple(); LLVM_DEBUG({ diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 95d954f..0c8565c 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -758,14 +758,12 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) { AbstractSlotTrackerStorage::~AbstractSlotTrackerStorage() = default; -namespace llvm { - //===----------------------------------------------------------------------===// // SlotTracker Class: Enumerate slot numbers for unnamed values //===----------------------------------------------------------------------===// /// This class provides computation of slot numbers for LLVM Assembly writing. /// -class SlotTracker : public AbstractSlotTrackerStorage { +class llvm::SlotTracker : public AbstractSlotTrackerStorage { public: /// ValueMap - A mapping of Values to slot numbers. using ValueMap = DenseMap<const Value *, unsigned>; @@ -943,8 +941,6 @@ private: void processDbgRecordMetadata(const DbgRecord &DVR); }; -} // end namespace llvm - ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M, const Function *F) : M(M), F(F), Machine(&Machine) {} diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp index 01dafca..bfba6e0 100644 --- a/llvm/lib/IR/DebugLoc.cpp +++ b/llvm/lib/IR/DebugLoc.cpp @@ -10,10 +10,11 @@ #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfo.h" +using namespace llvm; + #if LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN #include "llvm/Support/Signals.h" -namespace llvm { DbgLocOrigin::DbgLocOrigin(bool ShouldCollectTrace) { if (!ShouldCollectTrace) return; @@ -30,11 +31,8 @@ void DbgLocOrigin::addTrace() { auto &[Depth, StackTrace] = StackTraces.emplace_back(); Depth = sys::getStackTrace(StackTrace); } -} // namespace llvm #endif -using namespace llvm; - #if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L) : TrackingMDNodeRef(const_cast<DILocation *>(L)), DbgLocOrigin(!L), diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index d9357bb..6b1fd39 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -12,8 +12,9 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Compiler.h" -namespace llvm { +using namespace llvm; +namespace llvm { template <typename T> DbgRecordParamRef<T>::DbgRecordParamRef(const T *Param) : Ref(const_cast<T *>(Param)) {} @@ -28,6 +29,7 @@ template <typename T> T *DbgRecordParamRef<T>::get() const { template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DIExpression>; template class LLVM_EXPORT_TEMPLATE 
DbgRecordParamRef<DILabel>; template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILocalVariable>; +} // namespace llvm DbgVariableRecord::DbgVariableRecord(const DbgVariableIntrinsic *DVI) : DbgRecord(ValueKind, DVI->getDebugLoc()), @@ -755,5 +757,3 @@ iterator_range<simple_ilist<DbgRecord>::iterator> DbgMarker::cloneDebugInfoFrom( // We inserted a block at the end, return that range. return {First->getIterator(), StoredDbgRecords.end()}; } - -} // end namespace llvm diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp index 67f21d3..c41d7b3 100644 --- a/llvm/lib/IR/FPEnv.cpp +++ b/llvm/lib/IR/FPEnv.cpp @@ -19,9 +19,10 @@ #include "llvm/IR/Intrinsics.h" #include <optional> -namespace llvm { +using namespace llvm; -std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) { +std::optional<RoundingMode> +llvm::convertStrToRoundingMode(StringRef RoundingArg) { // For dynamic rounding mode, we use round to nearest but we will set the // 'exact' SDNodeFlag so that the value will not be rounded. return StringSwitch<std::optional<RoundingMode>>(RoundingArg) @@ -34,7 +35,8 @@ std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) { .Default(std::nullopt); } -std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) { +std::optional<StringRef> +llvm::convertRoundingModeToStr(RoundingMode UseRounding) { std::optional<StringRef> RoundingStr; switch (UseRounding) { case RoundingMode::Dynamic: @@ -62,7 +64,7 @@ std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) { } std::optional<fp::ExceptionBehavior> -convertStrToExceptionBehavior(StringRef ExceptionArg) { +llvm::convertStrToExceptionBehavior(StringRef ExceptionArg) { return StringSwitch<std::optional<fp::ExceptionBehavior>>(ExceptionArg) .Case("fpexcept.ignore", fp::ebIgnore) .Case("fpexcept.maytrap", fp::ebMayTrap) @@ -71,7 +73,7 @@ convertStrToExceptionBehavior(StringRef ExceptionArg) { } std::optional<StringRef> -convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { +llvm::convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { std::optional<StringRef> ExceptStr; switch (UseExcept) { case fp::ebStrict: @@ -87,7 +89,7 @@ convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { return ExceptStr; } -Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) { +Intrinsic::ID llvm::getConstrainedIntrinsicID(const Instruction &Instr) { Intrinsic::ID IID = Intrinsic::not_intrinsic; switch (Instr.getOpcode()) { case Instruction::FCmp: @@ -127,5 +129,3 @@ Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) { return IID; } - -} // namespace llvm diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 88dbd17..95edb2e 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1019,8 +1019,7 @@ Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True, const Twine &Name) { Value *Ret = CreateSelectFMF(C, True, False, {}, Name); if (auto *SI = dyn_cast<SelectInst>(Ret)) { - setExplicitlyUnknownBranchWeightsIfProfiled( - *SI, *SI->getParent()->getParent(), PassName); + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, PassName); } return Ret; } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8..cd39970 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void 
SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp index 39e5463c..c3e54a0 100644 --- a/llvm/lib/IR/Operator.cpp +++ b/llvm/lib/IR/Operator.cpp @@ -17,7 +17,8 @@ #include "ConstantsContext.h" -namespace llvm { +using namespace llvm; + bool Operator::hasPoisonGeneratingFlags() const { switch (getOpcode()) { case Instruction::Add: @@ -288,4 +289,3 @@ void FastMathFlags::print(raw_ostream &O) const { O << " afn"; } } -} // namespace llvm diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp index 4e27086..cb1b91a 100644 --- a/llvm/lib/IR/PassTimingInfo.cpp +++ b/llvm/lib/IR/PassTimingInfo.cpp @@ -32,10 +32,10 @@ using namespace llvm; #define DEBUG_TYPE "time-passes" -namespace llvm { +using namespace llvm; -bool TimePassesIsEnabled = false; -bool TimePassesPerRun = false; +bool llvm::TimePassesIsEnabled = false; +bool llvm::TimePassesPerRun = false; static cl::opt<bool, true> EnableTiming( "time-passes", cl::location(TimePassesIsEnabled), cl::Hidden, @@ -139,7 +139,7 @@ PassTimingInfo *PassTimingInfo::TheTimeInfo; } // namespace legacy } // namespace -Timer *getPassTimer(Pass *P) { +Timer *llvm::getPassTimer(Pass *P) { legacy::PassTimingInfo::init(); if (legacy::PassTimingInfo::TheTimeInfo) return legacy::PassTimingInfo::TheTimeInfo->getPassTimer(P, P); @@ -148,7 +148,7 @@ Timer *getPassTimer(Pass *P) { /// If timing is enabled, report the times collected up to now and then reset /// them. -void reportAndResetTimings(raw_ostream *OutStream) { +void llvm::reportAndResetTimings(raw_ostream *OutStream) { if (legacy::PassTimingInfo::TheTimeInfo) legacy::PassTimingInfo::TheTimeInfo->print(OutStream); } @@ -315,5 +315,3 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) { PIC.registerAfterAnalysisCallback( [this](StringRef P, Any) { this->stopAnalysisTimer(P); }); } - -} // namespace llvm diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index fc2be51..94dbe1f 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -274,9 +274,12 @@ void llvm::setExplicitlyUnknownBranchWeights(Instruction &I, } void llvm::setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, - Function &F, - StringRef PassName) { - if (std::optional<Function::ProfileCount> EC = F.getEntryCount(); + StringRef PassName, + const Function *F) { + F = F ? 
F : I.getFunction(); + assert(F && "Either pass an instruction attached to a Function, or explicitly " + "pass the Function that it will be attached to"); + if (std::optional<Function::ProfileCount> EC = F->getEntryCount(); EC && EC->getCount() > 0) setExplicitlyUnknownBranchWeights(I, PassName); } diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp index 59f218c..3c05f4b 100644 --- a/llvm/lib/IR/PseudoProbe.cpp +++ b/llvm/lib/IR/PseudoProbe.cpp @@ -19,9 +19,7 @@ using namespace llvm; -namespace llvm { - -std::optional<PseudoProbe> +static std::optional<PseudoProbe> extractProbeFromDiscriminator(const DILocation *DIL) { if (DIL) { auto Discriminator = DIL->getDiscriminator(); @@ -43,7 +41,7 @@ extractProbeFromDiscriminator(const DILocation *DIL) { return std::nullopt; } -std::optional<PseudoProbe> +static std::optional<PseudoProbe> extractProbeFromDiscriminator(const Instruction &Inst) { assert(isa<CallBase>(&Inst) && !isa<IntrinsicInst>(&Inst) && "Only call instructions should have pseudo probe encodes as their " @@ -53,7 +51,7 @@ extractProbeFromDiscriminator(const Instruction &Inst) { return std::nullopt; } -std::optional<PseudoProbe> extractProbe(const Instruction &Inst) { +std::optional<PseudoProbe> llvm::extractProbe(const Instruction &Inst) { if (const auto *II = dyn_cast<PseudoProbeInst>(&Inst)) { PseudoProbe Probe; Probe.Id = II->getIndex()->getZExtValue(); @@ -73,7 +71,7 @@ std::optional<PseudoProbe> extractProbe(const Instruction &Inst) { return std::nullopt; } -void setProbeDistributionFactor(Instruction &Inst, float Factor) { +void llvm::setProbeDistributionFactor(Instruction &Inst, float Factor) { assert(Factor >= 0 && Factor <= 1 && "Distribution factor must be in [0, 1.0]"); if (auto *II = dyn_cast<PseudoProbeInst>(&Inst)) { @@ -111,5 +109,3 @@ void setProbeDistributionFactor(Instruction &Inst, float Factor) { } } } - -} // namespace llvm diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp index 962368f..b3586b4 100644 --- a/llvm/lib/IR/ReplaceConstant.cpp +++ b/llvm/lib/IR/ReplaceConstant.cpp @@ -16,7 +16,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" -namespace llvm { +using namespace llvm; static bool isExpandableUser(User *U) { return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U); } @@ -49,10 +49,10 @@ static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt, return NewInsts; } -bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, - Function *RestrictToFunc, - bool RemoveDeadConstants, - bool IncludeSelf) { +bool llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, + Function *RestrictToFunc, + bool RemoveDeadConstants, + bool IncludeSelf) { // Find all expandable direct users of Consts.
SmallVector<Constant *> Stack; for (Constant *C : Consts) { @@ -121,5 +121,3 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, return Changed; } - -} // namespace llvm diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 77af29b..f4c5c6f 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringTable.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/xxhash.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -17,11 +19,17 @@ using namespace llvm; using namespace RTLIB; +#define GET_RUNTIME_LIBCALLS_INFO #define GET_INIT_RUNTIME_LIBCALL_NAMES #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS #define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME #include "llvm/IR/RuntimeLibcalls.inc" +RuntimeLibcallsInfo::RuntimeLibcallsInfo(const Module &M) + : RuntimeLibcallsInfo(M.getTargetTriple()) { + // TODO: Consider module flags +} + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, @@ -72,3 +80,80 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { return false; } } + +std::pair<FunctionType *, AttributeList> +RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT, + const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const { + static constexpr Attribute::AttrKind CommonFnAttrs[] = { + Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync, + Attribute::NoUnwind, Attribute::WillReturn}; + + switch (LibcallImpl) { + case RTLIB::impl___sincos_stret: + case RTLIB::impl___sincosf_stret: { + if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected + return {}; + + Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret + ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + + AttrBuilder FuncAttrBuilder(Ctx); + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + + const bool UseSret = + TT.isX86_32() || ((TT.isARM() || TT.isThumb()) && + ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS); + + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly( + UseSret ? ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + if (UseSret) { + AttrBuilder AttrBuilder(Ctx); + StructType *StructTy = StructType::get(ScalarTy, ScalarTy); + AttrBuilder.addStructRetAttr(StructTy); + AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy)); + FunctionType *FuncTy = FunctionType::get( + Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false); + + return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)}; + } + + Type *RetTy = + LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64() + ? 
static_cast<Type *>(FixedVectorType::get(ScalarTy, 2)) + : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy)); + + return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs}; + } + case RTLIB::impl_sqrtf: + case RTLIB::impl_sqrt: { + AttrBuilder FuncAttrBuilder(Ctx); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false); + + Attrs = Attrs.addRetAttribute( + Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal | + fcNegNormal)); + return {FuncTy, Attrs}; + } + default: + return {}; + } + + return {}; +} diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp index 67882ba..5042335 100644 --- a/llvm/lib/IR/Use.cpp +++ b/llvm/lib/IR/Use.cpp @@ -9,7 +9,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" -namespace llvm { +using namespace llvm; void Use::swap(Use &RHS) { if (Val == RHS.Val) @@ -42,5 +42,3 @@ void Use::zap(Use *Start, const Use *Stop, bool del) { if (del) ::operator delete(Start); } - -} // namespace llvm diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index ab44cb4..9bb7c12 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -11,8 +11,11 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IntrinsicInst.h" +using namespace llvm; + namespace llvm { class BasicBlock; +} //===----------------------------------------------------------------------===// // User Class @@ -214,5 +217,3 @@ LLVM_NO_SANITIZE_MEMORY_ATTRIBUTE void User::operator delete(void *Usr) { ::operator delete(Storage); } } - -} // namespace llvm diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 95d61a9..b775cbb 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -148,10 +148,18 @@ void Value::destroyValueName() { } bool Value::hasNUses(unsigned N) const { + if (!UseList) + return N == 0; + + // TODO: Disallow for ConstantData and remove !UseList check? return hasNItems(use_begin(), use_end(), N); } bool Value::hasNUsesOrMore(unsigned N) const { + // TODO: Disallow for ConstantData and remove !UseList check? 
+ if (!UseList) + return N == 0; + return hasNItemsOrMore(use_begin(), use_end(), N); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 7917712..24f90bf 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -136,9 +136,7 @@ static cl::opt<bool> VerifyNoAliasScopeDomination( cl::desc("Ensure that llvm.experimental.noalias.scope.decl for identical " "scopes are not dominating")); -namespace llvm { - -struct VerifierSupport { +struct llvm::VerifierSupport { raw_ostream *OS; const Module &M; ModuleSlotTracker MST; @@ -318,8 +316,6 @@ public: } }; -} // namespace llvm - namespace { class Verifier : public InstVisitor<Verifier>, VerifierSupport { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 23be42f..fefc733 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1396,11 +1396,10 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) { RTLIB::RuntimeLibcallsInfo Libcalls(TT); SmallVector<const char *> LibcallSymbols; - ArrayRef<RTLIB::LibcallImpl> LibcallImpls = Libcalls.getLibcallImpls(); - LibcallSymbols.reserve(LibcallImpls.size()); + LibcallSymbols.reserve(Libcalls.getNumAvailableLibcallImpls()); - for (RTLIB::LibcallImpl Impl : LibcallImpls) { - if (Impl != RTLIB::Unsupported) + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (Libcalls.isAvailable(Impl)) LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data()); } diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp index 5e37137..d693ea3 100644 --- a/llvm/lib/MC/SPIRVObjectWriter.cpp +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" @@ -17,8 +18,10 @@ using namespace llvm; void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { constexpr uint32_t MagicNumber = 0x07230203; constexpr uint32_t GeneratorID = 43; - constexpr uint32_t GeneratorMagicNumber = - (GeneratorID << 16) | (LLVM_VERSION_MAJOR); + const uint32_t GeneratorMagicNumber = + Asm.getContext().getTargetTriple().getVendor() == Triple::AMD + ? UINT16_MAX + : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR)); constexpr uint32_t Schema = 0; W.write<uint32_t>(MagicNumber); diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index e09dc94..c2f4560 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } -ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset, +ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset, uint64_t Size) const { return arrayRefFromStringRef(getData().substr(Offset, Size)); } Expected<ArrayRef<uint8_t>> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { - uint32_t Offset; + uint64_t Offset; uint64_t Size; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Offset = Sect.offset; Size = Sect.size; + // Check for large mach-o files where the section contents might exceed + // 4GB. MachO::section_64 objects only have 32 bit file offsets to the + // section contents and can overflow in dSYM files. We can track this and + // adjust the section offset to be 64 bit safe. 
If sections overflow then + section ordering is enforced. If sections are not ordered, then an error + will be returned, preventing invalid section data from being returned. + uint64_t PrevTrueOffset = 0; + uint64_t SectOffsetAdjust = 0; + for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) { + MachO::section_64 CurrSect = + getStruct<MachO::section_64>(*this, Sections[SectIdx]); + uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust; + if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset)) + return malformedError("section data exceeds 4GB and section file " + "offsets are not ordered"); + const uint64_t EndSectFileOffset = + (uint64_t)CurrSect.offset + CurrSect.size; + if (EndSectFileOffset > UINT32_MAX) + SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull; + PrevTrueOffset = CurrTrueOffset; + } + Offset += SectOffsetAdjust; } else { MachO::section Sect = getSection(Sec); Offset = Sect.offset; diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index e5e5fc2..29f2916 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -37,8 +37,6 @@ unsigned Object::getMachine() const { return *Header.Machine; return llvm::ELF::EM_NONE; } - -constexpr StringRef SectionHeaderTable::TypeStr; } // namespace ELFYAML namespace yaml { diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 3c9a27a..40ceb6f 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -67,6 +67,7 @@ #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RuntimeLibcallInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionDivision.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac0..3f41618 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication( static cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableHotColdSplit("hot-cold-split", diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 1853cdd..d870f99 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -35,6 +35,7 @@ MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis()) MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis()) MODULE_ANALYSIS("reg-usage", PhysicalRegisterUsageAnalysis()) +MODULE_ANALYSIS("runtime-libcall-info", RuntimeLibraryAnalysis()) MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis()) MODULE_ANALYSIS("verify", VerifierAnalysis()) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 708e79d..6c140be 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -34,6 +34,31 @@ static bool isContextFalsey(const json::Value *V) { return isFalsey(*V); } +static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) { + size_t CurrentPos = 0; + while (CurrentPos < Str.size()) { + // Find the next delimiter.
+ size_t DelimiterPos = Str.find('.', CurrentPos); + + // If no delimiter is found, process the rest of the string. + if (DelimiterPos == StringRef::npos) + DelimiterPos = Str.size(); + + // Get the current part, which may have whitespace. + StringRef Part = Str.slice(CurrentPos, DelimiterPos); + + // Manually trim the part without creating a new string object. + size_t Start = Part.find_first_not_of(" \t\r\n"); + if (Start != StringRef::npos) { + size_t End = Part.find_last_not_of(" \t\r\n"); + Tokens.push_back(Part.slice(Start, End + 1)); + } + + // Move past the delimiter for the next iteration. + CurrentPos = DelimiterPos + 1; + } +} + static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // We split the mustache string into an accessor. // For example: @@ -46,13 +71,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // It's a literal, so it doesn't need to be saved. Tokens.push_back("."); } else { - while (!Str.empty()) { - StringRef Part; - std::tie(Part, Str) = Str.split('.'); - // Each part of the accessor needs to be saved to the arena - // to ensure it has a stable address. - Tokens.push_back(Ctx.Saver.save(Part.trim())); - } + splitAndTrim(Str, Tokens); } // Now, allocate memory for the array of StringRefs in the arena. StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); @@ -368,141 +387,99 @@ struct Tag { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? 
' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first - << ", NewClose: " << Ret.second << "\n"); - return Ret; -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open - << "', Close:'" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. + ++Cursor; + continue; } - // Add the text before the tag. - if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Found a tag, first add the preceding text. + if (Cursor > TextStart) + Tokens.emplace_back(Template.slice(TextStart, Cursor)); + + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. + Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? 
' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. - Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. - // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) + Tokens.emplace_back(Template.substr(TextStart)); + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. - // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); @@ -622,9 +599,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) + RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index afce803..8ad20b4 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -46,12 +46,11 @@ using namespace llvm; // Context //===----------------------------------------------------------------------===// -namespace llvm::detail { /// This class represents the internal implementation of the RecordKeeper. /// It contains all of the contextual static state of the Record classes. It is /// kept out-of-line to simplify dependencies, and also make it easier for /// internal classes to access the uniquer state of the keeper. 
-struct RecordKeeperImpl { +struct detail::RecordKeeperImpl { RecordKeeperImpl(RecordKeeper &RK) : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK), SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK), @@ -99,7 +98,6 @@ struct RecordKeeperImpl { void dumpAllocationStats(raw_ostream &OS) const; }; -} // namespace llvm::detail void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const { // Dump memory allocation related stats. diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 1b5a713..34c85d5 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError def CSR_Win_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>; +def CSR_Win_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +def CSR_Win_AArch64_RT_AllRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>; + // The Control Flow Guard check call uses a custom calling convention that also // preserves X0-X8 and Q0-Q7. def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cf34498..18e246e 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -81,10 +81,7 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - using BaseKind = enum { - RegBase, - FrameIndexBase - }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 0f7b34c..3ee4d58 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2380,13 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot( return; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) { - LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " - "sized objects or realignment\n"); - return; - } - // If another calling convention is explicitly set FPRs can't be promoted to // ZPR callee-saves. if (!is_contained({CallingConv::C, CallingConv::Fast, @@ -2402,6 +2395,7 @@ void AArch64FrameLowering::determineStackHazardSlot( assert(Subtarget.isSVEorStreamingSVEAvailable() && "Expected SVE to be available for PPRs"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); // With SplitSVEObjects the CS hazard padding is placed between the // PPRs and ZPRs. If there are any FPR CS there would be a hazard between // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs. 
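The two new Windows save lists above build on each other: CSR_Win_AArch64_RT_MostRegs adds X9-X15 to the Windows AAPCS set, and CSR_Win_AArch64_RT_AllRegs further adds Q8-Q31. They are selected for preserve_most/preserve_all functions in AArch64RegisterInfo.cpp further below. A hypothetical source-level example of code that would exercise them (illustrative only):

// Callees with these conventions preserve the extended register sets, letting
// hot callers keep more values live across the call.
__attribute__((preserve_most)) void on_trace_event(void *Ctx);
__attribute__((preserve_all)) void on_rare_slowpath(void *Ctx);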
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e..40e6400 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -50,6 +50,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -104,7 +105,6 @@ #include <vector> using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-lower" @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. @@ -1180,6 +1174,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE); + setTargetDAGCombine(ISD::CTPOP); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; @@ -5346,35 +5341,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -7723,8 +7689,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: @@ -11367,9 +11331,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, break; } + // Note: This lowering only overrides NEON for v1i64 and v2i64, where we + // prefer using SVE if available. 
if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT( - VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { + useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) { switch (Opcode) { default: llvm_unreachable("Wrong instruction"); @@ -17591,6 +17556,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( // udot instruction. if (SrcWidth * 4 <= DstWidth) { if (all_of(I->users(), [&](auto *U) { + using namespace llvm::PatternMatch; auto *SingleUser = cast<Instruction>(&*U); if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value())))) return true; @@ -17862,6 +17828,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // into shift / and masks. For the moment we do this just for uitofp (not // zext) to avoid issues with widening instructions. if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) { + using namespace llvm::PatternMatch; return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) && SI->getType()->getScalarSizeInBits() * 4 == SI->user_back()->getType()->getScalarSizeInBits(); @@ -27878,6 +27845,35 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } +static SDValue performCTPOPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + using namespace llvm::SDPatternMatch; + if (!DCI.isBeforeLegalize()) + return SDValue(); + + // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask)) + SDValue Mask; + if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask))))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT MaskVT = Mask.getValueType(); + + if (VT.isVector() || !MaskVT.isFixedLengthVector() || + MaskVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ReduceInVT = + EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount()); + + SDLoc DL(N); + // Sign extend to best fit ZeroOrNegativeOneBooleanContent. 
+ SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask); + SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask); + return DAG.getNegative(NegPopCount, DL, VT); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -28223,6 +28219,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performScalarToVectorCombine(N, DCI, DAG); case ISD::SHL: return performSHLCombine(N, DCI, DAG); + case ISD::CTPOP: + return performCTPOPCombine(N, DCI, DAG); } return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed2..70bfae7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ private: SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index 30b7b03..52b216c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -197,6 +197,12 @@ def G_SMULL : AArch64GenericInstruction { let hasSideEffects = 0; } +def G_PMULL : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src1, type1:$src2); + let hasSideEffects = 0; +} + def G_UADDLP : AArch64GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src1); @@ -273,6 +279,7 @@ def : GINodeEquiv<G_FCMGT, AArch64fcmgt>; def : GINodeEquiv<G_BSP, AArch64bsp>; +def : GINodeEquiv<G_PMULL, AArch64pmull>; def : GINodeEquiv<G_UMULL, AArch64umull>; def : GINodeEquiv<G_SMULL, AArch64smull>; diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e69fa32..2ab7bf1 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (MOP.isReg() && MOP.isKill()) DefinedInBB.addReg(MOP.getReg()); + // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but + // only copies implicit defs and makes sure that each operand is only added + // once in case of duplicates. + auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1, + MachineBasicBlock::iterator MI2) { + SmallSetVector<Register, 4> Ops; + for (const MachineOperand &MO : + llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (const MachineOperand &MO : + llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (auto Op : Ops) + MIB.addDef(Op, RegState::Implicit); + }; + CopyImplicitOps(I, Paired); + // Erase the old instructions. 
I->eraseFromParent(); Paired->eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 45b7120..4df4d54 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -805,7 +805,7 @@ void AArch64PrologueEmitter::emitPrologue() { CFAOffset += SVEAllocs.BeforePPRs; assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs, + allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs); CFAOffset += SVEAllocs.AfterPPRs; @@ -1318,6 +1318,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF, SEHEpilogueStartI = MBB.end(); } +void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI, + StackOffset Offset) { + // Other combinations could be supported, but are not currently needed. + assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 && + "expected negative offset (with optional fixed portion)"); + Register Base = AArch64::FP; + if (int64_t FixedOffset = Offset.getFixed()) { + // If we have a negative fixed offset, we need to first subtract it in a + // temporary register first (to avoid briefly deallocating the scalable + // portion of the offset). + Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP, + StackOffset::getFixed(FixedOffset), TII, + MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base, + StackOffset::getScalable(Offset.getScalable()), TII, + MachineInstr::FrameDestroy); +} + void AArch64EpilogueEmitter::emitEpilogue() { MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); if (MBB.end() != EpilogueEndI) { @@ -1418,6 +1438,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { AfterCSRPopSize += ProloguePopSize; } } + // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1483,7 +1504,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); - MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; // Deallocate the SVE area. if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -1510,28 +1530,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP : AArch64::SP; if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { - // TODO: Support stack realigment and variable-sized objects. - assert( - SVELayout != SVEStackLayout::Split && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). 
- CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); + if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the ZPR saves. + StackOffset FPOffsetZPR = + -SVECalleeSavesSize - PPR.LocalsSize - + StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset()); + // Deallocate the stack space by moving the SP to the start of the + // ZPR/PPR callee-save area. + moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR); + } + // With split SVE, the predicates are stored in a separate area above the + // ZPR saves, so we must adjust the stack to the start of the PPRs. + if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the PPR saves. + StackOffset FPOffsetPPR = -PPR.CalleeSavesSize; + // Move to the start of the PPR area. + assert(!FPOffsetPPR.getFixed() && "expected only scalable offset"); + emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP, + FPOffsetPPR, TII, MachineInstr::FrameDestroy); } - // The code below will deallocate the stack space space by moving the SP - // to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); } else if (BaseForSVEDealloc == AArch64::SP) { auto NonSVELocals = StackOffset::getFixed(NumBytes); auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) + diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index 6e0e283..7f297b5 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -180,6 +180,10 @@ public: private: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + /// A helper for moving the SP to a negative offset from the FP, without + /// deallocating any stack in the range FP to FP + Offset. + void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset); + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5bfb19d9..a5048b9 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) return getDarwinCalleeSavedRegs(MF); + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? CSR_Win_AArch64_RT_MostRegs_SaveList + : CSR_AArch64_RT_MostRegs_SaveList; + + if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ?
CSR_Win_AArch64_RT_AllRegs_SaveList + : CSR_AArch64_RT_AllRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) return CSR_Win_AArch64_CFGuard_Check_SaveList; if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) { @@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail) return CSR_AArch64_AAPCS_SwiftTail_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) - return CSR_AArch64_RT_MostRegs_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) - return CSR_AArch64_RT_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) // This is for OSes other than Windows; Windows is a separate case further // above. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 5b5565a..197aae6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3007,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { llvm_unreachable("Unsupported register kind"); } -bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy) const { +bool AArch64TTIImpl::isSingleExtWideningInstruction( + unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { // A helper that returns a vector type from the given type. The number of // elements in type Ty determines the vector width. auto toVectorTy = [&](Type *ArgTy) { @@ -3027,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) return false; - // Determine if the operation has a widening variant. We consider both the - // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the - // instructions. - // - // TODO: Add additional widening operations (e.g., shl, etc.) once we - // verify that their extending operands are eliminated during code - // generation. Type *SrcTy = SrcOverrideTy; switch (Opcode) { - case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). - case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Add: // UADDW(2), SADDW(2). + case Instruction::Sub: { // USUBW(2), SSUBW(2). // The second operand needs to be an extend if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType()); - } else + break; + } + + if (Opcode == Instruction::Sub) return false; - break; - case Instruction::Mul: { // SMULL(2), UMULL(2) - // Both operands need to be extends of the same type. - if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || - (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + + // UADDW(2), SADDW(2) can be commuted. + if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType()); - } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) { - // If one of the operands is a Zext and the other has enough zero bits to - // be treated as unsigned, we can still general a umull, meaning the zext - // is free. - KnownBits Known = - computeKnownBits(isa<ZExtInst>(Args[0]) ?
Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3099,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext. + unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? 
Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3159,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required; the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). if (SingleUser->getOpcode() == Instruction::Add && @@ -3168,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. + } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4148,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends, the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4381,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28.
+ if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + @@ -6129,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } static bool containsDecreasingPointers(Loop *TheLoop, - PredicatedScalarEvolution *PSE) { + PredicatedScalarEvolution *PSE, + const DominatorTree &DT) { const auto &Strides = DenseMap<Value *, const SCEV *>(); for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for addresses that are @@ -6138,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop, if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, - /*ShouldCheckWrap=*/false) + if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) .value_or(0) < 0) return true; } @@ -6184,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { // negative strides. This will require extra work to reverse the loop // predicate, which may be expensive. if (containsDecreasingPointers(TFI->LVL->getLoop(), - TFI->LVL->getPredicatedScalarEvolution())) + TFI->LVL->getPredicatedScalarEvolution(), + *TFI->LVL->getDominatorTree())) Required |= TailFoldingOpts::Reverse; if (Required == TailFoldingOpts::Disabled) Required |= TailFoldingOpts::Simple; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index b39546a..e62fdb6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'. // @@ -304,7 +312,7 @@ public: } bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const { - if (!ST->hasSVE()) + if (!ST->isSVEorStreamingSVEAvailable()) return false; // For fixed vectors, avoid scalarization if using SVE for them.
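The AArch64 cost-model hunks above all encode one rule: a long-form NEON widening operation (smull/umull, saddl/uaddl and friends) computes at double the widest source element width, and any remaining widening to the destination type is charged as a separate extend. A minimal standalone sketch of that rule, using illustrative names that are not part of the LLVM API:

```cpp
#include <algorithm>
#include <optional>

// Width (in bits) a widened add/sub/mul could execute at, or nullopt when no
// long-form instruction applies. A hypothetical helper mirroring the "double
// the largest input size" rule used by isBinExtWideningInstruction above.
std::optional<unsigned> widenedEltBits(unsigned Src0Bits, unsigned Src1Bits,
                                       unsigned DstBits) {
  unsigned MaxSrc = std::max(Src0Bits, Src1Bits);
  if (MaxSrc * 2 > DstBits) // sources too wide for a single widening op
    return std::nullopt;
  return MaxSrc * 2; // e.g. i8 x i8 -> i16 via smull, then i16 -> i32 extend
}
```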
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 5f93847..038ad77 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1809,6 +1809,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(TargetOpcode::G_FMAXNUM); case Intrinsic::aarch64_neon_fminnm: return LowerBinOp(TargetOpcode::G_FMINNUM); + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_pmull64: + return LowerBinOp(AArch64::G_PMULL); case Intrinsic::aarch64_neon_smull: return LowerBinOp(AArch64::G_SMULL); case Intrinsic::aarch64_neon_umull: diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 6d2d705..6b920f0 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -560,6 +560,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, case TargetOpcode::G_FCMP: case TargetOpcode::G_LROUND: case TargetOpcode::G_LLROUND: + case AArch64::G_PMULL: return true; case TargetOpcode::G_INTRINSIC: switch (cast<GIntrinsic>(MI).getIntrinsicID()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 8669978..56ab040 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -38,9 +38,10 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - NOT_IMPLICIT_INPUT = 0, + UNKNOWN_INTRINSIC = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, + NOT_IMPLICIT_INPUT }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return NOT_IMPLICIT_INPUT; + return UNKNOWN_INTRINSIC; } } @@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); + + if (AttrMask == UNKNOWN_INTRINSIC) { + // Assume not-nocallback intrinsics may invoke a function which accesses + // implicit arguments. + // + // FIXME: This isn't really the correct check. We want to ensure it + // isn't calling any function that may use implicit arguments regardless + // of whether it's internal to the module or not. + // + // TODO: Ignoring callsite attributes. + if (!Callee->hasFnAttribute(Attribute::NoCallback)) + return indicatePessimisticFixpoint(); + continue; + } + if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - return true; + + // Assume !nocallback intrinsics may call a function which requires + // AGPRs. 
+ return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 0c97741..15ed60b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -602,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); + I.getOperand(0).setIsEarlyClobber(true); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -3787,6 +3794,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } @@ -6753,7 +6764,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6806,8 +6817,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 0a59132..fdff21b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() { } else if (isa<SelectInst>(I)) { if (MaybeRsrc) { if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) { - ConditionalTemps.push_back(RsrcInst); - RsrcInst->replaceAllUsesWith(*MaybeRsrc); + // Guard against conditionals that were already folded away. 
+ if (RsrcInst != *MaybeRsrc) { + ConditionalTemps.push_back(RsrcInst); + RsrcInst->replaceAllUsesWith(*MaybeRsrc); + } } for (Value *V : Seen) FoundRsrcs[V] = *MaybeRsrc; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca..1a14629 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce69..1682abb 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ public: bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9..62172a0 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); // Immediates are not an issue and can be propagated in // postrapseudos pass. Only handle cases where defining diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..31eca04 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? 
- SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 6616b30..84984a0 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1129,40 +1129,11 @@ bool SIFoldOperandsImpl::tryToFoldACImm( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; } - // TODO: Verify the following code handles subregisters correctly. - // TODO: Handle extract of global reference - if (UseOp.getSubReg()) - return false; - - if (!OpToFold.isReg()) - return false; - - Register UseReg = OpToFold.getReg(); - if (!UseReg.isVirtual()) - return false; - - // Maybe it is just a COPY of an immediate itself. - - // FIXME: Remove this handling. There is already special case folding of - // immediate into copy in foldOperand. This is looking for the def of the - // value the folding started from in the first place. - MachineInstr *Def = MRI->getVRegDef(UseReg); - if (Def && TII->isFoldableCopy(*Def)) { - MachineOperand &DefOp = Def->getOperand(1); - if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { - FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC, - OpToFold.DefSubReg); - appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm); - return true; - } - } - return false; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a..aa5ea77 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 45f5919..9460145 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7945,7 +7945,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7985,7 +7985,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); 
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -8183,7 +8183,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -9199,7 +9199,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21..0643b53 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -172,7 +172,7 @@ private: void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e11..8785968 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: @@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4ae2c1e..31d8bce4 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", 
F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 14e1160..88d3b6f 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -86,7 +86,7 @@ namespace { // All possible address modes, plus some. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b06534..92fae71 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. 
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. - SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b..bc2fec3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9b250e6..24f58a6 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { // static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, - const LoopAccessInfo *LAI) { + const LoopAccessInfo *LAI, + const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); // If there are live-out values, it is probably a reduction. 
We can predicate @@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); + int64_t NextStride = + getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0); if (NextStride == 1) { // TODO: for now only allow consecutive strides of 1. We could support // other strides as long as it is uniform, but let's keep it simple @@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } - return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); + return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), + *LVL->getDominatorTree()); } TailFoldingStyle diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index b2a8204..abe081c 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -176,10 +176,6 @@ void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { if (const GlobalValue *GV = Op.getGlobal()) if (GV->getName() == BPF_TRAP) SawTrapCall = true; - } else if (Op.isSymbol()) { - if (const MCSymbol *Sym = Op.getMCSymbol()) - if (Sym->getName() == BPF_TRAP) - SawTrapCall = true; } } } @@ -219,6 +215,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const Function &F = MF->getFunction(); + + MCSection *Sec = OutStreamer->getCurrentSectionOnly(); + MCSymbol *SecStart = Sec->getBeginSymbol(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); @@ -231,8 +231,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { MCSymbol *JTStart = getJTPublicSymbol(JTI); OutStreamer->emitLabel(JTStart); for (const MachineBasicBlock *MBB : JTBBs) { - const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - OutStreamer->emitValue(LHS, EntrySize); + const MCExpr *Diff = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB->getSymbol(), OutContext), + MCSymbolRefExpr::create(SecStart, OutContext), OutContext); + OutStreamer->emitValue(Diff, EntrySize); } const MCExpr *JTSize = MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index eb4c884..677203d 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -285,6 +285,13 @@ void DXContainerGlobals::addPipelineStateValidationInfo( PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX; PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY; PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ; + if (MMI.EntryPropertyVec[0].WaveSizeMin) { + PSV.BaseData.MinimumWaveLaneCount = MMI.EntryPropertyVec[0].WaveSizeMin; + PSV.BaseData.MaximumWaveLaneCount = + MMI.EntryPropertyVec[0].WaveSizeMax + ? 
MMI.EntryPropertyVec[0].WaveSizeMax + : MMI.EntryPropertyVec[0].WaveSizeMin; + } break; default: break; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 7ae500a..67437f6 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { let attributes = [Attributes<DXIL1_0, []>]; } +def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> { + let Doc = "returns the float16 stored in the low-half of the uint converted " + "to a float"; + let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>]; + let arguments = [Int32Ty]; + let result = FloatTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def WaveAllBitCount : DXILOp<135, waveAllOp> { let Doc = "returns the count of bits set to 1 across the wave"; let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>]; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index cf8b833..e1a472f 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -82,6 +82,7 @@ enum class EntryPropsTag { ASStateTag, WaveSize, EntryRootSig, + WaveRange = 23, }; } // namespace @@ -177,14 +178,15 @@ getTagValueAsMetadata(EntryPropsTag Tag, uint64_t Value, LLVMContext &Ctx) { case EntryPropsTag::ASStateTag: case EntryPropsTag::WaveSize: case EntryPropsTag::EntryRootSig: + case EntryPropsTag::WaveRange: llvm_unreachable("NYI: Unhandled entry property tag"); } return MDVals; } -static MDTuple * -getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, - const Triple::EnvironmentType ShaderProfile) { +static MDTuple *getEntryPropAsMetadata(Module &M, const EntryProperties &EP, + uint64_t EntryShaderFlags, + const ModuleMetadataInfo &MMDI) { SmallVector<Metadata *> MDVals; LLVMContext &Ctx = EP.Entry->getContext(); if (EntryShaderFlags != 0) @@ -195,12 +197,13 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, // FIXME: support more props. // See https://github.com/llvm/llvm-project/issues/57948. // Add shader kind for lib entries. - if (ShaderProfile == Triple::EnvironmentType::Library && + if (MMDI.ShaderProfile == Triple::EnvironmentType::Library && EP.ShaderStage != Triple::EnvironmentType::Library) MDVals.append(getTagValueAsMetadata(EntryPropsTag::ShaderKind, getShaderStage(EP.ShaderStage), Ctx)); if (EP.ShaderStage == Triple::EnvironmentType::Compute) { + // Handle mandatory "hlsl.numthreads" MDVals.emplace_back(ConstantAsMetadata::get(ConstantInt::get( Type::getInt32Ty(Ctx), static_cast<int>(EntryPropsTag::NumThreads)))); Metadata *NumThreadVals[] = {ConstantAsMetadata::get(ConstantInt::get( @@ -210,8 +213,48 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, ConstantAsMetadata::get(ConstantInt::get( Type::getInt32Ty(Ctx), EP.NumThreadsZ))}; MDVals.emplace_back(MDNode::get(Ctx, NumThreadVals)); + + // Handle optional "hlsl.wavesize". The fields are only represented + // if they are non-zero.
+ if (EP.WaveSizeMin != 0) { + bool IsWaveRange = VersionTuple(6, 8) <= MMDI.ShaderModelVersion; + bool IsWaveSize = + !IsWaveRange && VersionTuple(6, 6) <= MMDI.ShaderModelVersion; + + if (!IsWaveRange && !IsWaveSize) { + reportError(M, "Shader model 6.6 or greater is required to specify " + "the \"hlsl.wavesize\" function attribute"); + return nullptr; + } + + // A range is being specified if EP.WaveSizeMax != 0 + if (EP.WaveSizeMax && !IsWaveRange) { + reportError( + M, "Shader model 6.8 or greater is required to specify " + "wave size range values of the \"hlsl.wavesize\" function " + "attribute"); + return nullptr; + } + + EntryPropsTag Tag = + IsWaveSize ? EntryPropsTag::WaveSize : EntryPropsTag::WaveRange; + MDVals.emplace_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), static_cast<int>(Tag)))); + + SmallVector<Metadata *> WaveSizeVals = {ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMin))}; + if (IsWaveRange) { + WaveSizeVals.push_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMax))); + WaveSizeVals.push_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizePref))); + } + + MDVals.emplace_back(MDNode::get(Ctx, WaveSizeVals)); + } } } + if (MDVals.empty()) return nullptr; return MDNode::get(Ctx, MDVals); @@ -236,12 +279,11 @@ static MDTuple *constructEntryMetadata(const Function *EntryFn, return MDNode::get(Ctx, MDVals); } -static MDTuple *emitEntryMD(const EntryProperties &EP, MDTuple *Signatures, - MDNode *MDResources, +static MDTuple *emitEntryMD(Module &M, const EntryProperties &EP, + MDTuple *Signatures, MDNode *MDResources, const uint64_t EntryShaderFlags, - const Triple::EnvironmentType ShaderProfile) { - MDTuple *Properties = - getEntryPropAsMetadata(EP, EntryShaderFlags, ShaderProfile); + const ModuleMetadataInfo &MMDI) { + MDTuple *Properties = getEntryPropAsMetadata(M, EP, EntryShaderFlags, MMDI); return constructEntryMetadata(EP.Entry, Signatures, MDResources, Properties, EP.Entry->getContext()); } @@ -523,10 +565,8 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + "'")); } - - EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD, - EntryShaderFlags, - MMDI.ShaderProfile)); + EntryFnMDNodes.emplace_back(emitEntryMD( + M, EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI)); } NamedMDNode *EntryPointsNamedMD = diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 60dfd96..6cacbf6 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const { switch (ID) { case Intrinsic::dx_asdouble: - case Intrinsic::dx_isinf: - case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: - case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: + case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: return OpdIdx == 0; default: return OpdIdx == -1; @@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_frac: case Intrinsic::dx_isinf: case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: 
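The wavesize plumbing in the DXIL hunks above follows one convention end to end: the frontend stores "Min,Max,Pref" as a comma-separated "hlsl.wavesize" string attribute, a single value is emitted under the WaveSize tag (SM 6.6+), and a range under the WaveRange tag (SM 6.8+), with zero meaning "not specified". A hedged sketch of the parsing side in plain C++ (the in-tree code uses StringRef::split and llvm::to_integer; the names below are illustrative):

```cpp
#include <cstdint>
#include <sstream>
#include <string>

struct WaveSize {
  uint32_t Min = 0, Max = 0, Pref = 0; // zero means "not specified"
};

// Parse the "Min,Max,Pref" payload carried by the "hlsl.wavesize" attribute.
WaveSize parseWaveSize(const std::string &Attr) {
  WaveSize WS;
  uint32_t *Fields[] = {&WS.Min, &WS.Max, &WS.Pref};
  std::istringstream SS(Attr);
  std::string Tok;
  for (int I = 0; I < 3 && std::getline(SS, Tok, ','); ++I)
    *Fields[I] = static_cast<uint32_t>(std::stoul(Tok));
  return WS;
}
```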
diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp index 479ac90..f29a739 100644 --- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp @@ -104,13 +104,6 @@ const std::map<unsigned short, unsigned short> QFPInstMap{ {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}}; } // namespace -namespace llvm { - -FunctionPass *createHexagonQFPOptimizer(); -void initializeHexagonQFPOptimizerPass(PassRegistry &); - -} // namespace llvm - namespace { struct HexagonQFPOptimizer : public MachineFunctionPass { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index fe700e1..cf4ffc82 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6630,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index c89212d..90a4723 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -756,6 +756,155 @@ LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { return ArrayRef(TargetFlags); } +bool LoongArchInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, + Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const { + enum MemIOffsetType { + Imm14Shift2, + Imm12, + Imm11Shift1, + Imm10Shift2, + Imm9Shift3, + Imm8, + Imm8Shift1, + Imm8Shift2, + Imm8Shift3 + }; + + MemIOffsetType OT; + switch (MemI.getOpcode()) { + default: + return false; + case LoongArch::LDPTR_W: + case LoongArch::LDPTR_D: + case LoongArch::STPTR_W: + case LoongArch::STPTR_D: + OT = Imm14Shift2; + break; + case LoongArch::LD_B: + case LoongArch::LD_H: + case LoongArch::LD_W: + case LoongArch::LD_D: + case LoongArch::LD_BU: + case LoongArch::LD_HU: + case LoongArch::LD_WU: + case LoongArch::ST_B: + case LoongArch::ST_H: + case LoongArch::ST_W: + case LoongArch::ST_D: + case LoongArch::FLD_S: + case LoongArch::FLD_D: + case LoongArch::FST_S: + case LoongArch::FST_D: + case LoongArch::VLD: + case LoongArch::VST: + case LoongArch::XVLD: + case LoongArch::XVST: + case LoongArch::VLDREPL_B: + case LoongArch::XVLDREPL_B: + OT = Imm12; + break; + case LoongArch::VLDREPL_H: + case LoongArch::XVLDREPL_H: + OT = Imm11Shift1; + break; + case LoongArch::VLDREPL_W: + case LoongArch::XVLDREPL_W: + OT = Imm10Shift2; + break; + case LoongArch::VLDREPL_D: + case LoongArch::XVLDREPL_D: + OT = Imm9Shift3; + break; + case LoongArch::VSTELM_B: + case LoongArch::XVSTELM_B: + OT = Imm8; + break; + case LoongArch::VSTELM_H: + case LoongArch::XVSTELM_H: + OT = Imm8Shift1; + break; + case LoongArch::VSTELM_W: + case LoongArch::XVSTELM_W: + OT = Imm8Shift2; + break; + case LoongArch::VSTELM_D: + case LoongArch::XVSTELM_D: + OT = Imm8Shift3; + break; + } + + if (MemI.getOperand(0).getReg() == Reg) + return false; + + if ((AddrI.getOpcode() != LoongArch::ADDI_W && + AddrI.getOpcode() != LoongArch::ADDI_D) || + !AddrI.getOperand(1).isReg() || 
!AddrI.getOperand(2).isImm()) + return false; + + int64_t OldOffset = MemI.getOperand(2).getImm(); + int64_t Disp = AddrI.getOperand(2).getImm(); + int64_t NewOffset = OldOffset + Disp; + if (!STI.is64Bit()) + NewOffset = SignExtend64<32>(NewOffset); + + if (!(OT == Imm14Shift2 && isShiftedInt<14, 2>(NewOffset) && STI.hasUAL()) && + !(OT == Imm12 && isInt<12>(NewOffset)) && + !(OT == Imm11Shift1 && isShiftedInt<11, 1>(NewOffset)) && + !(OT == Imm10Shift2 && isShiftedInt<10, 2>(NewOffset)) && + !(OT == Imm9Shift3 && isShiftedInt<9, 3>(NewOffset)) && + !(OT == Imm8 && isInt<8>(NewOffset)) && + !(OT == Imm8Shift1 && isShiftedInt<8, 1>(NewOffset)) && + !(OT == Imm8Shift2 && isShiftedInt<8, 2>(NewOffset)) && + !(OT == Imm8Shift3 && isShiftedInt<8, 3>(NewOffset))) + return false; + + AM.BaseReg = AddrI.getOperand(1).getReg(); + AM.ScaledReg = 0; + AM.Scale = 0; + AM.Displacement = NewOffset; + AM.Form = ExtAddrMode::Formula::Basic; + return true; +} + +MachineInstr * +LoongArchInstrInfo::emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const { + const DebugLoc &DL = MemI.getDebugLoc(); + MachineBasicBlock &MBB = *MemI.getParent(); + + assert(AM.ScaledReg == 0 && AM.Scale == 0 && + "Addressing mode not supported for folding"); + + unsigned MemIOp = MemI.getOpcode(); + switch (MemIOp) { + default: + return BuildMI(MBB, MemI, DL, get(MemIOp)) + .addReg(MemI.getOperand(0).getReg(), + MemI.mayLoad() ? RegState::Define : 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + case LoongArch::VSTELM_B: + case LoongArch::VSTELM_H: + case LoongArch::VSTELM_W: + case LoongArch::VSTELM_D: + case LoongArch::XVSTELM_B: + case LoongArch::XVSTELM_H: + case LoongArch::XVSTELM_W: + case LoongArch::XVSTELM_D: + return BuildMI(MBB, MemI, DL, get(MemIOp)) + .addReg(MemI.getOperand(0).getReg(), 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement) + .addImm(MemI.getOperand(3).getImm()) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + } +} + // Returns true if this is the sext.w pattern, addi.w rd, rs, 0. 
bool LoongArch::isSEXT_W(const MachineInstr &MI) { return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() && diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index f25958a..f69a558 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -93,6 +93,12 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableBitmaskMachineOperandTargetFlags() const override; + bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const override; + MachineInstr *emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const override; + protected: const LoongArchSubtarget &STI; }; diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index b502b056..00d5287 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2113,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>; defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>; defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// 
Intrinsic pattern diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 9de4c9d..92a9388 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -62,6 +62,11 @@ static cl::opt<bool> cl::desc("Enable the merge base offset pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableSinkFold("loongarch-enable-sink-fold", + cl::desc("Enable sinking and folding of instruction copies"), + cl::init(true), cl::Hidden); + static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { return RM.value_or(Reloc::Static); } @@ -146,7 +151,9 @@ namespace { class LoongArchPassConfig : public TargetPassConfig { public: LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + setEnableSinkAndFold(EnableSinkFold); + } LoongArchTargetMachine &getLoongArchTargetMachine() const { return getTM<LoongArchTargetMachine>(); diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index df0c8c1..06210b6 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b260221..f0bdf47 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2267,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>; def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 0c2e44e..dfbbba0 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -420,6 +420,9 @@ let Predicates = [HasVSX, IsISAFuture] in { : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), "vucmprlh $VRT, $VRA, $VRB", []>; + def XVRLW: XX3Form_XTAB6<60, 184, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvrlw $XT, $XA, $XB", []>; + // AES Acceleration Instructions def XXAESENCP : XX3Form_XTABp5_M2<194, (outs vsrprc:$XTp), (ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M), diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index b38dd4a..fc3cde3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, RegConstraint<"@earlyclobber $AT">; def PM#NAME#WPP : MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", 
$XMSK, $YMSK, $PMSK"), @@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 0ff178e..e9088a4 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -58,6 +58,7 @@ add_llvm_target(RISCVCodeGen RISCVMoveMerger.cpp RISCVOptWInstrs.cpp RISCVPostRAExpandPseudoInsts.cpp + RISCVPromoteConstant.cpp RISCVPushPopOptimizer.cpp RISCVRedundantCopyElimination.cpp RISCVRegisterInfo.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index ae94101..51e8e85 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -20,6 +20,7 @@ namespace llvm { class FunctionPass; class InstructionSelector; +class ModulePass; class PassRegistry; class RISCVRegisterBankInfo; class RISCVSubtarget; @@ -111,6 +112,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &); FunctionPass *createRISCVPreLegalizerCombiner(); void initializeRISCVPreLegalizerCombinerPass(PassRegistry &); +ModulePass *createRISCVPromoteConstantPass(); +void initializeRISCVPromoteConstantPass(PassRegistry &); + FunctionPass *createRISCVVLOptimizerPass(); void initializeRISCVVLOptimizerPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 526675a..b0453fc 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -131,6 +131,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCMAXU: case RISCV::PseudoCCMIN: case RISCV::PseudoCCMINU: + case RISCV::PseudoCCMUL: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -237,6 +238,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cfee6ab..5b72334 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1856,6 +1856,11 @@ def TuneShortForwardBranchIMinMax "true", "Enable short forward branch optimization for min,max instructions in Zbb", [TuneShortForwardBranchOpt]>; +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. 
def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e0cf739..995ae75 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9186,7 +9186,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, unsigned ShAmount = Log2_64(TrueM1); if (Subtarget.hasShlAdd(ShAmount)) return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, - DAG.getConstant(ShAmount, DL, VT), CondV); + DAG.getTargetConstant(ShAmount, DL, VT), CondV); } } // (select c, y, 0) -> -c & y @@ -15463,7 +15463,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, - DAG.getConstant(Diff, DL, VT), NS); + DAG.getTargetConstant(Diff, DL, VT), NS); return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT)); } @@ -15501,7 +15501,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, int64_t AddConst = AddVal.getSExtValue(); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), - DAG.getConstant(ShlConst, DL, VT), Other); + DAG.getTargetConstant(ShlConst, DL, VT), Other); return DAG.getNode(ISD::ADD, DL, VT, SHADD, DAG.getSignedConstant(AddConst, DL, VT)); } @@ -16495,6 +16495,45 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY, bool AddX) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getTargetConstant(ShX, DL, VT), AddX ? X : Mul359); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt) { + // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X)) + switch (MulAmt) { + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false); + default: + break; + } + + // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X) + int ShX; + if (int ShY = isShifted359(MulAmt - 1, ShX)) { + assert(ShX != 0 && "MulAmt=4,6,10 handled before"); + if (ShX <= 3) + return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true); + } + return SDValue(); +} + // Try to expand a scalar multiply to a faster sequence. static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,18 +16563,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. 
- SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { + SDValue X = N->getOperand(0); int Shift; if (int ShXAmount = isShifted359(MulAmt, Shift)) { // 3/5/9 * 2^N -> shl (shXadd X, X), N SDLoc DL(N); - SDValue X = N->getOperand(0); // Put the shift first if we can fold a zext into the shift forming // a slli.uw. if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -16543,80 +16581,40 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(ShXAmount, DL, VT), Shl); + DAG.getTargetConstant(ShXAmount, DL, VT), Shl); } // Otherwise, put the shl second so that it can fold with following // instructions (e.g. sext or add). SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(ISD::SHL, DL, VT, Mul359, DAG.getConstant(Shift, DL, VT)); } - // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. + // (2/4/8 * 3/5/9 + 1) * 2^N + Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { + if (Shift == 0) + return V; SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); + return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); } // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. First check if this a sum of two power of 2s because that's // easy. Then count how many zeros are up to the first bit. - if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { - unsigned ScaleShift = llvm::countr_zero(MulAmt); - if (ScaleShift >= 1 && ScaleShift < 4) { - unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); - SDLoc DL(N); - SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); - } + if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) { + unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1))); + SDLoc DL(N); + SDValue Shift1 = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(Shift, DL, VT), Shift1); } - // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x) - // This is the two instruction form, there are also three instruction - // variants we could implement. e.g. 
- // (2^(1,2,3) * 3,5,9 + 1) << C2 - // 2^(C1>3) * 3,5,9 +/- 1 - if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) { - assert(Shift != 0 && "MulAmt=4,6,10 handled before"); - if (Shift <= 3) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT), X); - } - } + // TODO: 2^(C1>3) * 3,5,9 +/- 1 // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { @@ -16626,9 +16624,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ScaleShift, DL, VT), X)); } } @@ -16643,29 +16642,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X); return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } - - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } - } } if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt)) @@ -25320,3 +25300,12 @@ ArrayRef<MCPhysReg> RISCVTargetLowering::getRoundingControlRegisters() const { } return {}; } + +bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; + + return VT.getSizeInBits() <= Subtarget.getXLen(); +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 9e3e2a9..dd62a9c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -465,6 +465,8 @@ public: ArrayRef<MCPhysReg> getRoundingControlRegisters() const override; + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; + /// Match a mask which "spreads" the leading elements of a vector evenly /// across the result. Factor is the spread amount, and Index is the /// offset applied. 
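To make the expandMul rewrite above concrete: the new expandMulToShlAddShlAdd path decomposes a constant multiplier that factors as 3/5/9 * 3/5/9, optionally times a power of two, into two shNadd (shift-and-add) operations plus a final shift, and the AddX form covers 2/4/8 * 3/5/9 + 1 by using the original operand as the final addend. The following standalone C++ sketch models only the arithmetic; it is not part of the patch, and the helper names are made up for illustration.

#include <cassert>
#include <cstdint>

// Models RISCVISD::SHL_ADD / the Zba shNadd idiom: (X << Sh) + Addend.
static uint64_t shlAdd(uint64_t X, unsigned Sh, uint64_t Addend) {
  return (X << Sh) + Addend;
}

// Computes X * ((2^ShY + 1) * (2^ShX + 1) * 2^Shift), i.e. a product of two
// factors from {3, 5, 9} times a power of two, using two shNadd ops and one
// shift -- the shape produced by expandMulToShlAddShlAdd.
static uint64_t mulByShlAddShlAdd(uint64_t X, unsigned ShX, unsigned ShY,
                                  unsigned Shift) {
  uint64_t Mul359 = shlAdd(X, ShY, X);         // X * (2^ShY + 1)
  uint64_t Prod = shlAdd(Mul359, ShX, Mul359); // ... * (2^ShX + 1)
  return Prod << Shift;                        // ... * 2^Shift
}

int main() {
  // MulAmt = 45 = 9 * 5: ShY = 2 gives the x5 factor, ShX = 3 gives the x9.
  assert(mulByShlAddShlAdd(7, /*ShX=*/3, /*ShY=*/2, /*Shift=*/0) == 7 * 45);
  // MulAmt = 100 = 5 * 5 * 4: ShX = ShY = 2, then shift left by 2.
  assert(mulByShlAddShlAdd(3, /*ShX=*/2, /*ShY=*/2, /*Shift=*/2) == 3 * 100);
  return 0;
}

In the patch itself the same shapes are built as SHL_ADD nodes whose shift amounts are now emitted with getTargetConstant, so instruction selection can match sh1add/sh2add/sh3add directly.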
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 636e31c..bf9de0a 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1583,7 +1583,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { if (!TII->isAddImmediate(*DeadMI, Reg)) continue; LIS->RemoveMachineInstrFromMaps(*DeadMI); + Register AddReg = DeadMI->getOperand(1).getReg(); DeadMI->eraseFromParent(); + if (AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } } @@ -1869,11 +1872,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { // Loop over the dead AVL values, and delete them now. This has // to be outside the above loop to avoid invalidating iterators. for (auto *MI : ToDelete) { + assert(MI->getOpcode() == RISCV::ADDI); + Register AddReg = MI->getOperand(1).getReg(); if (LIS) { LIS->removeInterval(MI->getOperand(0).getReg()); LIS->RemoveMachineInstrFromMaps(*MI); } MI->eraseFromParent(); + if (LIS && AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index c9df787..b8ab70b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1703,6 +1703,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::MAXU: return RISCV::PseudoCCMAXU; case RISCV::MIN: return RISCV::PseudoCCMIN; case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1754,6 +1755,9 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) return nullptr; + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. 
if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 5a67a5a..494b1c9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -110,6 +110,7 @@ def PseudoCCMAX : SFBALU_rr; def PseudoCCMIN : SFBALU_rr; def PseudoCCMAXU : SFBALU_rr; def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index b37ceaae..c2b25c6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -60,6 +60,8 @@ def immfour : RISCVOp { let DecoderMethod = "decodeImmFourOperand"; } +def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction, let Predicates = [HasVendorXTHeadBa] in { def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)), (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; -def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)), - (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; +def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)), + (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>; // Reuse complex patterns from StdExtZba def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 4537bfe..8376da5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -53,6 +53,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, let OperandType = "OPERAND_UIMM5_GT3"; } +def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>; + def UImm5Plus1AsmOperand : AsmOperandClass { let Name = "UImm5Plus1"; let RenderMethod = "addImmOperands"; @@ -1419,8 +1421,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; -def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)), - (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)), + (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] /// Simple arithmetic operations diff --git a/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp new file mode 100644 index 0000000..bf1f69f --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp @@ -0,0 +1,213 @@ +//==- RISCVPromoteConstant.cpp - Promote constant fp to global for RISC-V --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVSubtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-promote-const" +#define RISCV_PROMOTE_CONSTANT_NAME "RISC-V Promote Constants" + +STATISTIC(NumPromoted, "Number of constant literals promoted to globals"); +STATISTIC(NumPromotedUses, "Number of uses of promoted literal constants"); + +namespace { + +class RISCVPromoteConstant : public ModulePass { +public: + static char ID; + RISCVPromoteConstant() : ModulePass(ID) {} + + StringRef getPassName() const override { return RISCV_PROMOTE_CONSTANT_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + } + + /// Iterate over the functions and promote the double fp constants that + /// would otherwise go into the constant pool to a constant array. + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + // TargetMachine and Subtarget are needed to query isFPImmlegal. + const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + bool Changed = false; + for (Function &F : M) { + const RISCVSubtarget &ST = TM.getSubtarget<RISCVSubtarget>(F); + const RISCVTargetLowering *TLI = ST.getTargetLowering(); + Changed |= runOnFunction(F, TLI); + } + return Changed; + } + +private: + bool runOnFunction(Function &F, const RISCVTargetLowering *TLI); +}; +} // end anonymous namespace + +char RISCVPromoteConstant::ID = 0; + +INITIALIZE_PASS(RISCVPromoteConstant, DEBUG_TYPE, RISCV_PROMOTE_CONSTANT_NAME, + false, false) + +ModulePass *llvm::createRISCVPromoteConstantPass() { + return new RISCVPromoteConstant(); +} + +bool RISCVPromoteConstant::runOnFunction(Function &F, + const RISCVTargetLowering *TLI) { + if (F.hasOptNone() || F.hasOptSize()) + return false; + + // Bail out and make no transformation if the target doesn't support + // doubles, or if we're not targeting RV64 as we currently see some + // regressions for those targets. + if (!TLI->isTypeLegal(MVT::f64) || !TLI->isTypeLegal(MVT::i64)) + return false; + + // Collect all unique double constants and their uses in the function. Use + // MapVector to preserve insertion order. + MapVector<ConstantFP *, SmallVector<Use *, 8>> ConstUsesMap; + + for (Instruction &I : instructions(F)) { + for (Use &U : I.operands()) { + auto *C = dyn_cast<ConstantFP>(U.get()); + if (!C || !C->getType()->isDoubleTy()) + continue; + // Do not promote if it wouldn't be loaded from the constant pool. 
+ if (TLI->isFPImmLegal(C->getValueAPF(), MVT::f64, + /*ForCodeSize=*/false)) + continue; + // Do not promote a constant if it is used as an immediate argument + // for an intrinsic. + if (auto *II = dyn_cast<IntrinsicInst>(U.getUser())) { + Function *IntrinsicFunc = II->getFunction(); + unsigned OperandIdx = U.getOperandNo(); + if (IntrinsicFunc && IntrinsicFunc->getAttributes().hasParamAttr( + OperandIdx, Attribute::ImmArg)) { + LLVM_DEBUG(dbgs() << "Skipping promotion of constant in: " << *II + << " because operand " << OperandIdx + << " must be an immediate.\n"); + continue; + } + } + // Note: FP args to inline asm would be problematic if we had a + // constraint that required an immediate floating point operand. At the + // time of writing LLVM doesn't recognise such a constraint. + ConstUsesMap[C].push_back(&U); + } + } + + int PromotableConstants = ConstUsesMap.size(); + LLVM_DEBUG(dbgs() << "Found " << PromotableConstants + << " promotable constants in " << F.getName() << "\n"); + // Bail out if no promotable constants found, or if only one is found. + if (PromotableConstants < 2) { + LLVM_DEBUG(dbgs() << "Performing no promotions as insufficient promotable " + "constants found\n"); + return false; + } + + NumPromoted += PromotableConstants; + + // Create a global array containing the promoted constants. + Module *M = F.getParent(); + Type *DoubleTy = Type::getDoubleTy(M->getContext()); + + SmallVector<Constant *, 16> ConstantVector; + for (auto const &Pair : ConstUsesMap) + ConstantVector.push_back(Pair.first); + + ArrayType *ArrayTy = ArrayType::get(DoubleTy, ConstantVector.size()); + Constant *GlobalArrayInitializer = + ConstantArray::get(ArrayTy, ConstantVector); + + auto *GlobalArray = new GlobalVariable( + *M, ArrayTy, + /*isConstant=*/true, GlobalValue::InternalLinkage, GlobalArrayInitializer, + ".promoted_doubles." + F.getName()); + + // A cache to hold the loaded value for a given constant within a basic block. + DenseMap<std::pair<ConstantFP *, BasicBlock *>, Value *> LocalLoads; + + // Replace all uses with the loaded value. + unsigned Idx = 0; + for (auto const &Pair : ConstUsesMap) { + ConstantFP *Const = Pair.first; + const SmallVector<Use *, 8> &Uses = Pair.second; + + for (Use *U : Uses) { + Instruction *UserInst = cast<Instruction>(U->getUser()); + BasicBlock *InsertionBB; + + // If the user is a PHI node, we must insert the load in the + // corresponding predecessor basic block. Otherwise, it's inserted into + // the same block as the use. + if (auto *PN = dyn_cast<PHINode>(UserInst)) + InsertionBB = PN->getIncomingBlock(*U); + else + InsertionBB = UserInst->getParent(); + + if (isa<CatchSwitchInst>(InsertionBB->getTerminator())) { + LLVM_DEBUG(dbgs() << "Bailing out: catchswitch means there is no valid " + "insertion point.\n"); + return false; + } + + auto CacheKey = std::make_pair(Const, InsertionBB); + Value *LoadedVal = nullptr; + + // Re-use a load if it exists in the insertion block. + if (LocalLoads.count(CacheKey)) { + LoadedVal = LocalLoads.at(CacheKey); + } else { + // Otherwise, create a new GEP and Load at the correct insertion point. + // It is always safe to insert in the first insertion point in the BB, + // so do that and let other passes reorder. 
+ IRBuilder<> Builder(InsertionBB, InsertionBB->getFirstInsertionPt()); + Value *ElementPtr = Builder.CreateConstInBoundsGEP2_64( + GlobalArray->getValueType(), GlobalArray, 0, Idx, "double.addr"); + LoadedVal = Builder.CreateLoad(DoubleTy, ElementPtr, "double.val"); + + // Cache the newly created load for this block. + LocalLoads[CacheKey] = LoadedVal; + } + + U->set(LoadedVal); + ++NumPromotedUses; + } + ++Idx; + } + + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index ae54ff1..16ef67d 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -139,6 +139,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVExpandAtomicPseudoPass(*PR); initializeRISCVRedundantCopyEliminationPass(*PR); initializeRISCVAsmPrinterPass(*PR); + initializeRISCVPromoteConstantPass(*PR); } static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { @@ -462,6 +463,8 @@ void RISCVPassConfig::addIRPasses() { } bool RISCVPassConfig::addPreISel() { + if (TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createRISCVPromoteConstantPass()); if (TM->getOptLevel() != CodeGenOptLevel::None) { // Add a barrier before instruction selection so that we will not get // deleted block address after enabling default outlining. See D99707 for diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 640b014..0175f2f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::SubgroupSize, 0, 0); + if (MDNode *Node = F.getMetadata("max_work_group_size")) { + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes)) + outputExecutionModeFromMDNode( + FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1); + } if (MDNode *Node = F.getMetadata("vec_type_hint")) { MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 56a38bb..b2cbdb2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call, return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR); } +static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0)); +} + static bool generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, @@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, return generatePipeInst(Call.get(), MIRBuilder, GR); case SPIRV::PredicatedLoadStore: return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR); + case SPIRV::BlockingPipes: + return generateBlockingPipesInst(Call.get(), MIRBuilder, GR); } return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index c259cce..492a98e 100644 --- 
a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup; def Block2DLoadStore : BuiltinGroup; def Pipe : BuiltinGroup; def PredicatedLoadStore : BuiltinGroup; +def BlockingPipes : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled builtin record. The information in the record @@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0 defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; + +//SPV_ALTERA_blocking_pipes +defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>; +defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>; defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 96f5dee..f681b0d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -107,6 +107,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_inline_assembly}, {"SPV_INTEL_bindless_images", SPIRV::Extension::Extension::SPV_INTEL_bindless_images}, + {"SPV_INTEL_bfloat16_arithmetic", + SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic}, {"SPV_INTEL_bfloat16_conversion", SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion}, {"SPV_KHR_subgroup_rotate", @@ -155,7 +157,11 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_INTEL_predicated_io", SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, {"SPV_KHR_maximal_reconvergence", - SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, + {"SPV_INTEL_kernel_attributes", + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}, + {"SPV_ALTERA_blocking_pipes", + SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index a61351e..03bd61b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr, "$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">; def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops), "OpPredicatedStoreINTEL $ptr $object $predicate">; + +//SPV_ALTERA_blocking_pipes +def OpReadPipeBlockingALTERA :Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), + "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; +def OpWritePipeBlockingALTERA :Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), + "OpWritePipeBlockingALTERA $pipe 
$pointer $packetSize $packetAlignment">; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3f0424f..245e5a2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3516,6 +3516,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_nonuniformindex: { return selectResourceNonUniformIndex(ResVReg, ResType, I); } + case Intrinsic::spv_unpackhalf2x16: { + return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16); + } + default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index db036a5..af76016 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1435,6 +1435,8 @@ void addInstrRequirements(const MachineInstr &MI, addPrintfRequirements(MI, Reqs, ST); break; } + // TODO: handle bfloat16 extended instructions when + // SPV_INTEL_bfloat16_arithmetic is enabled. break; } case SPIRV::OpAliasDomainDeclINTEL: @@ -1883,6 +1885,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability( SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL); break; + case SPIRV::OpReadPipeBlockingALTERA: + case SPIRV::OpWritePipeBlockingALTERA: + if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) { + Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes); + Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA); + } + break; case SPIRV::OpCooperativeMatrixGetElementCoordINTEL: if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix)) report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the " @@ -2060,7 +2069,64 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL); break; } - + case SPIRV::OpFAddS: + case SPIRV::OpFSubS: + case SPIRV::OpFMulS: + case SPIRV::OpFDivS: + case SPIRV::OpFRemS: + case SPIRV::OpFMod: + case SPIRV::OpFNegate: + case SPIRV::OpFAddV: + case SPIRV::OpFSubV: + case SPIRV::OpFMulV: + case SPIRV::OpFDivV: + case SPIRV::OpFRemV: + case SPIRV::OpFNegateV: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Arithmetic instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg()); + SPIRVType *TypeDef = 
MRI.getVRegDef(OperandDef->getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Relational instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } default: break; } @@ -2180,6 +2246,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::SubgroupSize, ST); + if (F.getMetadata("max_work_group_size")) + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST); if (F.getMetadata("vec_type_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 4e4e6fb..be88f33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -56,6 +56,13 @@ public: } }; +static cl::list<std::string> SPVAllowUnknownIntrinsics( + "spv-allow-unknown-intrinsics", cl::CommaSeparated, + cl::desc("Emit unknown intrinsics as calls to external functions. A " + "comma-separated input list of intrinsic prefixes must be " + "provided, and only intrinsics carrying a listed prefix get " + "emitted as described."), + cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional); } // namespace char SPIRVPrepareFunctions::ID = 0; @@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { EraseFromParent); Changed = true; break; + default: + if (TM.getTargetTriple().getVendor() == Triple::AMD || + any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) { + if (Prefix.empty()) + return false; + return II->getCalledFunction()->getName().starts_with(Prefix); + })) + Changed |= lowerIntrinsicToFunction(II); + break; } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index ba09692..ad6c9cd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, SPIRVVersion = VersionTuple(1, 3); break; case Triple::SPIRVSubArch_v14: - default: SPIRVVersion = VersionTuple(1, 4); break; case Triple::SPIRVSubArch_v15: @@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, case Triple::SPIRVSubArch_v16: SPIRVVersion = VersionTuple(1, 6); break; + default: + if (TT.getVendor() == Triple::AMD) + SPIRVVersion = VersionTuple(1, 6); + else + SPIRVVersion = VersionTuple(1, 4); } OpenCLVersion = VersionTuple(2, 2); // Set the environment based on the target triple. if (TargetTriple.getOS() == Triple::Vulkan) Env = Shader; - else if (TargetTriple.getEnvironment() == Triple::OpenCL) + else if (TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getVendor() == Triple::AMD) Env = Kernel; else Env = Unknown; @@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, // Set the default extensions based on the target triple. 
if (TargetTriple.getVendor() == Triple::Intel) Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers); + if (TargetTriple.getVendor() == Triple::AMD) + Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple); // The order of initialization is important. initAvailableExtensions(Extensions); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 7d08b29..65a8885 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>; defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>; defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>; -defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; +defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>; defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>; defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>; @@ -387,6 +387,8 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; +defm SPV_INTEL_bfloat16_arithmetic + : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -570,6 +572,7 @@ defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atom defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; +defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; @@ -587,6 +590,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0, defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>; defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>; defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>; +defm KernelAttributesINTEL : CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// TODO-SPIRV: add these once they are used / tested. 
+// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// END TODO-SPIRV defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>; defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>; defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; @@ -603,6 +611,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>; defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>; defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>; +defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time @@ -805,6 +814,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; +defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>; +// TODO-SPIRV: Add the following once they are used / tested. +// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>; +// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>; +// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>; +// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>; +// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>; +// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>; +// END TODO-SPIRV defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; @@ -1919,7 +1937,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; -// Arithmetic +// Arithmetic defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; defm Not : SpecConstantOpOperandsOperand<200, [], []>; defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 5ba0356..2951a4b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -244,7 +244,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI( cl::Optional, cl::init(false)); void SPIRVPassConfig::addPreEmitPass() { - if (SPVEnableNonSemanticDI) { + if (SPVEnableNonSemanticDI || + getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) { addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>())); } } diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 
7137e5f..38b0508 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -95,6 +95,9 @@ def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true", "rd %pc, %XX is slow", [FeatureV9]>; +def TuneNoPredictor : SubtargetFeature<"no-predictor", "HasNoPredictor", "true", + "Processor has no branch predictor, branches stall execution", []>; + //==== Features added predmoninantly for LEON subtarget support include "LeonFeatures.td" @@ -174,12 +177,15 @@ def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS, FeatureVIS2], [TuneSlowRDPC]>; def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS, - FeatureVIS2, FeatureUA2005]>; + FeatureVIS2, FeatureUA2005], + [TuneNoPredictor]>; def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc, - FeatureVIS, FeatureVIS2, FeatureUA2005]>; + FeatureVIS, FeatureVIS2, FeatureUA2005], + [TuneNoPredictor]>; def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2, FeatureVIS3, - FeatureUA2005, FeatureUA2007]>; + FeatureUA2005, FeatureUA2007], + [TuneNoPredictor]>; def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2, FeatureVIS3, FeatureUA2005, FeatureUA2007, FeatureOSA2011, diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index cbb7db6..ae3c326 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2000,6 +2000,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + // Some processors have no branch predictor and have pipelines longer than + // what can be covered by the delay slot. This results in a stall, so mark + // branches to be expensive on those processors. + setJumpIsExpensive(Subtarget->hasNoPredictor()); + // The high cost of branching means that using conditional moves will + // still be profitable even if the condition is predictable. 
+ PredictableSelectIsExpensive = !isJumpExpensive(); + setMinFunctionAlignment(Align(4)); computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/Target.cpp b/llvm/lib/Target/Target.cpp index ec673ef..7387571 100644 --- a/llvm/lib/Target/Target.cpp +++ b/llvm/lib/Target/Target.cpp @@ -37,6 +37,7 @@ inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) { void llvm::initializeTarget(PassRegistry &Registry) { initializeTargetLibraryInfoWrapperPassPass(Registry); + initializeRuntimeLibraryInfoWrapperPass(Registry); initializeTargetTransformInfoWrapperPassPass(Registry); } diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt index 1e83cbe..17df119 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel) tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM WebAssemblyGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2666342..66ed8b0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 37a3457..9fef3e6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -24,6 +24,7 @@ #include "WebAssembly.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -114,6 +115,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper->setAttributes(F->getAttributes()); BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); const DataLayout &DL = BB->getDataLayout(); + IRBuilder<> Builder(BB); // Determine what arguments to pass. 
SmallVector<Value *, 4> Args; @@ -140,10 +142,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Args.push_back(&*AI); } else { if (CastInst::isBitOrNoopPointerCastable(ArgType, ParamType, DL)) { - Instruction *PtrCast = - CastInst::CreateBitOrPointerCast(AI, ParamType, "cast"); - PtrCast->insertInto(BB, BB->end()); - Args.push_back(PtrCast); + Args.push_back(Builder.CreateBitOrPointerCast(AI, ParamType, "cast")); } else if (ArgType->isStructTy() || ParamType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: " << F->getName() << "\n"); @@ -166,24 +165,19 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { for (; AI != AE; ++AI) Args.push_back(&*AI); - CallInst *Call = CallInst::Create(F, Args, "", BB); + CallInst *Call = Builder.CreateCall(F, Args); - Type *ExpectedRtnType = F->getFunctionType()->getReturnType(); - Type *RtnType = Ty->getReturnType(); // Determine what value to return. if (RtnType->isVoidTy()) { - ReturnInst::Create(M->getContext(), BB); + Builder.CreateRetVoid(); } else if (ExpectedRtnType->isVoidTy()) { LLVM_DEBUG(dbgs() << "Creating dummy return: " << *RtnType << "\n"); - ReturnInst::Create(M->getContext(), PoisonValue::get(RtnType), BB); + Builder.CreateRet(PoisonValue::get(RtnType)); } else if (RtnType == ExpectedRtnType) { - ReturnInst::Create(M->getContext(), Call, BB); + Builder.CreateRet(Call); } else if (CastInst::isBitOrNoopPointerCastable(ExpectedRtnType, RtnType, DL)) { - Instruction *Cast = - CastInst::CreateBitOrPointerCast(Call, RtnType, "cast"); - Cast->insertInto(BB, BB->end()); - ReturnInst::Create(M->getContext(), Cast, BB); + Builder.CreateRet(Builder.CreateBitOrPointerCast(Call, RtnType, "cast")); } else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: " << F->getName() << "\n"); @@ -203,9 +197,8 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper = Function::Create(Ty, Function::PrivateLinkage, F->getName() + "_bitcast_invalid", M); Wrapper->setAttributes(F->getAttributes()); - BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); - new UnreachableInst(M->getContext(), BB); - Wrapper->setName(F->getName() + "_bitcast_invalid"); + IRBuilder<> Builder(BasicBlock::Create(M->getContext(), "body", Wrapper)); + Builder.CreateUnreachable(); } else if (!WrapperNeeded) { LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName() << "\n"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def deleted file mode 100644 index 23108e4..0000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ /dev/null @@ -1,64 +0,0 @@ -//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file describes the various WebAssembly ISD node types. -/// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! 
- -HANDLE_NODETYPE(CALL) -HANDLE_NODETYPE(RET_CALL) -HANDLE_NODETYPE(RETURN) -HANDLE_NODETYPE(ARGUMENT) -HANDLE_NODETYPE(LOCAL_GET) -HANDLE_NODETYPE(LOCAL_SET) -// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol -HANDLE_NODETYPE(Wrapper) -// A special node for TargetGlobalAddress used in PIC code for -// __memory_base/__table_base relative access. -HANDLE_NODETYPE(WrapperREL) -HANDLE_NODETYPE(BR_IF) -HANDLE_NODETYPE(BR_TABLE) -HANDLE_NODETYPE(DOT) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) -HANDLE_NODETYPE(SHUFFLE) -HANDLE_NODETYPE(SWIZZLE) -HANDLE_NODETYPE(VEC_SHL) -HANDLE_NODETYPE(VEC_SHR_S) -HANDLE_NODETYPE(VEC_SHR_U) -HANDLE_NODETYPE(NARROW_U) -HANDLE_NODETYPE(EXTEND_LOW_S) -HANDLE_NODETYPE(EXTEND_LOW_U) -HANDLE_NODETYPE(EXTEND_HIGH_S) -HANDLE_NODETYPE(EXTEND_HIGH_U) -HANDLE_NODETYPE(CONVERT_LOW_S) -HANDLE_NODETYPE(CONVERT_LOW_U) -HANDLE_NODETYPE(PROMOTE_LOW) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_S) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_U) -HANDLE_NODETYPE(DEMOTE_ZERO) -HANDLE_NODETYPE(I64_ADD128) -HANDLE_NODETYPE(I64_SUB128) -HANDLE_NODETYPE(I64_MUL_WIDE_S) -HANDLE_NODETYPE(I64_MUL_WIDE_U) - -// Memory intrinsics -HANDLE_NODETYPE(GLOBAL_GET) -HANDLE_NODETYPE(GLOBAL_SET) -HANDLE_NODETYPE(TABLE_GET) -HANDLE_NODETYPE(TABLE_SET) - -// Bulk memory instructions. These follow LLVM's expected semantics of -// supporting out-of-bounds pointers if the length is zero, by inserting -// a branch around Wasm's `memory.copy` and `memory.fill`, which would -// otherwise trap. -HANDLE_NODETYPE(MEMCPY) -HANDLE_NODETYPE(MEMSET) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7ec463b..fc6c290 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -216,7 +216,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa // into conversion ops setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, - ISD::FP_ROUND, ISD::CONCAT_VECTORS}); + ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_ROUND, + ISD::CONCAT_VECTORS}); setTargetDAGCombine(ISD::TRUNCATE); @@ -942,20 +943,6 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter( } } -const char * -WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - case WebAssemblyISD::FIRST_NUMBER: - break; -#define HANDLE_NODETYPE(NODE) \ - case WebAssemblyISD::NODE: \ - return "WebAssemblyISD::" #NODE; -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE - } - return nullptr; -} - std::pair<unsigned, const TargetRegisterClass *> WebAssemblyTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { @@ -1830,11 +1817,8 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op, SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32); EVT LocalVT = LN->getValueType(0); - SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT, - {LN->getChain(), Idx}); - SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL); - assert(Result->getNumValues() == 2 && "Loads must carry a chain!"); - return Result; + return DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, {LocalVT, MVT::Other}, + {LN->getChain(), Idx}); } if (WebAssembly::isWasmVarAddressSpace(LN->getAddressSpace())) @@ -3597,6 +3581,64 @@ static SDValue 
performMulCombine(SDNode *N, } } +SDValue DoubleVectorWidth(SDValue In, unsigned RequiredNumElems, + SelectionDAG &DAG) { + SDLoc DL(In); + LLVMContext &Ctx = *DAG.getContext(); + EVT InVT = In.getValueType(); + unsigned NumElems = InVT.getVectorNumElements() * 2; + EVT OutVT = EVT::getVectorVT(Ctx, InVT.getVectorElementType(), NumElems); + SDValue Concat = + DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, In, DAG.getPOISON(InVT)); + if (NumElems < RequiredNumElems) { + return DoubleVectorWidth(Concat, RequiredNumElems, DAG); + } + return Concat; +} + +SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) { + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + EVT OutElTy = OutVT.getVectorElementType(); + if (OutElTy != MVT::i8 && OutElTy != MVT::i16) + return SDValue(); + + unsigned NumElems = OutVT.getVectorNumElements(); + if (!isPowerOf2_32(NumElems)) + return SDValue(); + + EVT FPVT = N->getOperand(0)->getValueType(0); + if (FPVT.getVectorElementType() != MVT::f32) + return SDValue(); + + SDLoc DL(N); + + // First, convert to i32. + LLVMContext &Ctx = *DAG.getContext(); + EVT IntVT = EVT::getVectorVT(Ctx, MVT::i32, NumElems); + SDValue ToInt = DAG.getNode(N->getOpcode(), DL, IntVT, N->getOperand(0)); + APInt Mask = APInt::getLowBitsSet(IntVT.getScalarSizeInBits(), + OutVT.getScalarSizeInBits()); + // Mask out the top MSBs. + SDValue Masked = + DAG.getNode(ISD::AND, DL, IntVT, ToInt, DAG.getConstant(Mask, DL, IntVT)); + + if (OutVT.getSizeInBits() < 128) { + // Create a wide enough vector that we can use narrow. + EVT NarrowedVT = OutElTy == MVT::i8 ? MVT::v16i8 : MVT::v8i16; + unsigned NumRequiredElems = NarrowedVT.getVectorNumElements(); + SDValue WideVector = DoubleVectorWidth(Masked, NumRequiredElems, DAG); + SDValue Trunc = truncateVectorWithNARROW(NarrowedVT, WideVector, DL, DAG); + return DAG.getBitcast( + OutVT, extractSubVector(Trunc, 0, DAG, DL, OutVT.getSizeInBits())); + } else { + return truncateVectorWithNARROW(OutVT, Masked, DL, DAG); + } + return SDValue(); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3623,6 +3665,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: case ISD::CONCAT_VECTORS: return performVectorTruncZeroCombine(N, DCI); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performConvertFPCombine(N, DCI.DAG); case ISD::TRUNCATE: return performTruncateCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 472ec67..f705298 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -19,17 +19,6 @@ namespace llvm { -namespace WebAssemblyISD { - -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, -#define HANDLE_NODETYPE(NODE) NODE, -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE -}; - -} // end namespace WebAssemblyISD - class WebAssemblySubtarget; class WebAssemblyTargetLowering final : public TargetLowering { @@ -53,7 +42,6 @@ private: MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; - const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td 
b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index fc82e5b..304c4f3 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -41,6 +41,11 @@ defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref), "ref.test\t$type, $ref", "ref.test $type", 0xfb14>, Requires<[HasGC]>; +defm REF_FUNC : I<(outs FUNCREF:$res), (ins function32_op:$func), + (outs), (ins function32_op:$func), [], + "ref.func\t$func", "ref.func $func", 0xd2>, + Requires<[HasReferenceTypes]>; + defm "" : REF_I<FUNCREF, funcref, "func">; defm "" : REF_I<EXTERNREF, externref, "extern">; defm "" : REF_I<EXNREF, exnref, "exn">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 45b0e7d..f3c236c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -532,13 +532,19 @@ struct StaticLibcallNameMap { // FIXME: This is broken if there are ever different triples compiled with // different libcalls. RTLIB::RuntimeLibcallsInfo RTCI(TT); - for (RTLIB::Libcall LC : RTLIB::libcalls()) { - StringRef NameLibcall = RTCI.getLibcallName(LC); - if (!NameLibcall.empty() && - getRuntimeLibcallSignatures().Table[LC] != unsupported) { - assert(!Map.contains(NameLibcall) && - "duplicate libcall names in name map"); - Map[NameLibcall] = LC; + + ArrayRef<RuntimeLibcallSignature> Table = + getRuntimeLibcallSignatures().Table; + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (!RTCI.isAvailable(Impl)) + continue; + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + if (Table[LC] != unsupported) { + StringRef NameLibcall = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl); + // FIXME: Map should be to LibcallImpl + if (!Map.insert({NameLibcall, LC}).second) + llvm_unreachable("duplicate libcall names in name map"); } } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index 2673c81..cf5cc41 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -11,23 +11,31 @@ /// //===----------------------------------------------------------------------===// +#include "WebAssemblySelectionDAGInfo.h" #include "WebAssemblyTargetMachine.h" + +#define GET_SDNODE_DESC +#include "WebAssemblyGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" +WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo() + : SelectionDAGGenTargetInfo(WebAssemblyGenSDNodeInfo) {} + WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor -bool WebAssemblySelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { +const char * +WebAssemblySelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - default: - return false; - case WebAssemblyISD::GLOBAL_GET: - case WebAssemblyISD::GLOBAL_SET: - case WebAssemblyISD::TABLE_GET: - case WebAssemblyISD::TABLE_SET: - return true; + case WebAssemblyISD::CALL: + return "WebAssemblyISD::CALL"; + case WebAssemblyISD::RET_CALL: + return "WebAssemblyISD::RET_CALL"; } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy( diff --git 
a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 69c9af0..8775f49 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -17,13 +17,26 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "WebAssemblyGenSDNodeInfo.inc" + namespace llvm { +namespace WebAssemblyISD { + +enum NodeType : unsigned { + CALL = GENERATED_OPCODE_END, + RET_CALL, +}; -class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo { +} // namespace WebAssemblyISD + +class WebAssemblySelectionDAGInfo final : public SelectionDAGGenTargetInfo { public: + WebAssemblySelectionDAGInfo(); + ~WebAssemblySelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a..fa23656 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -158,7 +158,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. -FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. FunctionPass *createX86WinEHUnwindV2Pass(); @@ -179,7 +188,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. 
-FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, @@ -220,7 +240,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index c0c7f5a..ddbd10d 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, const MachineOperand &Src2 = MI.getOperand(2); bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND; const MCInstrDesc &NewDesc = - ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r); + ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r); if (Is32BitReg) Src1 = getX86SubSuperRegister(Src1, 64); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d4418c8..6c16fcfb 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4728,9 +4728,9 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) { if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() && ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) { - SDValue InnerOp = Op->getOperand(0); + SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0)); - if (!getFoldableLogicOp(InnerOp)) + if (!InnerOp) return SDValue(); N0 = InnerOp.getOperand(0); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b97b508..168e041 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. 
+ // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. + MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. + SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { @@ -33004,61 +33061,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). 
For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 2); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)) - .setIsPostTypeLegalization(); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33663,7 +33665,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); @@ -53349,40 +53350,44 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || VT.getSizeInBits() <= (Subtarget.is64Bit() ? 
64 : 32)) return SDValue(); // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; - if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue SrcVal, InsertBit, ShAmt; + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(SrcVal); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53390,6 +53395,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. EVT AmtVT = ShAmt.getValueType(); @@ -53397,6 +53409,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. @@ -53408,18 +53421,39 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDNodeFlags::NoUnsignedWrap); // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. 
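// Illustrative sketch, not part of the patch: the scalar effect of this
// narrowing on a larger-than-legal integer in memory, here an i128 viewed as
// four i32 words (little-endian; bts128 is a hypothetical helper):
//
//   void bts128(uint32_t Words[4], unsigned ShAmt) {
//     Words[(ShAmt & ~31u) / 32] |= 1u << (ShAmt & 31); // a single i32 RMW
//   }
//
// AlignAmt (ShAmt & -32) selects the affected i32 block and ModuloAmt
// (ShAmt & 31) the bit within it; BTR, BTC, and the bit-insert form differ
// only in the logic op applied to the loaded i32.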
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); - return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), - Align(), St->getMemOperand()->getFlags()); + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } + + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, + MachinePointerInfo(St->getPointerInfo().getAddrSpace()), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. + if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -54606,7 +54640,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()), Align(), Ld->getMemOperand()->getFlags()); DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f33939..662aec2 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -627,6 +630,37 @@ bool X86LowerAMXIntrinsics::visit() { } namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + +bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, 
DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + +namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: static char ID; @@ -634,21 +668,15 @@ public: X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index a25e4e0..898c83c 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -16,10 +16,12 @@ #include "X86TargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -30,39 +32,44 @@ using namespace llvm; namespace { -class X86PartialReduction : public FunctionPass { +class X86PartialReduction { + const X86TargetMachine *TM; const DataLayout *DL = nullptr; const X86Subtarget *ST = nullptr; public: + X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {} + bool run(Function &F); + +private: + bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); + bool trySADReplacement(Instruction *Op); +}; + +class X86PartialReductionLegacy : public FunctionPass { +public: static char ID; // Pass identification, replacement for typeid. 
- X86PartialReduction() : FunctionPass(ID) { } + X86PartialReductionLegacy() : FunctionPass(ID) {} - bool runOnFunction(Function &Fn) override; + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - StringRef getPassName() const override { - return "X86 Partial Reduction"; - } - -private: - bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); - bool trySADReplacement(Instruction *Op); + StringRef getPassName() const override { return "X86 Partial Reduction"; } }; } -FunctionPass *llvm::createX86PartialReductionPass() { - return new X86PartialReduction(); +FunctionPass *llvm::createX86PartialReductionLegacyPass() { + return new X86PartialReductionLegacy(); } -char X86PartialReduction::ID = 0; +char X86PartialReductionLegacy::ID = 0; -INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, - "X86 Partial Reduction", false, false) +INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction", + false, false) // This function should be aligned with detectExtMul() in X86ISelLowering.cpp. static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul, @@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { } } -bool X86PartialReduction::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - auto &TM = TPC->getTM<X86TargetMachine>(); - ST = TM.getSubtargetImpl(F); - +bool X86PartialReduction::run(Function &F) { + ST = TM->getSubtargetImpl(F); DL = &F.getDataLayout(); bool MadeChange = false; @@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) { return MadeChange; } + +bool X86PartialReductionLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F); +} + +PreservedAnalyses X86PartialReductionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + bool Changed = X86PartialReduction(TM).run(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index fc25d55..db25594 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -15,14 +15,14 @@ #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) +FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) -DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a76abc..5f0bcab 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -97,7 +97,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86LoadValueInjectionLoadHardeningPassPass(PR); 
initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); - initializeX86PartialReductionPass(PR); + initializeX86PartialReductionLegacyPass(PR); initializePseudoProbeInserterPass(PR); initializeX86ReturnThunksPass(PR); initializeX86DAGToDAGISelLegacyPass(PR); @@ -422,14 +422,14 @@ void X86PassConfig::addIRPasses() { // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. - addPass(createX86LowerAMXIntrinsicsPass()); + addPass(createX86LowerAMXIntrinsicsLegacyPass()); addPass(createX86LowerAMXTypeLegacyPass()); TargetPassConfig::addIRPasses(); if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createInterleavedAccessPass()); - addPass(createX86PartialReductionPass()); + addPass(createX86PartialReductionLegacyPass()); } // Add passes that handle indirect branch removal and insertion of a retpoline diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp index f6f7e92..2f28ab3 100644 --- a/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -66,7 +66,7 @@ namespace { MachineBasicBlock &MBB); void addDirtySuccessor(MachineBasicBlock &MBB); - using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; + enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; static const char* getBlockExitStateName(BlockExitState ST); diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp index bd4d4eb..5977a27 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp @@ -320,7 +320,7 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo, case Xtensa::SSIP: case Xtensa::LSI: case Xtensa::LSIP: - + case Xtensa::S32C1I: if (Res & 0x3) { report_fatal_error("Unexpected operand value!"); } diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp index 4e73070..8d0fd07 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp @@ -202,7 +202,7 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits, return FeatureBits[Xtensa::FeatureWindowed]; case Xtensa::ATOMCTL: case Xtensa::SCOMPARE1: - return FeatureBits[Xtensa::FeatureWindowed]; + return FeatureBits[Xtensa::FeatureS32C1I]; case Xtensa::NoRegister: return false; } diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index b0f924f..be69cef 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -114,14 +114,31 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest, bool RenamableSrc) const { - // The MOV instruction is not present in core ISA, + unsigned Opcode; + + // The MOV instruction is not present in core ISA for AR registers, // so use OR instruction. 
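// Illustrative sketch, not part of the patch: the register copies this
// function can now emit (the FPR cases assume FeatureSingleFloat):
//
//   AR  <- AR  :  or    aDst, aSrc, aSrc   // aSrc | aSrc == aSrc
//   FPR <- FPR :  mov.s fDst, fSrc
//   AR  <- FPR :  rfr   aDst, fSrc
//   FPR <- AR  :  wfr   fDst, aSrc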
- if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) + if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) { BuildMI(MBB, MBBI, DL, get(Xtensa::OR), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)) .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) && + Xtensa::FPRRegClass.contains(DestReg)) + Opcode = Xtensa::MOV_S; + else if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) && + Xtensa::ARRegClass.contains(DestReg)) + Opcode = Xtensa::RFR; + else if (STI.hasSingleFloat() && Xtensa::ARRegClass.contains(SrcReg) && + Xtensa::FPRRegClass.contains(DestReg)) + Opcode = Xtensa::WFR; else report_fatal_error("Impossible reg-to-reg copy"); + + BuildMI(MBB, MBBI, DL, get(Opcode), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } void XtensaInstrInfo::storeRegToStackSlot( diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 7a95df4..b575d76 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N); // We can't know the precise weights here, as they would depend on the value // distribution of Call->getArgOperand(1). So we just mark it as "unknown". - setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); Type *IndexTy = DL.getIndexType(Call->getType()); SmallVector<DominatorTree::UpdateType, 8> Updates; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index d85e4f7..9bdd8cb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -479,7 +479,7 @@ private: const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr); - setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F); return Sel; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c..b158e0f 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec..ceeece4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct 
MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). - Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). 
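// Illustrative sketch, not part of the patch, of the per-element rule the
// code below encodes (Sa/Sb: operand shadows, Va/Vb: operand values, NZ:
// "is non-zero"):
//
//   bool OutPoisoned = (SaNZ && SbNZ)  // both operands carry poison
//                   || (VaNZ && SbNZ)  // b poisoned, a not a known zero
//                   || (SaNZ && VbNZ); // a poisoned, b not a known zero
//
// An element therefore stays clean only if each operand is fully initialized
// or the other operand is an initialized zero.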
+ Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. // (The real pmadd intrinsic would have computed intermediate values of @@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: case Intrinsic::x86_avx512_pmaddubs_w_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true); break; // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // AVX Vector Neural Network Instructions: bytes @@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx2_vpdpbuuds_128: case Intrinsic::x86_avx2_vpdpbuuds_256: case Intrinsic::x86_avx10_vpdpbuuds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // AVX Vector Neural Network Instructions: words @@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 19eccb9..9ffa602 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1796,14 +1796,16 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. 
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { @@ -2092,14 +2094,16 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 019536ca..9070d25 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -72,6 +72,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -105,6 +106,7 @@ STATISTIC( STATISTIC(NumShiftUntilZero, "Number of uncountable loops recognized as 'shift until zero' idiom"); +namespace llvm { bool DisableLIRP::All; static cl::opt<bool, true> DisableLIRPAll("disable-" DEBUG_TYPE "-all", @@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic( cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), cl::Hidden); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + +} // namespace llvm + namespace { class LoopIdiomRecognize { @@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { // The loop trip count check. auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, CurLoop->getName() + ".ivcheck"); - Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1)) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. + setBranchWeights(*BI, BranchWeights, + /*IsExpected=*/false); + } + LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. @@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE, /// %start = <...> /// %extraoffset = <...> /// <...> -/// br label %for.cond +/// br label %loop /// /// loop: -/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] +/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ] /// %nbits = add nsw i8 %iv, %extraoffset /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 @@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() { // The loop terminator. 
Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); - Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (InvertedCond) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. + setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false); + } LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a883998..1b770be 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ public: // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ public: // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. 
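// Illustrative, mirroring the A[i+1] = A[i] example on
// isDependenceDistanceOfOne above:
//
//   for (i = 0; i < n; ++i) {
//     A[i + 1] = x;  // store in iteration i
//     y = A[i];      // load reads the value stored in iteration i-1
//   }
//
// Forwarding is only sound when both pointers stride by one element per
// iteration, which the getPtrStride() calls (now passed the DominatorTree)
// help establish.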
- if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index b9546c5..e902b71 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -393,6 +394,17 @@ private: DTUpdates.push_back({DominatorTree::Insert, Preheader, BB}); ++NumLoopExitsDeleted; } + // We don't really need to add branch weights to DummySwitch, because all + // but one branches are just a temporary artifact - see the comment on top + // of this function. But, it's easy to estimate the weights, and it helps + // maintain a property of the overall compiler - that the branch weights + // don't "just get dropped" accidentally (i.e. profcheck) + if (DummySwitch->getParent()->getParent()->hasProfileData()) { + SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases()); + // default. 100% probability, the rest are dead. + DummyBranchWeights[0] = 1; + setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false); + } assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?"); if (Loop *OuterLoop = LI.getLoopFor(Preheader)) { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d8..802ae4e 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 239526e..0f3e664 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -329,8 +330,7 @@ static void buildPartialUnswitchConditionalBranch( HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) : nullptr); if (!HasBranchWeights) - setExplicitlyUnknownBranchWeightsIfProfiled( - *BR, *BR->getParent()->getParent(), DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Copy a set of loop invariant values, and conditionally branch on them. @@ -388,8 +388,7 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? 
&NormalSucc : &UnswitchedSucc, ProfData); if (!ProfData) - setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2831,9 +2830,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -3197,10 +3201,14 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) + : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 573a781..02b73e8 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1283,6 +1283,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_ilogbl: case LibFunc_logf: case LibFunc_logl: + case LibFunc_nextafter: + case LibFunc_nextafterf: + case LibFunc_nextafterl: + case LibFunc_nexttoward: + case LibFunc_nexttowardf: + case LibFunc_nexttowardl: case LibFunc_pow: case LibFunc_powf: case LibFunc_powl: diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f..6086615 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. 
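The guard-lowering change in the turnGuardIntoBranch hunk above relies on MDBuilder to synthesize the "deopt is cold" annotation. Reduced to its core, a sketch with a hypothetical branch BI whose true edge is the deopt path (the shape SplitBlockAndInsertIfThen produces):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

// Tag a conditional branch whose true edge is the deopt path so that edge
// is treated as cold; createUnlikelyBranchWeights() gives the first
// successor a tiny weight relative to the second.
static void markTrueEdgeUnlikely(llvm::BranchInst &BI) {
  llvm::MDNode *Weights =
      llvm::MDBuilder(BI.getContext()).createUnlikelyBranchWeights();
  BI.setMetadata(llvm::LLVMContext::MD_prof, Weights);
}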
case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51..dd8706c 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); - for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { - if (Impl == RTLIB::Unsupported) + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (!RTLCI.isAvailable(Impl)) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. + if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 1e8f6cc..6c9467b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// probability of executing at least one more iteration? static BranchProbability probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. 
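One way to see the division-by-zero hazard described in the comment above (a sketch of the underlying algebra, not the exact code): under a model where the backedge is taken with probability p on each iteration, the mass assigned to trip counts in a fixed residue class a modulo the unroll count C is a geometric series,

\sum_{k=0}^{\infty} p^{a+kC} = \frac{p^{a}}{1 - p^{C}},

whose denominator 1 - p^C vanishes exactly when p = 1, i.e. for the always-infinite loop. That is why this case is peeled off and answered with probability one before the general computation runs.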
+ // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. BranchProbability ProbOne = OriginalLoopProb; // 1 <= m diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index ec2e6c1..9c8b6ef 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -109,8 +110,12 @@ void LoopVersioning::versionLoop( // Insert the conditional branch based on the result of the memchecks. Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); Builder.SetInsertPoint(OrigTerm); - Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader()); + auto *BI = + Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), + VersionedLoop->getLoopPreheader()); + // We don't know what the probability of executing the versioned vs the + // unversioned variants is. + setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. This is now dominated by the diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e..37c048f 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -778,8 +778,10 @@ private: return false; // Add all values from the range to the set - for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) + APInt Tmp = Span.getLower(); + do Vals.push_back(ConstantInt::get(I->getContext(), Tmp)); + while (++Tmp != Span.getUpper()); UsedICmps++; return true; @@ -5212,8 +5214,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, // We don't have any info about this condition. auto *Br = TrueWhenEqual ? 
Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB) : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); - setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE); OldTI->eraseFromParent(); @@ -6020,6 +6021,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6039,15 +6042,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6058,33 +6064,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. - // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. - if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. 
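The single-missing-case replacement handled in this hunk relies on a XOR identity: when NumUnknownBits > 1, every bit position is set in an even number of the 2^NumUnknownBits possible condition values, so the XOR of all of them is zero, and XOR-ing just the values that appear as cases leaves exactly the absent one. A worked example with two unknown bits (values illustrative):

// Possible values are {0,1,2,3} and 0^1^2^3 == 0, so XOR of the present
// cases {0,1,3} reproduces the missing value 2. Requires <cassert>.
unsigned Missing = 0;
for (unsigned CaseVal : {0u, 1u, 3u})
  Missing ^= CaseVal;
assert(Missing == 2 && "2 was the absent case value");

This is also why the code asserts NumUnknownBits > 1: with a single unknown bit the two possible values XOR to 1, not 0, and the identity fails (that shape is canonicalized to a branch anyway).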
+ // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } @@ -7570,6 +7584,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. + for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -7642,19 +7731,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, // label. The other is those powers of 2 that don't appear in the case // statement. 
We don't know the distribution of the values coming in, so // the safest is to split 50-50 the original probability to `default`. - uint64_t OrigDenominator = sum_of(map_range( - Weights, [](const auto &V) { return static_cast<uint64_t>(V); })); + uint64_t OrigDenominator = + sum_of(map_range(Weights, StaticCastTo<uint64_t>)); SmallVector<uint64_t> NewWeights(2); NewWeights[1] = Weights[0] / 2; NewWeights[0] = OrigDenominator - NewWeights[1]; setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false); - - // For the original switch, we reduce the weight of the default by the - // amount by which the previous branch contributes to getting to default, - // and then make sure the remaining weights have the same relative ratio - // wrt eachother. + // The probability of executing the default block stays constant. It was + // p_d = Weights[0] / OrigDenominator + // we rewrite as W/D + // We want to find the probability of the default branch of the switch + // statement. Let's call it X. We have W/D = W/2D + X * (1-W/2D) + // i.e. the original probability is the probability we go to the default + // branch from the BI branch, or we take the default branch on the SI. + // Meaning X = W / (2D - W), or (W/2) / (D - W/2) + // This matches using W/2 for the default branch probability numerator and + // D-W/2 as the denominator. + Weights[0] = NewWeights[1]; uint64_t CasesDenominator = OrigDenominator - Weights[0]; - Weights[0] /= 2; for (auto &W : drop_begin(Weights)) W = NewWeights[0] * static_cast<double>(W) / CasesDenominator; @@ -8037,6 +8131,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16..03112c6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e5c3f17..906fa2f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7550,13 +7550,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, } if (LoadInst *Load = dyn_cast<LoadInst>(I)) return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - Load->getAlign(), VPIRMetadata(*Load, LVer), - I->getDebugLoc()); + VPIRMetadata(*Load, LVer), I->getDebugLoc()); StoreInst *Store = cast<StoreInst>(I); return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, - Reverse, Store->getAlign(), - VPIRMetadata(*Store, LVer), I->getDebugLoc()); + Reverse, VPIRMetadata(*Store, LVer), + I->getDebugLoc()); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. 
If needed, it will also diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 428a8f4..dd26a05 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -304,18 +304,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { } bool IsSingleScalar = vputils::isSingleScalar(Def); - VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1); - // Check if there is a scalar value for the selected lane. - if (!hasScalarValue(Def, LastLane)) { - // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be a single scalar. - assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe, - VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && - "unexpected recipe found to be invariant"); - IsSingleScalar = true; - LastLane = 0; - } // We need to construct the vector value for a single-scalar value by // broadcasting the scalar to all lanes. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cfe1f1e..3062e1c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1163,10 +1163,10 @@ public: bool opcodeMayReadOrWriteFromMemory() const; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override; + bool usesFirstPartOnly(const VPValue *Op) const override; /// Returns true if this VPInstruction produces a scalar value from a vector, /// e.g. by performing a reduction or extracting a lane. @@ -1393,13 +1393,13 @@ public: return true; } - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1628,7 +1628,7 @@ public: VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; }; /// A recipe for widening Call instructions using library calls. @@ -1725,7 +1725,9 @@ public: #endif }; -/// A recipe for widening select instructions. +/// A recipe for widening select instructions. Supports both wide vector and +/// single-scalar conditions, matching the behavior of LLVM IR's select +/// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenSelectRecipe(SelectInst &I, ArrayRef<VPValue *> Operands) @@ -1765,7 +1767,7 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getCond() && isInvariantCond(); @@ -1831,7 +1833,7 @@ public: #endif /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Op == getOperand(0)) @@ -1868,7 +1870,7 @@ public: void execute(VPTransformState &State) override; - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1882,7 +1884,7 @@ public: } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -1920,14 +1922,14 @@ public: Type *getSourceElementType() const { return SourceElementTy; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -2108,7 +2110,7 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // The recipe creates its own wide start value, so it only requests the @@ -2323,7 +2325,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getStartValue(); @@ -2397,7 +2399,7 @@ public: bool isInLoop() const { return IsInLoop; } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return isOrdered() || isInLoop(); @@ -2466,13 +2468,13 @@ public: #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Recursing through Blend recipes only, must terminate at header phi's the // latest. return all_of(users(), - [this](VPUser *U) { return U->onlyFirstLaneUsed(this); }); + [this](VPUser *U) { return U->usesFirstLaneOnly(this); }); } }; @@ -2560,7 +2562,7 @@ public: VPCostContext &Ctx) const override; /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override = 0; + bool usesFirstLaneOnly(const VPValue *Op) const override = 0; /// Returns the number of stored operands of this interleave group. Returns 0 /// for load interleave groups. @@ -2606,7 +2608,7 @@ public: VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); @@ -2654,7 +2656,7 @@ public: #endif /// The recipe only uses the first lane of the address, and EVL operand. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) || @@ -2860,7 +2862,7 @@ public: VPValue *getEVL() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getEVL(); @@ -2922,7 +2924,7 @@ public: bool isPredicated() const { return IsPredicated; } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return isSingleScalar(); @@ -3204,14 +3206,14 @@ protected: VPWidenMemoryRecipe(const char unsigned SC, Instruction &I, std::initializer_list<VPValue *> Operands, - bool Consecutive, bool Reverse, Align Alignment, + bool Consecutive, bool Reverse, const VPIRMetadata &Metadata, DebugLoc DL) : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I), - Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) { + Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive), + Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - assert(isa<VPVectorEndPointerRecipe>(getAddr()) || - !Reverse && - "Reversed acccess without VPVectorEndPointerRecipe address?"); + assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) && + "Reversed acccess without VPVectorEndPointerRecipe address?"); } public: @@ -3271,18 +3273,18 @@ public: struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse, Align Alignment, + bool Consecutive, bool Reverse, const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive, - Reverse, Alignment, Metadata, DL), + Reverse, Metadata, DL), VPValue(this, &Load) { setMask(Mask); } VPWidenLoadRecipe *clone() override { return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(), - getMask(), Consecutive, Reverse, getAlign(), - *this, getDebugLoc()); + getMask(), Consecutive, Reverse, *this, + getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC); @@ -3297,7 +3299,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, #endif /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened, consecutive loads operations only demand the first lane of @@ -3313,8 +3315,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL, VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(), - {Addr, &EVL}, L.isConsecutive(), L.isReverse(), - L.getAlign(), L, L.getDebugLoc()), + {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L, + L.getDebugLoc()), VPValue(this, &getIngredient()) { setMask(Mask); } @@ -3338,7 +3340,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened loads only demand the first lane of EVL and consecutive loads @@ -3352,16 +3354,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal, VPValue *Mask, bool Consecutive, bool Reverse, - Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL) + const VPIRMetadata &Metadata, DebugLoc DL) : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal}, - Consecutive, Reverse, Alignment, Metadata, DL) { + Consecutive, Reverse, Metadata, DL) { setMask(Mask); } VPWidenStoreRecipe *clone() override { return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(), getStoredValue(), getMask(), Consecutive, - Reverse, getAlign(), *this, getDebugLoc()); + Reverse, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC); @@ -3379,7 +3381,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Widened, consecutive stores only demand the first lane of their address, @@ -3396,7 +3398,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { VPValue *Mask) : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(), {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(), - S.isReverse(), S.getAlign(), S, S.getDebugLoc()) { + S.isReverse(), S, S.getDebugLoc()) { setMask(Mask); } @@ -3422,7 +3424,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Op == getEVL()) { @@ -3506,14 +3508,14 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3588,7 +3590,7 @@ public: } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3698,7 +3700,7 @@ public: VPValue *getStepValue() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3763,7 +3765,7 @@ public: VPValue *getStepValue() const { return getOperand(1); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1ee405a..80cd112 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -659,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::Select: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); + Value *Cond = + State.get(getOperand(0), + OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0))); Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); return Builder.CreateSelect(Cond, Op1, Op2, Name); @@ -1274,7 +1276,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { } } -bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) return vputils::onlyFirstLaneUsed(this); @@ -1323,7 +1325,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { llvm_unreachable("switch should return"); } -bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) return vputils::onlyFirstPartUsed(this); @@ -1690,7 +1692,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { if (!VFTy->getParamType(I.index())->isVectorTy()) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); Args.push_back(Arg); } @@ -1759,7 +1761,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState 
&State) { State.TTI)) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), State.TTI)) TysForDecl.push_back(Arg->getType()); @@ -1841,7 +1843,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } -bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return all_of(enumerate(operands()), [this, &Op](const auto &X) { auto [Idx, V] = X; @@ -1968,16 +1970,13 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(1)->printAsOperand(O, SlotTracker); O << ", "; getOperand(2)->printAsOperand(O, SlotTracker); - O << (isInvariantCond() ? " (condition is loop invariant)" : ""); + O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)" + : ""); } #endif void VPWidenSelectRecipe::execute(VPTransformState &State) { - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - Value *Cond = State.get(getCond(), isInvariantCond()); + Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond())); Value *Op0 = State.get(getOperand(1)); Value *Op1 = State.get(getOperand(2)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9d9bb14..48bd697 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -91,14 +91,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, Load->getAlign(), - VPIRMetadata(*Load), Ingredient.getDebugLoc()); + false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), + Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - Store->getAlign(), VPIRMetadata(*Store), - Ingredient.getDebugLoc()); + VPIRMetadata(*Store), Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) { @@ -154,27 +153,31 @@ static bool sinkScalarOperands(VPlan &Plan) { bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; - auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo, - VPSingleDefRecipe *Candidate) { - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. 
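The sinkScalarOperands rewrite above funnels candidate discovery through InsertIfValidSinkCandidate so the same validity checks guard both the seeding loop and the re-scan after a successful sink. The underlying worklist shape, reduced to a sketch (names hypothetical):

#include "llvm/ADT/SetVector.h"

// A SetVector de-duplicates work items while preserving discovery order;
// indexing (rather than iterators) stays valid while Visit appends new
// items during the scan.
template <typename T, typename VisitorT>
void drainWorklist(llvm::SetVector<T> &Worklist, VisitorT Visit) {
  for (unsigned I = 0; I != Worklist.size(); ++I) {
    T Item = Worklist[I]; // Copy: the underlying vector may grow below.
    Visit(Item, Worklist); // Visit may call Worklist.insert(...).
  }
}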
if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) - return false; + return; - if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || - Candidate->mayReadOrWriteMemory()) - return false; + if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate)) + return; if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) if (!ScalarVFOnly && RepR->isSingleScalar()) - return false; + return; - return true; + WorkList.insert({SinkTo, Candidate}); }; // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) @@ -182,14 +185,9 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) { - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(VPBB, Def)) - WorkList.insert({VPBB, Def}); - } - } + for (auto &Recipe : *VPBB) + for (VPValue *Op : Recipe.operands()) + InsertIfValidSinkCandidate(VPBB, Op); } // Try to sink each replicate or scalar IV steps recipe in the worklist. @@ -198,15 +196,15 @@ static bool sinkScalarOperands(VPlan &Plan) { VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must have only their first lane used. In + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In // the latter case, we need to duplicate SinkCandidate. auto UsersOutsideSinkTo = make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { return cast<VPRecipeBase>(U)->getParent() != SinkTo; }); if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { - return !U->onlyFirstLaneUsed(SinkCandidate); + return !U->usesFirstLaneOnly(SinkCandidate); })) continue; bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); @@ -234,10 +232,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(SinkTo, Def)) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; @@ -1290,6 +1285,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { return; } + // Look through broadcast of single-scalar when used as select conditions; in + // that case the scalar condition can be used directly. + if (match(Def, + m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue())) && + vputils::isSingleScalar(C)) { + Def->setOperand(0, C); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -4178,6 +4182,59 @@ static bool isAlreadyNarrow(VPValue *VPV) { return RepR && RepR->isSingleScalar(); } +// Convert a wide recipe defining a VPValue \p V feeding an interleave group to +// a narrow variant. 
+static VPValue * +narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) { + auto *R = V->getDefiningRecipe(); + if (!R || NarrowedOps.contains(V)) + return V; + + if (isAlreadyNarrow(V)) + return V; + + if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) { + for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) + WideMember0->setOperand( + Idx, + narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps)); + return V; + } + + if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { + // Narrow interleave group to wide load, as transformed VPlan will only + // process one original iteration. + auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); + auto *L = new VPWidenLoadRecipe( + *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, + /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); + L->insertBefore(LoadGroup); + NarrowedOps.insert(L); + return L; + } + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { + assert(RepR->isSingleScalar() && + isa<LoadInst>(RepR->getUnderlyingInstr()) && + "must be a single scalar load"); + NarrowedOps.insert(RepR); + return RepR; + } + + auto *WideLoad = cast<VPWidenLoadRecipe>(R); + VPValue *PtrOp = WideLoad->getAddr(); + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) + PtrOp = VecPtr->getOperand(0); + // Narrow wide load to uniform scalar load, as transformed VPlan will only + // process one original iteration. + auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, + /*IsUniform*/ true, + /*Mask*/ nullptr, *WideLoad); + N->insertBefore(WideLoad); + NarrowedOps.insert(N); + return N; +} + void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); @@ -4279,65 +4336,15 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. SmallPtrSet<VPValue *, 4> NarrowedOps; - auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * { - auto *R = V->getDefiningRecipe(); - if (!R || NarrowedOps.contains(V)) - return V; - if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { - // Narrow interleave group to wide load, as transformed VPlan will only - // process one original iteration. - auto *LI = - cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); - auto *L = new VPWidenLoadRecipe( - *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, - /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc()); - L->insertBefore(LoadGroup); - NarrowedOps.insert(L); - return L; - } - - if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { - assert(RepR->isSingleScalar() && - isa<LoadInst>(RepR->getUnderlyingInstr()) && - "must be a single scalar load"); - NarrowedOps.insert(RepR); - return RepR; - } - auto *WideLoad = cast<VPWidenLoadRecipe>(R); - - VPValue *PtrOp = WideLoad->getAddr(); - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) - PtrOp = VecPtr->getOperand(0); - // Narrow wide load to uniform scalar load, as transformed VPlan will only - // process one original iteration. - auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, - /*IsUniform*/ true, - /*Mask*/ nullptr, *WideLoad); - N->insertBefore(WideLoad); - NarrowedOps.insert(N); - return N; - }; - // Narrow operation tree rooted at store groups. 
for (auto *StoreGroup : StoreGroups) { - VPValue *Res = nullptr; - VPValue *Member0 = StoreGroup->getStoredValues()[0]; - if (isAlreadyNarrow(Member0)) { - Res = Member0; - } else if (auto *WideMember0 = - dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) { - for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) - WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx))); - Res = WideMember0; - } else { - Res = NarrowOp(Member0); - } - + VPValue *Res = + narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps); auto *SI = cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()); auto *S = new VPWidenStoreRecipe( *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, - /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc()); + /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); S->insertBefore(StoreGroup); StoreGroup->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index d6a0028..d4b8b72b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -582,7 +582,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { /// Users that only demand the first lane can use the definition for lane /// 0. DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) { - return U.onlyFirstLaneUsed(DefR); + return U.usesFirstLaneOnly(DefR); }); // Update each build vector user that currently has DefR as its only diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index c6380d3..e22c5df 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch; bool vputils::onlyFirstLaneUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); }); } bool vputils::onlyFirstPartUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); }); } bool vputils::onlyScalarValuesUsed(const VPValue *Def) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 83e3fca..5da7463 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -274,12 +274,12 @@ public: virtual bool usesScalars(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return onlyFirstLaneUsed(Op); + return usesFirstLaneOnly(Op); } /// Returns true if the VPUser only uses the first lane of operand \p Op. /// Conservatively returns false. - virtual bool onlyFirstLaneUsed(const VPValue *Op) const { + virtual bool usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; @@ -287,7 +287,7 @@ public: /// Returns true if the VPUser only uses the first part of operand \p Op. /// Conservatively returns false. 
- virtual bool onlyFirstPartUsed(const VPValue *Op) const { + virtual bool usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a1..34754a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00d..27a8bbd 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check whether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // is guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze. + SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
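On the final VectorCombine hunk: the freeze exists because scalarizeExtExtract rewrites per-lane extracts into shifts of one wide integer, which lets poison in any single lane contaminate every extracted value. The new check proves that contamination is unobservable, since poison reaching any extract already makes the program undefined. The essence of the test, simplified (the real code additionally requires every lane to be extracted and all extracts to share Ext's block):

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"

// Freeze can be omitted only if poison reaching any extract is already UB
// and execution is guaranteed to get from Ext to the last extract.
static bool poisonAlreadyTriggersUB(llvm::Instruction *Ext,
                                    llvm::Instruction *LastExtract) {
  for (llvm::User *U : Ext->users())
    if (!llvm::programUndefinedIfPoison(llvm::cast<llvm::Instruction>(U)))
      return false; // Some lane's poison could escape silently.
  return llvm::isGuaranteedToTransferExecutionToSuccessor(
      Ext->getIterator(), LastExtract->getIterator());
}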
