Diffstat (limited to 'llvm/lib')
44 files changed, 767 insertions, 329 deletions
diff --git a/llvm/lib/Analysis/StaticDataProfileInfo.cpp b/llvm/lib/Analysis/StaticDataProfileInfo.cpp index e7f0b2c..61d4935 100644 --- a/llvm/lib/Analysis/StaticDataProfileInfo.cpp +++ b/llvm/lib/Analysis/StaticDataProfileInfo.cpp @@ -1,10 +1,14 @@ #include "llvm/Analysis/StaticDataProfileInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" +#define DEBUG_TYPE "static-data-profile-info" + using namespace llvm; namespace llvm { @@ -79,6 +83,17 @@ StaticDataProfileInfo::getConstantHotnessUsingProfileCount( return StaticDataHotness::LukewarmOrUnknown; } +StaticDataProfileInfo::StaticDataHotness +StaticDataProfileInfo::getSectionHotnessUsingDataAccessProfile( + std::optional<StringRef> MaybeSectionPrefix) const { + if (!MaybeSectionPrefix) + return StaticDataHotness::LukewarmOrUnknown; + StringRef Prefix = *MaybeSectionPrefix; + assert((Prefix == "hot" || Prefix == "unlikely") && + "Expect section_prefix to be one of hot or unlikely"); + return Prefix == "hot" ? StaticDataHotness::Hot : StaticDataHotness::Cold; +} + StringRef StaticDataProfileInfo::hotnessToStr(StaticDataHotness Hotness) const { switch (Hotness) { case StaticDataHotness::Cold: @@ -101,13 +116,66 @@ StaticDataProfileInfo::getConstantProfileCount(const Constant *C) const { StringRef StaticDataProfileInfo::getConstantSectionPrefix( const Constant *C, const ProfileSummaryInfo *PSI) const { std::optional<uint64_t> Count = getConstantProfileCount(C); + +#ifndef NDEBUG + auto DbgPrintPrefix = [](StringRef Prefix) { + return Prefix.empty() ? "<empty>" : Prefix; + }; +#endif + + if (EnableDataAccessProf) { + // Module flag `EnableDataAccessProf` is 1 -> empty section prefix means + // unknown hotness except for string literals. + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C); + GV && llvm::memprof::IsAnnotationOK(*GV) && + !GV->getName().starts_with(".str")) { + auto HotnessFromDataAccessProf = + getSectionHotnessUsingDataAccessProfile(GV->getSectionPrefix()); + + if (!Count) { + StringRef Prefix = hotnessToStr(HotnessFromDataAccessProf); + LLVM_DEBUG(dbgs() << GV->getName() << " has section prefix " + << DbgPrintPrefix(Prefix) + << ", solely from data access profiles\n"); + return Prefix; + } + + // Both data access profiles and PGO counters are available. Use the + // hotter one.
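The selection logic in the hunk that continues below picks the hotter of the two sources. Under an enum ordering of Cold < LukewarmOrUnknown < Hot (assumed here; the patch spells the comparison out as an if/else ladder), the rule reduces to a max. A minimal standalone model:

#include <algorithm>

// Hotness model, assuming the enumerators are declared in increasing
// hotness order so that std::max picks the hotter classification.
enum class Hotness { Cold, LukewarmOrUnknown, Hot };

Hotness mergeHotness(Hotness FromDataAccessProf, Hotness FromPGO) {
  return std::max(FromDataAccessProf, FromPGO); // hotter source wins
}

For example, merging Hot with Cold yields Hot, and merging Cold with LukewarmOrUnknown yields LukewarmOrUnknown, matching the ladder in the hunk below.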
+ auto HotnessFromPGO = getConstantHotnessUsingProfileCount(C, PSI, *Count); + StaticDataHotness GlobalVarHotness = StaticDataHotness::LukewarmOrUnknown; + if (HotnessFromDataAccessProf == StaticDataHotness::Hot || + HotnessFromPGO == StaticDataHotness::Hot) { + GlobalVarHotness = StaticDataHotness::Hot; + } else if (HotnessFromDataAccessProf == + StaticDataHotness::LukewarmOrUnknown || + HotnessFromPGO == StaticDataHotness::LukewarmOrUnknown) { + GlobalVarHotness = StaticDataHotness::LukewarmOrUnknown; + } else { + GlobalVarHotness = StaticDataHotness::Cold; + } + StringRef Prefix = hotnessToStr(GlobalVarHotness); + LLVM_DEBUG( + dbgs() << GV->getName() << " has section prefix " + << DbgPrintPrefix(Prefix) + << ", the max from data access profiles as " + << DbgPrintPrefix(hotnessToStr(HotnessFromDataAccessProf)) + << " and PGO counters as " + << DbgPrintPrefix(hotnessToStr(HotnessFromPGO)) << "\n"); + return Prefix; + } + } if (!Count) return ""; return hotnessToStr(getConstantHotnessUsingProfileCount(C, PSI, *Count)); } bool StaticDataProfileInfoWrapperPass::doInitialization(Module &M) { - Info.reset(new StaticDataProfileInfo()); + bool EnableDataAccessProf = false; + if (auto *MD = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("EnableDataAccessProf"))) + EnableDataAccessProf = MD->getZExtValue(); + Info.reset(new StaticDataProfileInfo(EnableDataAccessProf)); return false; } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp index 25c1db9..ded4df4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp @@ -55,12 +55,10 @@ LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx, } LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx, - LLT NewEltTy) { + ElementCount EC) { return [=](const LegalityQuery &Query) { const LLT OldTy = Query.Types[TypeIdx]; - ElementCount NewEltCount = NewEltTy.isVector() ? NewEltTy.getElementCount() - : ElementCount::getFixed(1); - return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount)); + return std::make_pair(TypeIdx, OldTy.changeElementCount(EC)); }; } diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 055fdc6..ca82857 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -818,8 +818,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, if (!DefMI) return false; - const TargetMachine& TM = DefMI->getMF()->getTarget(); - if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) + if (DefMI->getFlag(MachineInstr::FmNoNans)) return true; // If the value is a constant, we can obviously see if it is a NaN or not. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 0f2b518..cb0038c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3971,8 +3971,14 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) { } void SelectionDAGBuilder::visitPtrToAddr(const User &I) { - // FIXME: this is not correct for pointers with addr width != pointer width - visitPtrToInt(I); + SDValue N = getValue(I.getOperand(0)); + // By definition the type of the ptrtoaddr must be equal to the address type. 
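A small illustration of why a truncate is the right lowering for ptrtoaddr, using a hypothetical 64-bit pointer representation whose low 32 bits are the address (the tag layout is assumed for the example, not taken from any real target); the hunk continues below:

#include <cstdint>

// Hypothetical fat/tagged pointer: high bits carry a tag, low 32 bits are
// the address. ptrtoaddr keeps only the address bits, i.e. a truncate of
// the representation value; when address and pointer widths match, the
// truncate folds to a no-op.
uint64_t PtrRep = 0xABCD'0000'0000'1234ULL;      // tag | address
uint32_t Addr = static_cast<uint32_t>(PtrRep);   // == 0x1234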
+ const auto &TLI = DAG.getTargetLoweringInfo(); + EVT AddrVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + // The address width must be smaller or equal to the pointer representation + // width, so we lower ptrtoaddr as a truncate (possibly folded to a no-op). + N = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), AddrVT, N); + setValue(&I, N); } void SelectionDAGBuilder::visitPtrToInt(const User &I) { diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index 0ffe3ae..f343925 100644 --- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -56,6 +56,7 @@ add_llvm_component_library(LLVMOrcJIT SectCreate.cpp SelfExecutorProcessControl.cpp SimpleRemoteEPC.cpp + SimpleRemoteMemoryMapper.cpp Speculation.cpp SpeculateAnalyses.cpp ExecutorProcessControl.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp index 50e6b25..0833af7 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp @@ -57,16 +57,17 @@ public: std::swap(FR.Actions, G.allocActions()); Parent.EPC.callSPSWrapperAsync< - rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>( - Parent.SAs.Finalize, + rt::SPSSimpleExecutorMemoryManagerInitializeSignature>( + Parent.SAs.Initialize, [OnFinalize = std::move(OnFinalize), AllocAddr = this->AllocAddr]( - Error SerializationErr, Error FinalizeErr) mutable { + Error SerializationErr, + Expected<ExecutorAddr> InitializeKey) mutable { // FIXME: Release abandoned alloc. if (SerializationErr) { - cantFail(std::move(FinalizeErr)); + cantFail(InitializeKey.takeError()); OnFinalize(std::move(SerializationErr)); - } else if (FinalizeErr) - OnFinalize(std::move(FinalizeErr)); + } else if (!InitializeKey) + OnFinalize(InitializeKey.takeError()); else OnFinalize(FinalizedAlloc(AllocAddr)); }, @@ -76,8 +77,8 @@ public: void abandon(OnAbandonedFunction OnAbandoned) override { // FIXME: Return memory to pool instead. 
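The callbacks in this file all follow the same two-error convention: the wrapper hands back a transport/serialization Error plus the remote call's own result, a serialization failure takes precedence, and the result's error slot must still be drained. A minimal sketch of that convention (with int standing in for the real result type; the abandon() hunk continues below):

#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/Support/Error.h"
using namespace llvm;

// Sketch of the callSPSWrapperAsync callback convention seen above: on a
// serialization failure the payload cannot carry a real error, but its
// Expected must still be consumed before reporting the failure.
static void relay(Error SerializationErr, Expected<int> Result,
                  function_ref<void(Expected<int>)> OnDone) {
  if (SerializationErr) {
    cantFail(Result.takeError()); // drain the (necessarily empty) error slot
    return OnDone(std::move(SerializationErr));
  }
  OnDone(std::move(Result)); // value or remote error, forwarded unchanged
}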
Parent.EPC.callSPSWrapperAsync< - rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>( - Parent.SAs.Deallocate, + rt::SPSSimpleExecutorMemoryManagerReleaseSignature>( + Parent.SAs.Release, [OnAbandoned = std::move(OnAbandoned)](Error SerializationErr, Error DeallocateErr) mutable { if (SerializationErr) { @@ -123,9 +124,8 @@ void EPCGenericJITLinkMemoryManager::allocate(const JITLinkDylib *JD, void EPCGenericJITLinkMemoryManager::deallocate( std::vector<FinalizedAlloc> Allocs, OnDeallocatedFunction OnDeallocated) { - EPC.callSPSWrapperAsync< - rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>( - SAs.Deallocate, + EPC.callSPSWrapperAsync<rt::SPSSimpleExecutorMemoryManagerReleaseSignature>( + SAs.Release, [OnDeallocated = std::move(OnDeallocated)](Error SerErr, Error DeallocErr) mutable { if (SerErr) { diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp index fec7062..cc72488 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp @@ -25,9 +25,9 @@ EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols( if (auto Err = EPC.getBootstrapSymbols( {{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName}, {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName}, - {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName}, - {SAs.Deallocate, - rt::SimpleExecutorMemoryManagerDeallocateWrapperName}, + {SAs.Initialize, + rt::SimpleExecutorMemoryManagerInitializeWrapperName}, + {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}, {SAs.RegisterEHFrame, rt::RegisterEHFrameSectionAllocActionName}, {SAs.DeregisterEHFrame, rt::DeregisterEHFrameSectionAllocActionName}})) @@ -48,7 +48,7 @@ EPCGenericRTDyldMemoryManager::~EPCGenericRTDyldMemoryManager() { Error Err = Error::success(); if (auto Err2 = EPC.callSPSWrapper< - rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>( + rt::SPSSimpleExecutorMemoryManagerReleaseSignature>( SAs.Reserve, Err, SAs.Instance, FinalizedAllocs)) { // FIXME: Report errors through EPC once that functionality is available. logAllUnhandledErrors(std::move(Err2), errs(), ""); @@ -267,10 +267,10 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) { // We'll also need to make an extra allocation for the eh-frame wrapper call // arguments. 
- Error FinalizeErr = Error::success(); + Expected<ExecutorAddr> InitializeKey((ExecutorAddr())); if (auto Err = EPC.callSPSWrapper< - rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>( - SAs.Finalize, FinalizeErr, SAs.Instance, std::move(FR))) { + rt::SPSSimpleExecutorMemoryManagerInitializeSignature>( + SAs.Initialize, InitializeKey, SAs.Instance, std::move(FR))) { std::lock_guard<std::mutex> Lock(M); this->ErrMsg = toString(std::move(Err)); dbgs() << "Serialization error: " << this->ErrMsg << "\n"; @@ -278,9 +278,9 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) { *ErrMsg = this->ErrMsg; return true; } - if (FinalizeErr) { + if (!InitializeKey) { std::lock_guard<std::mutex> Lock(M); - this->ErrMsg = toString(std::move(FinalizeErr)); + this->ErrMsg = toString(InitializeKey.takeError()); dbgs() << "Finalization error: " << this->ErrMsg << "\n"; if (ErrMsg) *ErrMsg = this->ErrMsg; diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp index 26e8f53..cc99d3c 100644 --- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp @@ -23,10 +23,12 @@ const char *SimpleExecutorMemoryManagerInstanceName = "__llvm_orc_SimpleExecutorMemoryManager_Instance"; const char *SimpleExecutorMemoryManagerReserveWrapperName = "__llvm_orc_SimpleExecutorMemoryManager_reserve_wrapper"; -const char *SimpleExecutorMemoryManagerFinalizeWrapperName = - "__llvm_orc_SimpleExecutorMemoryManager_finalize_wrapper"; -const char *SimpleExecutorMemoryManagerDeallocateWrapperName = - "__llvm_orc_SimpleExecutorMemoryManager_deallocate_wrapper"; +const char *SimpleExecutorMemoryManagerInitializeWrapperName = + "__llvm_orc_SimpleExecutorMemoryManager_initialize_wrapper"; +const char *SimpleExecutorMemoryManagerDeinitializeWrapperName = + "__llvm_orc_SimpleExecutorMemoryManager_deinitialize_wrapper"; +const char *SimpleExecutorMemoryManagerReleaseWrapperName = + "__llvm_orc_SimpleExecutorMemoryManager_release_wrapper"; const char *ExecutorSharedMemoryMapperServiceInstanceName = "__llvm_orc_ExecutorSharedMemoryMapperService_Instance"; diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp index 87d7578..dec1df7 100644 --- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp +++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp @@ -216,9 +216,9 @@ SimpleRemoteEPC::createDefaultMemoryManager(SimpleRemoteEPC &SREPC) { if (auto Err = SREPC.getBootstrapSymbols( {{SAs.Allocator, rt::SimpleExecutorMemoryManagerInstanceName}, {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName}, - {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName}, - {SAs.Deallocate, - rt::SimpleExecutorMemoryManagerDeallocateWrapperName}})) + {SAs.Initialize, + rt::SimpleExecutorMemoryManagerInitializeWrapperName}, + {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}})) return std::move(Err); return std::make_unique<EPCGenericJITLinkMemoryManager>(SREPC, SAs); diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp new file mode 100644 index 0000000..b82de3f --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp @@ -0,0 +1,104 @@ +//===---- SimpleRemoteMemoryMapper.cpp - Remote memory mapper ----*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h" + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" + +namespace llvm::orc { + +SimpleRemoteMemoryMapper::SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC, + SymbolAddrs SAs) + : EPC(EPC), SAs(SAs) {} + +void SimpleRemoteMemoryMapper::reserve(size_t NumBytes, + OnReservedFunction OnReserved) { + EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReserveSignature>( + SAs.Reserve, + [NumBytes, OnReserved = std::move(OnReserved)]( + Error SerializationErr, Expected<ExecutorAddr> Result) mutable { + if (SerializationErr) { + cantFail(Result.takeError()); + return OnReserved(std::move(SerializationErr)); + } + + if (Result) + OnReserved(ExecutorAddrRange(*Result, NumBytes)); + else + OnReserved(Result.takeError()); + }, + SAs.Instance, static_cast<uint64_t>(NumBytes)); +} + +char *SimpleRemoteMemoryMapper::prepare(jitlink::LinkGraph &G, + ExecutorAddr Addr, size_t ContentSize) { + return G.allocateBuffer(ContentSize).data(); +} + +void SimpleRemoteMemoryMapper::initialize(MemoryMapper::AllocInfo &AI, + OnInitializedFunction OnInitialized) { + + tpctypes::FinalizeRequest FR; + + std::swap(FR.Actions, AI.Actions); + FR.Segments.reserve(AI.Segments.size()); + + for (auto Seg : AI.Segments) + FR.Segments.push_back({Seg.AG, AI.MappingBase + Seg.Offset, + Seg.ContentSize + Seg.ZeroFillSize, + ArrayRef<char>(Seg.WorkingMem, Seg.ContentSize)}); + + EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapInitializeSignature>( + SAs.Initialize, + [OnInitialized = std::move(OnInitialized)]( + Error SerializationErr, Expected<ExecutorAddr> Result) mutable { + if (SerializationErr) { + cantFail(Result.takeError()); + return OnInitialized(std::move(SerializationErr)); + } + + OnInitialized(std::move(Result)); + }, + SAs.Instance, std::move(FR)); +} + +void SimpleRemoteMemoryMapper::deinitialize( + ArrayRef<ExecutorAddr> Allocations, + MemoryMapper::OnDeinitializedFunction OnDeinitialized) { + EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>( + SAs.Deinitialize, + [OnDeinitialized = std::move(OnDeinitialized)](Error SerializationErr, + Error Result) mutable { + if (SerializationErr) { + cantFail(std::move(Result)); + return OnDeinitialized(std::move(SerializationErr)); + } + + OnDeinitialized(std::move(Result)); + }, + SAs.Instance, Allocations); +} + +void SimpleRemoteMemoryMapper::release(ArrayRef<ExecutorAddr> Bases, + OnReleasedFunction OnReleased) { + EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReleaseSignature>( + SAs.Release, + [OnReleased = std::move(OnReleased)](Error SerializationErr, + Error Result) mutable { + if (SerializationErr) { + cantFail(std::move(Result)); + return OnReleased(std::move(SerializationErr)); + } + + return OnReleased(std::move(Result)); + }, + SAs.Instance, Bases); +} + +} // namespace llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp index 3cdffb8..fe881a1 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp @@ -8,6 +8,7 @@ #include 
"llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" #include "llvm/Support/FormatVariadic.h" @@ -18,166 +19,167 @@ namespace orc { namespace rt_bootstrap { SimpleExecutorMemoryManager::~SimpleExecutorMemoryManager() { - assert(Allocations.empty() && "shutdown not called?"); + assert(Slabs.empty() && "shutdown not called?"); } -Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) { +Expected<ExecutorAddr> SimpleExecutorMemoryManager::reserve(uint64_t Size) { std::error_code EC; auto MB = sys::Memory::allocateMappedMemory( Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC); if (EC) return errorCodeToError(EC); std::lock_guard<std::mutex> Lock(M); - assert(!Allocations.count(MB.base()) && "Duplicate allocation addr"); - Allocations[MB.base()].Size = Size; + assert(!Slabs.count(MB.base()) && "Duplicate allocation addr"); + Slabs[MB.base()].Size = Size; return ExecutorAddr::fromPtr(MB.base()); } -Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) { - ExecutorAddr Base(~0ULL); +Expected<ExecutorAddr> +SimpleExecutorMemoryManager::initialize(tpctypes::FinalizeRequest &FR) { std::vector<shared::WrapperFunctionCall> DeallocationActions; - size_t SuccessfulFinalizationActions = 0; if (FR.Segments.empty()) { - // NOTE: Finalizing nothing is currently a no-op. Should it be an error? if (FR.Actions.empty()) - return Error::success(); + return make_error<StringError>("Finalization request is empty", + inconvertibleErrorCode()); else return make_error<StringError>("Finalization actions attached to empty " "finalization request", inconvertibleErrorCode()); } - for (auto &Seg : FR.Segments) - Base = std::min(Base, Seg.Addr); - - for (auto &ActPair : FR.Actions) - if (ActPair.Dealloc) - DeallocationActions.push_back(ActPair.Dealloc); - - // Get the Allocation for this finalization. - size_t AllocSize = 0; - { - std::lock_guard<std::mutex> Lock(M); - auto I = Allocations.find(Base.toPtr<void *>()); - if (I == Allocations.end()) - return make_error<StringError>("Attempt to finalize unrecognized " - "allocation " + - formatv("{0:x}", Base.getValue()), - inconvertibleErrorCode()); - AllocSize = I->second.Size; - I->second.DeallocationActions = std::move(DeallocationActions); - } - ExecutorAddr AllocEnd = Base + ExecutorAddrDiff(AllocSize); - - // Bail-out function: this will run deallocation actions corresponding to any - // completed finalization actions, then deallocate memory. - auto BailOut = [&](Error Err) { - std::pair<void *, Allocation> AllocToDestroy; - - // Get allocation to destroy. - { - std::lock_guard<std::mutex> Lock(M); - auto I = Allocations.find(Base.toPtr<void *>()); - - // Check for missing allocation (effective a double free). - if (I == Allocations.end()) - return joinErrors( - std::move(Err), - make_error<StringError>("No allocation entry found " - "for " + - formatv("{0:x}", Base.getValue()), - inconvertibleErrorCode())); - AllocToDestroy = std::move(*I); - Allocations.erase(I); - } + ExecutorAddrRange RR(FR.Segments.front().Addr, FR.Segments.front().Addr); - // Run deallocation actions for all completed finalization actions. - while (SuccessfulFinalizationActions) - Err = - joinErrors(std::move(Err), FR.Actions[--SuccessfulFinalizationActions] - .Dealloc.runWithSPSRetErrorMerged()); - - // Deallocate memory. 
- sys::MemoryBlock MB(AllocToDestroy.first, AllocToDestroy.second.Size); - if (auto EC = sys::Memory::releaseMappedMemory(MB)) - Err = joinErrors(std::move(Err), errorCodeToError(EC)); - - return Err; - }; + std::vector<sys::MemoryBlock> MBsToReset; + auto ResetMBs = make_scope_exit([&]() { + for (auto &MB : MBsToReset) + sys::Memory::protectMappedMemory(MB, sys::Memory::MF_READ | + sys::Memory::MF_WRITE); + sys::Memory::InvalidateInstructionCache(RR.Start.toPtr<void *>(), + RR.size()); + }); // Copy content and apply permissions. for (auto &Seg : FR.Segments) { + RR.Start = std::min(RR.Start, Seg.Addr); + RR.End = std::max(RR.End, Seg.Addr + Seg.Size); // Check segment ranges. if (LLVM_UNLIKELY(Seg.Size < Seg.Content.size())) - return BailOut(make_error<StringError>( + return make_error<StringError>( formatv("Segment {0:x} content size ({1:x} bytes) " "exceeds segment size ({2:x} bytes)", Seg.Addr.getValue(), Seg.Content.size(), Seg.Size), - inconvertibleErrorCode())); + inconvertibleErrorCode()); ExecutorAddr SegEnd = Seg.Addr + ExecutorAddrDiff(Seg.Size); - if (LLVM_UNLIKELY(Seg.Addr < Base || SegEnd > AllocEnd)) - return BailOut(make_error<StringError>( + if (LLVM_UNLIKELY(Seg.Addr < RR.Start || SegEnd > RR.End)) + return make_error<StringError>( formatv("Segment {0:x} -- {1:x} crosses boundary of " "allocation {2:x} -- {3:x}", - Seg.Addr.getValue(), SegEnd.getValue(), Base.getValue(), - AllocEnd.getValue()), - inconvertibleErrorCode())); + Seg.Addr, SegEnd, RR.Start, RR.End), + inconvertibleErrorCode()); char *Mem = Seg.Addr.toPtr<char *>(); if (!Seg.Content.empty()) memcpy(Mem, Seg.Content.data(), Seg.Content.size()); memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size()); assert(Seg.Size <= std::numeric_limits<size_t>::max()); + + sys::MemoryBlock MB(Mem, Seg.Size); if (auto EC = sys::Memory::protectMappedMemory( - {Mem, static_cast<size_t>(Seg.Size)}, - toSysMemoryProtectionFlags(Seg.RAG.Prot))) - return BailOut(errorCodeToError(EC)); + MB, toSysMemoryProtectionFlags(Seg.RAG.Prot))) + return errorCodeToError(EC); + + MBsToReset.push_back(MB); + if ((Seg.RAG.Prot & MemProt::Exec) == MemProt::Exec) sys::Memory::InvalidateInstructionCache(Mem, Seg.Size); } - // Run finalization actions. - for (auto &ActPair : FR.Actions) { - if (auto Err = ActPair.Finalize.runWithSPSRetErrorMerged()) - return BailOut(std::move(Err)); - ++SuccessfulFinalizationActions; + auto DeallocActions = runFinalizeActions(FR.Actions); + if (!DeallocActions) + return DeallocActions.takeError(); + + { + std::lock_guard<std::mutex> Lock(M); + auto Region = createRegionInfo(RR, "In initialize"); + if (!Region) + return Region.takeError(); + Region->DeallocActions = std::move(*DeallocActions); } - return Error::success(); + // Successful initialization. + ResetMBs.release(); + + return RR.Start; } -Error SimpleExecutorMemoryManager::deallocate( - const std::vector<ExecutorAddr> &Bases) { - std::vector<std::pair<void *, Allocation>> AllocPairs; - AllocPairs.reserve(Bases.size()); +Error SimpleExecutorMemoryManager::deinitialize( + const std::vector<ExecutorAddr> &InitKeys) { + Error Err = Error::success(); - // Get allocation to destroy. 
+ for (auto &KeyAddr : llvm::reverse(InitKeys)) { + std::vector<shared::WrapperFunctionCall> DeallocActions; + { + std::scoped_lock<std::mutex> Lock(M); + auto Slab = getSlabInfo(KeyAddr, "In deinitialize"); + if (!Slab) { + Err = joinErrors(std::move(Err), Slab.takeError()); + continue; + } + + auto RI = getRegionInfo(*Slab, KeyAddr, "In deinitialize"); + if (!RI) { + Err = joinErrors(std::move(Err), RI.takeError()); + continue; + } + + DeallocActions = std::move(RI->DeallocActions); + } + + Err = joinErrors(std::move(Err), + runDeallocActions(std::move(DeallocActions))); + } + + return Err; +} + +Error SimpleExecutorMemoryManager::release( + const std::vector<ExecutorAddr> &Bases) { Error Err = Error::success(); - { - std::lock_guard<std::mutex> Lock(M); - for (auto &Base : Bases) { - auto I = Allocations.find(Base.toPtr<void *>()); - - // Check for missing allocation (effective a double free). - if (I != Allocations.end()) { - AllocPairs.push_back(std::move(*I)); - Allocations.erase(I); - } else + + // TODO: Prohibit new initializations within the slabs being removed? + for (auto &Base : llvm::reverse(Bases)) { + std::vector<shared::WrapperFunctionCall> DeallocActions; + sys::MemoryBlock MB; + + { + std::scoped_lock<std::mutex> Lock(M); + + auto SlabI = Slabs.find(Base.toPtr<void *>()); + if (SlabI == Slabs.end()) { Err = joinErrors( std::move(Err), - make_error<StringError>("No allocation entry found " - "for " + - formatv("{0:x}", Base.getValue()), + make_error<StringError>("In release, " + formatv("{0:x}", Base) + + " is not part of any reserved " + "address range", inconvertibleErrorCode())); + continue; + } + + auto &Slab = SlabI->second; + + for (auto &[Addr, Region] : Slab.Regions) + llvm::copy(Region.DeallocActions, back_inserter(DeallocActions)); + + MB = {Base.toPtr<void *>(), Slab.Size}; + + Slabs.erase(SlabI); } - } - while (!AllocPairs.empty()) { - auto &P = AllocPairs.back(); - Err = joinErrors(std::move(Err), deallocateImpl(P.first, P.second)); - AllocPairs.pop_back(); + Err = joinErrors(std::move(Err), runDeallocActions(DeallocActions)); + if (auto EC = sys::Memory::releaseMappedMemory(MB)) + Err = joinErrors(std::move(Err), errorCodeToError(EC)); } return Err; @@ -185,16 +187,15 @@ Error SimpleExecutorMemoryManager::deallocate( Error SimpleExecutorMemoryManager::shutdown() { - AllocationsMap AM; + // TODO: Prevent new allocations during shutdown. 
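deinitialize() and release() above deliberately keep going after a failure, folding every error into one return value. A self-contained sketch of that LLVM Error idiom (processAll and the int items are illustrative; the shutdown() hunk continues below):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Error.h"
using namespace llvm;

// Accumulate-and-continue: each per-item failure is merged into a single
// Error via joinErrors; the result is success only if every item succeeded.
static Error processAll(ArrayRef<int> Items) {
  Error Err = Error::success();
  for (int I : Items)
    if (I < 0) // stand-in for a per-item operation that can fail
      Err = joinErrors(std::move(Err),
                       createStringError(inconvertibleErrorCode(),
                                         "item %d failed", I));
  return Err;
}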
+ std::vector<ExecutorAddr> Bases; { - std::lock_guard<std::mutex> Lock(M); - AM = std::move(Allocations); + std::scoped_lock<std::mutex> Lock(M); + for (auto &[Base, Slab] : Slabs) + Bases.push_back(ExecutorAddr::fromPtr(Base)); } - Error Err = Error::success(); - for (auto &KV : AM) - Err = joinErrors(std::move(Err), deallocateImpl(KV.first, KV.second)); - return Err; + return release(Bases); } void SimpleExecutorMemoryManager::addBootstrapSymbols( @@ -202,58 +203,150 @@ void SimpleExecutorMemoryManager::addBootstrapSymbols( M[rt::SimpleExecutorMemoryManagerInstanceName] = ExecutorAddr::fromPtr(this); M[rt::SimpleExecutorMemoryManagerReserveWrapperName] = ExecutorAddr::fromPtr(&reserveWrapper); - M[rt::SimpleExecutorMemoryManagerFinalizeWrapperName] = - ExecutorAddr::fromPtr(&finalizeWrapper); - M[rt::SimpleExecutorMemoryManagerDeallocateWrapperName] = - ExecutorAddr::fromPtr(&deallocateWrapper); + M[rt::SimpleExecutorMemoryManagerInitializeWrapperName] = + ExecutorAddr::fromPtr(&initializeWrapper); + M[rt::SimpleExecutorMemoryManagerDeinitializeWrapperName] = + ExecutorAddr::fromPtr(&deinitializeWrapper); + M[rt::SimpleExecutorMemoryManagerReleaseWrapperName] = + ExecutorAddr::fromPtr(&releaseWrapper); } -Error SimpleExecutorMemoryManager::deallocateImpl(void *Base, Allocation &A) { - Error Err = Error::success(); +Expected<SimpleExecutorMemoryManager::SlabInfo &> +SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddr A, StringRef Context) { + auto MakeBadSlabError = [&]() { + return make_error<StringError>( + Context + ", address " + formatv("{0:x}", A) + + " is not part of any reserved address range", + inconvertibleErrorCode()); + }; - while (!A.DeallocationActions.empty()) { - Err = joinErrors(std::move(Err), - A.DeallocationActions.back().runWithSPSRetErrorMerged()); - A.DeallocationActions.pop_back(); + auto I = Slabs.upper_bound(A.toPtr<void *>()); + if (I == Slabs.begin()) + return MakeBadSlabError(); + --I; + if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size) + .contains(A)) + return MakeBadSlabError(); + + return I->second; +} + +Expected<SimpleExecutorMemoryManager::SlabInfo &> +SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddrRange R, + StringRef Context) { + auto MakeBadSlabError = [&]() { + return make_error<StringError>( + Context + ", range " + formatv("{0:x}", R) + + " is not part of any reserved address range", + inconvertibleErrorCode()); + }; + + auto I = Slabs.upper_bound(R.Start.toPtr<void *>()); + if (I == Slabs.begin()) + return MakeBadSlabError(); + --I; + if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size) + .contains(R)) + return MakeBadSlabError(); + + return I->second; +} + +Expected<SimpleExecutorMemoryManager::RegionInfo &> +SimpleExecutorMemoryManager::createRegionInfo(ExecutorAddrRange R, + StringRef Context) { + + auto Slab = getSlabInfo(R, Context); + if (!Slab) + return Slab.takeError(); + + auto MakeBadRegionError = [&](ExecutorAddrRange Other, bool Prev) { + return make_error<StringError>(Context + ", region " + formatv("{0:x}", R) + + " overlaps " + + (Prev ? 
"previous" : "following") + + " region " + formatv("{0:x}", Other), + inconvertibleErrorCode()); + }; + + auto I = Slab->Regions.upper_bound(R.Start); + if (I != Slab->Regions.begin()) { + auto J = std::prev(I); + ExecutorAddrRange PrevRange(J->first, J->second.Size); + if (PrevRange.overlaps(R)) + return MakeBadRegionError(PrevRange, true); + } + if (I != Slab->Regions.end()) { + ExecutorAddrRange NextRange(I->first, I->second.Size); + if (NextRange.overlaps(R)) + return MakeBadRegionError(NextRange, false); } - sys::MemoryBlock MB(Base, A.Size); - if (auto EC = sys::Memory::releaseMappedMemory(MB)) - Err = joinErrors(std::move(Err), errorCodeToError(EC)); + auto &RInfo = Slab->Regions[R.Start]; + RInfo.Size = R.size(); + return RInfo; +} - return Err; +Expected<SimpleExecutorMemoryManager::RegionInfo &> +SimpleExecutorMemoryManager::getRegionInfo(SlabInfo &Slab, ExecutorAddr A, + StringRef Context) { + auto I = Slab.Regions.find(A); + if (I == Slab.Regions.end()) + return make_error<StringError>( + Context + ", address " + formatv("{0:x}", A) + + " does not correspond to the start of any initialized region", + inconvertibleErrorCode()); + + return I->second; +} + +Expected<SimpleExecutorMemoryManager::RegionInfo &> +SimpleExecutorMemoryManager::getRegionInfo(ExecutorAddr A, StringRef Context) { + auto Slab = getSlabInfo(A, Context); + if (!Slab) + return Slab.takeError(); + + return getRegionInfo(*Slab, A, Context); } llvm::orc::shared::CWrapperFunctionResult SimpleExecutorMemoryManager::reserveWrapper(const char *ArgData, size_t ArgSize) { - return shared::WrapperFunction< - rt::SPSSimpleExecutorMemoryManagerReserveSignature>:: + return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReserveSignature>:: handle(ArgData, ArgSize, shared::makeMethodWrapperHandler( - &SimpleExecutorMemoryManager::allocate)) + &SimpleExecutorMemoryManager::reserve)) + .release(); +} + +llvm::orc::shared::CWrapperFunctionResult +SimpleExecutorMemoryManager::initializeWrapper(const char *ArgData, + size_t ArgSize) { + return shared:: + WrapperFunction<rt::SPSSimpleRemoteMemoryMapInitializeSignature>::handle( + ArgData, ArgSize, + shared::makeMethodWrapperHandler( + &SimpleExecutorMemoryManager::initialize)) .release(); } llvm::orc::shared::CWrapperFunctionResult -SimpleExecutorMemoryManager::finalizeWrapper(const char *ArgData, - size_t ArgSize) { +SimpleExecutorMemoryManager::deinitializeWrapper(const char *ArgData, + size_t ArgSize) { return shared::WrapperFunction< - rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>:: + rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>:: handle(ArgData, ArgSize, shared::makeMethodWrapperHandler( - &SimpleExecutorMemoryManager::finalize)) + &SimpleExecutorMemoryManager::deinitialize)) .release(); } llvm::orc::shared::CWrapperFunctionResult -SimpleExecutorMemoryManager::deallocateWrapper(const char *ArgData, - size_t ArgSize) { - return shared::WrapperFunction< - rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>:: +SimpleExecutorMemoryManager::releaseWrapper(const char *ArgData, + size_t ArgSize) { + return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReleaseSignature>:: handle(ArgData, ArgSize, shared::makeMethodWrapperHandler( - &SimpleExecutorMemoryManager::deallocate)) + &SimpleExecutorMemoryManager::release)) .release(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index be2f2e4..91c1f59 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1561,6 +1561,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); @@ -1717,6 +1718,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom); setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); @@ -7775,6 +7777,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::VECREDUCE_FMAXIMUM: case ISD::VECREDUCE_FMINIMUM: return LowerVECREDUCE(Op, DAG); + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_FMUL: + return LowerVECREDUCE_MUL(Op, DAG); case ISD::ATOMIC_LOAD_AND: return LowerATOMIC_LOAD_AND(Op, DAG); case ISD::DYNAMIC_STACKALLOC: @@ -16794,6 +16799,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, } } +SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + assert(SrcVT.isScalableVector() && "Unexpected operand type!"); + + SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT); + unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode()); + SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags()); + + // Whilst we don't know the size of the vector we do know the maximum size so + // we can perform a tree reduction with an identity vector, which means once we + // arrive at the result the remaining stages (when the vector is smaller than + // the maximum) have no effect.
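A scalar model of the reduction LowerVECREDUCE_MUL (continued below) builds: pad to the maximum width with the identity, then combine even/odd lane pairs for log2(max) stages. The real lowering keeps a fixed-width scalable vector and feeds the identity in through VECTOR_DEINTERLEAVE; this sketch shrinks a buffer instead, but pairs elements the same way:

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Identity-padded tree reduction; stages past the live data multiply by 1,
// so the padding never changes the result.
int reduceMulModel(std::vector<int> Vals, size_t MaxElts, int Identity = 1) {
  assert(MaxElts && (MaxElts & (MaxElts - 1)) == 0 && "power-of-two width");
  Vals.resize(MaxElts, Identity);
  while (Vals.size() > 1) {
    std::vector<int> Next(Vals.size() / 2);
    for (size_t I = 0; I < Next.size(); ++I)
      Next[I] = Vals[2 * I] * Vals[2 * I + 1]; // even lane * odd lane
    Vals = std::move(Next);
  }
  return Vals.front();
}

For instance, reduceMulModel({2, 3, 4}, 8) runs three stages and returns 24.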
+ + unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; + unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements()); + + for (unsigned I = 0; I < Stages; ++I) { + Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity); + Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1)); + } + + return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0); +} + SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 00956fd..9495c9f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -752,6 +752,7 @@ private: SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index 7322212..fe84193 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction { let hasSideEffects = 0; } +def G_USDOT : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3); + let hasSideEffects = 0; +} + // Generic instruction for the BSP pseudo. It is expanded into BSP, which // expands into BSL/BIT/BIF after register allocation. 
def G_BSP : AArch64GenericInstruction { @@ -278,6 +284,7 @@ def : GINodeEquiv<G_UADDLV, AArch64uaddlv>; def : GINodeEquiv<G_UDOT, AArch64udot>; def : GINodeEquiv<G_SDOT, AArch64sdot>; +def : GINodeEquiv<G_USDOT, AArch64usdot>; def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 9e2d698..05a4313 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerTriOp(AArch64::G_UDOT); case Intrinsic::aarch64_neon_sdot: return LowerTriOp(AArch64::G_SDOT); + case Intrinsic::aarch64_neon_usdot: + return LowerTriOp(AArch64::G_USDOT); case Intrinsic::aarch64_neon_sqxtn: return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S); case Intrinsic::aarch64_neon_sqxtun: diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index be62395..e0375ea 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -513,8 +513,7 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI, } if (Imm == AMDGPU::EncValues::LITERAL_CONST) { - Op = decodeLiteralConstant( - Desc, OpDesc, OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64); + Op = decodeLiteralConstant(Desc, OpDesc); continue; } @@ -1545,21 +1544,21 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const { if (HasLiteral) { - if (Literal64 != Val) + if (Literal != Val) return errOperand(Val, "More than one unique literal is illegal"); } HasLiteral = true; - Literal = Literal64 = Val; + Literal = Val; - bool UseLit64 = Hi_32(Literal64) == 0; + bool UseLit64 = Hi_32(Literal) == 0; return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( - LitModifier::Lit64, Literal64, getContext())) - : MCOperand::createImm(Literal64); + LitModifier::Lit64, Literal, getContext())) + : MCOperand::createImm(Literal); } -MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, - const MCOperandInfo &OpDesc, - bool ExtendFP64) const { +MCOperand +AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, + const MCOperandInfo &OpDesc) const { // For now all literal constants are supposed to be unsigned integer // ToDo: deal with signed/unsigned 64-bit integer constants // ToDo: deal with float/double constants @@ -1569,35 +1568,79 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, Twine(Bytes.size())); } HasLiteral = true; - Literal = Literal64 = eatBytes<uint32_t>(Bytes); - if (ExtendFP64) - Literal64 <<= 32; + Literal = eatBytes<uint32_t>(Bytes); } - int64_t Val = ExtendFP64 ? Literal64 : Literal; + // For disassembling always assume all inline constants are available. 
+ bool HasInv2Pi = true; - bool CanUse64BitLiterals = - STI.hasFeature(AMDGPU::Feature64BitLiterals) && - !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); - - bool UseLit64 = false; - if (CanUse64BitLiterals) { - if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) - UseLit64 = false; - else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64) - UseLit64 = Hi_32(Literal64) == 0; + // Invalid instruction codes may contain literals for inline-only + // operands, so we support them here as well. + int64_t Val = Literal; + bool UseLit = false; + switch (OpDesc.OperandType) { + default: + llvm_unreachable("Unexpected operand type!"); + case AMDGPU::OPERAND_REG_IMM_BF16: + case AMDGPU::OPERAND_REG_INLINE_C_BF16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: + UseLit = AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi); + break; + case AMDGPU::OPERAND_REG_IMM_V2BF16: + UseLit = AMDGPU::isInlinableLiteralV2BF16(Val); + break; + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + UseLit = AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi); + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: + UseLit = AMDGPU::isInlinableLiteralV2F16(Val); + break; + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + break; + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + UseLit = AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi); + break; + case AMDGPU::OPERAND_REG_IMM_V2INT16: + UseLit = AMDGPU::isInlinableLiteralV2I16(Val); + break; + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_KIMM32: + UseLit = AMDGPU::isInlinableLiteral32(Val, HasInv2Pi); + break; + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + Val <<= 32; + break; + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + UseLit = AMDGPU::isInlinableLiteral64(Val, HasInv2Pi); + break; + case MCOI::OPERAND_REGISTER: + // TODO: Disassembling V_DUAL_FMAMK_F32_X_FMAMK_F32_gfx11 hits + // decoding a literal in a position of a register operand. Give + // it special handling in the caller, decodeImmOperands(), instead + // of quietly allowing it here. + break; } - return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( - LitModifier::Lit64, Val, getContext())) - : MCOperand::createImm(Val); + return UseLit ? 
MCOperand::createExpr(AMDGPUMCExpr::createLit( + LitModifier::Lit, Val, getContext())) + : MCOperand::createImm(Val); } -MCOperand -AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const { +MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const { assert(STI.hasFeature(AMDGPU::Feature64BitLiterals)); if (!HasLiteral) { @@ -1606,25 +1649,13 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const { Twine(Bytes.size())); } HasLiteral = true; - Literal64 = eatBytes<uint64_t>(Bytes); - } - - bool UseLit64 = false; - const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); - const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()]; - if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) { - UseLit64 = false; - } else { - assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || - OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64); - UseLit64 = Hi_32(Literal64) == 0; + Literal = eatBytes<uint64_t>(Bytes); } + bool UseLit64 = Hi_32(Literal) == 0; return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( - LitModifier::Lit64, Literal64, getContext())) - : MCOperand::createImm(Literal64); + LitModifier::Lit64, Literal, getContext())) + : MCOperand::createImm(Literal); } MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { @@ -1913,7 +1944,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const MCInst &Inst, return MCOperand::createImm(Val); if (Val == LITERAL64_CONST && STI.hasFeature(AMDGPU::Feature64BitLiterals)) { - return decodeLiteral64Constant(Inst); + return decodeLiteral64Constant(); } switch (Width) { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 2751857..d103d79 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -44,8 +44,7 @@ private: const unsigned HwModeRegClass; const unsigned TargetMaxInstBytes; mutable ArrayRef<uint8_t> Bytes; - mutable uint32_t Literal; - mutable uint64_t Literal64; + mutable uint64_t Literal; mutable bool HasLiteral; mutable std::optional<bool> EnableWavefrontSize32; unsigned CodeObjectVersion; @@ -144,9 +143,8 @@ public: MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const; MCOperand decodeMandatoryLiteral64Constant(uint64_t Imm) const; MCOperand decodeLiteralConstant(const MCInstrDesc &Desc, - const MCOperandInfo &OpDesc, - bool ExtendFP64) const; - MCOperand decodeLiteral64Constant(const MCInst &Inst) const; + const MCOperandInfo &OpDesc) const; + MCOperand decodeLiteral64Constant() const; MCOperand decodeSrcOp(const MCInst &Inst, unsigned Width, unsigned Val) const; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 228114c..44c4830 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -57,6 +57,7 @@ def ResBindTy : DXILOpParamType; def ResPropsTy : DXILOpParamType; def SplitDoubleTy : DXILOpParamType; def BinaryWithCarryTy : DXILOpParamType; +def DimensionsTy : DXILOpParamType; class DXILOpClass; @@ -901,6 +902,13 @@ def CheckAccessFullyMapped : DXILOp<71, checkAccessFullyMapped> { let attributes = [Attributes<DXIL1_0, [ReadOnly]>]; } +def GetDimensions : DXILOp<72, getDimensions> { + let Doc = "gets the dimensions of a buffer or texture"; + let arguments = [HandleTy, Int32Ty]; + let result = DimensionsTy; + let 
stages = [Stages<DXIL1_0, [all_stages]>]; +} + def Barrier : DXILOp<80, barrier> { let Doc = "inserts a memory barrier in the shader"; let intrinsics = [ diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp index 1aed8f9..944b2e6 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp @@ -261,6 +261,12 @@ static StructType *getBinaryWithCarryType(LLVMContext &Context) { return StructType::create({Int32Ty, Int1Ty}, "dx.types.i32c"); } +static StructType *getDimensionsType(LLVMContext &Ctx) { + Type *Int32Ty = Type::getInt32Ty(Ctx); + return getOrCreateStructType("dx.types.Dimensions", + {Int32Ty, Int32Ty, Int32Ty, Int32Ty}, Ctx); +} + static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx, Type *OverloadTy) { switch (Kind) { @@ -318,6 +324,8 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx, return getSplitDoubleType(Ctx); case OpParamType::BinaryWithCarryTy: return getBinaryWithCarryType(Ctx); + case OpParamType::DimensionsTy: + return getDimensionsType(Ctx); } llvm_unreachable("Invalid parameter kind"); return nullptr; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 610d8b6..e46a393 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -627,6 +627,28 @@ public: }); } + [[nodiscard]] bool lowerGetDimensionsX(Function &F) { + IRBuilder<> &IRB = OpBuilder.getIRB(); + Type *Int32Ty = IRB.getInt32Ty(); + + return replaceFunction(F, [&](CallInst *CI) -> Error { + IRB.SetInsertPoint(CI); + Value *Handle = + createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType()); + Value *Undef = UndefValue::get(Int32Ty); + + Expected<CallInst *> OpCall = OpBuilder.tryCreateOp( + OpCode::GetDimensions, {Handle, Undef}, CI->getName(), Int32Ty); + if (Error E = OpCall.takeError()) + return E; + Value *Dim = IRB.CreateExtractValue(*OpCall, 0); + + CI->replaceAllUsesWith(Dim); + CI->eraseFromParent(); + return Error::success(); + }); + } + [[nodiscard]] bool lowerGetPointer(Function &F) { // These should have already been handled in DXILResourceAccess, so we can // just clean up the dead prototype. @@ -934,6 +956,9 @@ public: case Intrinsic::dx_resource_updatecounter: HasErrors |= lowerUpdateCounter(F); break; + case Intrinsic::dx_resource_getdimensions_x: + HasErrors |= lowerGetDimensionsX(F); + break; case Intrinsic::ctpop: HasErrors |= lowerCtpopToCountBits(F); break; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 82c43ff..26a8728 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1165,12 +1165,15 @@ void DXILBitcodeWriter::writeValueSymbolTableForwardDecl() {} /// Returns the bit offset to backpatch with the location of the real VST. void DXILBitcodeWriter::writeModuleInfo() { // Emit various pieces of data attached to a module. - if (!M.getTargetTriple().empty()) - writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, - M.getTargetTriple().str(), 0 /*TODO*/); - const std::string &DL = M.getDataLayoutStr(); - if (!DL.empty()) - writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/); + + // We need to hardcode a triple and datalayout that's compatible with the + // historical DXIL triple and datalayout from DXC. 
+ StringRef Triple = "dxil-ms-dx"; + StringRef DL = "e-m:e-p:32:32-i1:8-i8:8-i16:32-i32:32-i64:64-" + "f16:32-f32:32-f64:64-n8:16:32:64"; + writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, Triple, 0 /*TODO*/); + writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/); + if (!M.getModuleInlineAsm().empty()) writeStringRecord(Stream, bitc::MODULE_CODE_ASM, M.getModuleInlineAsm(), 0 /*TODO*/); diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp index 1eb03bf..725f2b1 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp @@ -149,11 +149,6 @@ public: std::string Data; llvm::raw_string_ostream OS(Data); - Triple OriginalTriple = M.getTargetTriple(); - // Set to DXIL triple when write to bitcode. - // Only the output bitcode need to be DXIL triple. - M.setTargetTriple(Triple("dxil-ms-dx")); - // Perform late legalization of lifetime intrinsics that would otherwise // fail the Module Verifier if performed in an earlier pass legalizeLifetimeIntrinsics(M); @@ -165,9 +160,6 @@ public: // not-so-legal legalizations removeLifetimeIntrinsics(M); - // Recover triple. - M.setTargetTriple(OriginalTriple); - Constant *ModuleConstant = ConstantDataArray::get(M.getContext(), arrayRefFromStringRef(Data)); auto *GV = new llvm::GlobalVariable(M, ModuleConstant->getType(), true, diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 5f180d6..3bd6ed4 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -66,6 +66,10 @@ public: void remapInstruction(MCInst &Instr) const; + Expected<bool> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address) const override; + private: bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address, uint64_t &BytesToSkip, raw_ostream &CS) const; @@ -567,6 +571,18 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, return Result; } +Expected<bool> HexagonDisassembler::onSymbolStart(SymbolInfoTy &Symbol, + uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address) const { + // At the start of a symbol, force a fresh packet by resetting any + // in-progress bundle state. This prevents packets from straddling label + // boundaries when data (e.g. jump tables) appears in between. 
+ Size = 0; + resetBundle(); + return true; +} + static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, ArrayRef<MCPhysReg> Table) { if (RegNo < Table.size()) { diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 662d3f6..b1794b7 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -717,6 +717,18 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) .clampScalar(0, sXLen, sXLen) .lower(); + LegalityPredicate InsertVectorEltPred = [=](const LegalityQuery &Query) { + LLT VecTy = Query.Types[0]; + LLT EltTy = Query.Types[1]; + return VecTy.getElementType() == EltTy; + }; + + getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) + .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST), + InsertVectorEltPred, typeIs(2, sXLen))) + .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), InsertVectorEltPred, + typeIs(2, sXLen))); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 0f6e1ca..eedfdb3 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1771,6 +1771,10 @@ defm RELAXED_DOT_ADD : "i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc", "i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>; +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs), + (v16i8 V128:$rhs))), + (RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>; + //===----------------------------------------------------------------------===// // Relaxed BFloat16 dot product //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 0fd44b7..ec31675 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1256,8 +1256,17 @@ def : Pat<(i64 (X86Wrapper tconstpool :$dst)), (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>; def : Pat<(i64 (X86Wrapper tjumptable :$dst)), (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>; -def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), - (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; + +// If the globaladdr is an absolute_symbol, don't bother using the sign extending +// instruction since there's no benefit to using it with absolute symbols. 
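For context on the predicate in the hunk below: absolute symbols carry !absolute_symbol range metadata, which getAbsoluteSymbolRange() reports. A hedged sketch of how such a global is typically marked (the [0, 2^32) range is an assumed example, not taken from this patch):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

// Attach !absolute_symbol to a declaration; globalAddrNoAbsSym below will
// then refuse to match it, so it is not forced through MOV64ri32.
static void markAbsolute(GlobalVariable &GV) {
  MDBuilder MDB(GV.getContext());
  GV.setMetadata(LLVMContext::MD_absolute_symbol,
                 MDB.createRange(APInt(64, 0), APInt(64, 1ULL << 32)));
}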
+def globalAddrNoAbsSym : PatLeaf<(tglobaladdr:$dst), [{ + auto *GA = cast<GlobalAddressSDNode>(N); + return !GA->getGlobal()->getAbsoluteSymbolRange(); +}]>; +def : Pat<(i64 (X86Wrapper globalAddrNoAbsSym:$dst)), + (MOV64ri32 tglobaladdr:$dst)>, + Requires<[KernelCode]>; + def : Pat<(i64 (X86Wrapper texternalsym:$dst)), (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; def : Pat<(i64 (X86Wrapper mcsym:$dst)), diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 481a9be..1fca466f 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1946,16 +1946,14 @@ static void addConstantComments(const MachineInstr *MI, CASE_ARITH_RM(PMADDUBSW) { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - if (C->getType()->getScalarSizeInBits() == 8) { - std::string Comment; - raw_string_ostream CS(Comment); - unsigned VectorWidth = - X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); - CS << "["; - printConstant(C, VectorWidth, CS); - CS << "]"; - OutStreamer.AddComment(CS.str()); - } + std::string Comment; + raw_string_ostream CS(Comment); + unsigned VectorWidth = + X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); + CS << "["; + printConstant(C, VectorWidth, CS); + CS << "]"; + OutStreamer.AddComment(CS.str()); } break; } @@ -1967,16 +1965,14 @@ static void addConstantComments(const MachineInstr *MI, CASE_ARITH_RM(PMULHRSW) { unsigned SrcIdx = getSrcIdx(MI, 1); if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { - if (C->getType()->getScalarSizeInBits() == 16) { - std::string Comment; - raw_string_ostream CS(Comment); - unsigned VectorWidth = - X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); - CS << "["; - printConstant(C, VectorWidth, CS); - CS << "]"; - OutStreamer.AddComment(CS.str()); - } + std::string Comment; + raw_string_ostream CS(Comment); + unsigned VectorWidth = + X86::getVectorRegisterWidth(MI->getDesc().operands()[0]); + CS << "["; + printConstant(C, VectorWidth, CS); + CS << "]"; + OutStreamer.AddComment(CS.str()); } break; } diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp index 950bb2b..d765d9c 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -548,8 +548,11 @@ std::string Triple::computeDataLayout(StringRef ABIName) const { case Triple::csky: return computeCSKYDataLayout(*this); case Triple::dxil: + // TODO: We need to align vectors on the element size generally, but for now + // we hard code this for 3-element 32- and 64-bit vectors as a workaround. 
+    // See https://github.com/llvm/llvm-project/issues/123968
     return "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-"
-           "f32:32-f64:64-n8:16:32:64";
+           "f32:32-f64:64-n8:16:32:64-v48:16:16-v96:32:32-v192:64:64";
   case Triple::hexagon:
     return "e-m:e-p:32:32:32-a:0-n16:32-"
            "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 4c9b10a..cdc559b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -156,9 +156,9 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
   Value *Src = CI.getOperand(0);
   Type *Ty = CI.getType();
 
-  if (auto *SrcC = dyn_cast<Constant>(Src))
-    if (Constant *Res = ConstantFoldCastOperand(CI.getOpcode(), SrcC, Ty, DL))
-      return replaceInstUsesWith(CI, Res);
+  if (Value *Res =
+          simplifyCastInst(CI.getOpcode(), Src, Ty, SQ.getWithInstruction(&CI)))
+    return replaceInstUsesWith(CI, Res);
 
   // Try to eliminate a cast of a cast.
   if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index d99f1eb..ddb99a5 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -75,8 +75,6 @@ static cl::opt<bool>
                      "expressed as branches by widenable conditions"),
            cl::init(true));
 
-namespace {
-
 // Get the condition of \p I. It can either be a guard or a conditional branch.
 static Value *getCondition(Instruction *I) {
   if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
@@ -130,6 +128,8 @@
 findInsertionPointForWideCondition(Instruction *WCOrGuard) {
   return std::nullopt;
 }
 
+namespace {
+
 class GuardWideningImpl {
   DominatorTree &DT;
   PostDominatorTree *PDT;
@@ -328,7 +328,7 @@ public:
   /// The entry point for this pass.
   bool run();
 };
-}
+} // namespace
 
 static bool isSupportedGuardInstruction(const Instruction *Insn) {
   if (isGuard(Insn))
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 3c14036e..6fb8197 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -26,8 +26,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-
 static cl::opt<unsigned>
     JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
                            cl::desc("Only split jump tables with size less or "
@@ -43,8 +41,8 @@ static cl::opt<unsigned> FunctionSizeThreshold(
                  "or equal than this threshold."),
     cl::init(50));
 
+namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
 } // end namespace llvm
 
 #define DEBUG_TYPE "jump-table-to-switch"
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9655173..b2c526b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -116,8 +116,6 @@ STATISTIC(NumIntAssociationsHoisted,
 STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
                                     "reassociated and hoisted out of the loop");
 
-namespace llvm {
-
 /// Memory promotion is enabled by default.
 static cl::opt<bool>
     DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
@@ -156,7 +154,7 @@ static cl::opt<unsigned> IntAssociationUpperLimit(
 // which may not be precise, since optimizeUses is capped. The result is
 // correct, but we may not get as "far up" as possible to get which access is
 // clobbering the one queried.
-cl::opt<unsigned> SetLicmMssaOptCap(
+cl::opt<unsigned> llvm::SetLicmMssaOptCap(
     "licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
     cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
             "for faster compile. Caps the MemorySSA clobbering calls."));
@@ -164,15 +162,15 @@ cl::opt<unsigned> SetLicmMssaOptCap(
 // Experimentally, memory promotion carries less importance than sinking and
 // hoisting. Limit when we do promotion when using MemorySSA, in order to save
 // compile time.
-cl::opt<unsigned> SetLicmMssaNoAccForPromotionCap(
+cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
     "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
     cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
              "effect. When MSSA in LICM is enabled, then this is the maximum "
              "number of accesses allowed to be present in a loop in order to "
             "enable memory promotion."));
 
+namespace llvm {
 extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
 } // end namespace llvm
 
 static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
@@ -1120,11 +1118,10 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
   return false;
 }
 
-namespace {
 /// Return true if-and-only-if we know how to (mechanically) both hoist and
 /// sink a given instruction out of a loop. Does not address legality
 /// concerns such as aliasing or speculation safety.
-bool isHoistableAndSinkableInst(Instruction &I) {
+static bool isHoistableAndSinkableInst(Instruction &I) {
   // Only these instructions are hoistable/sinkable.
   return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
           isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
@@ -1136,8 +1133,8 @@ bool isHoistableAndSinkableInst(Instruction &I) {
 }
 
 /// Return true if I is the only Instruction with a MemoryAccess in L.
-bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
-                        const MemorySSAUpdater &MSSAU) {
+static bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
+                               const MemorySSAUpdater &MSSAU) {
   for (auto *BB : L->getBlocks())
     if (auto *Accs = MSSAU.getMemorySSA()->getBlockAccesses(BB)) {
       int NotAPhi = 0;
@@ -1151,7 +1148,6 @@ bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
     }
   return true;
 }
-}
 
 static MemoryAccess *getClobberingMemoryAccess(MemorySSA &MSSA,
                                                BatchAAResults &BAA,
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index 73f1942..7706de8 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -21,8 +21,7 @@
 
 #define DEBUG_TYPE "loop-bound-split"
 
-namespace llvm {
-
+using namespace llvm;
 using namespace PatternMatch;
 
 namespace {
@@ -358,8 +357,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
   IRBuilder<> Builder(&PostLoopPreHeader->front());
 
   // Update phi nodes in header of post-loop.
-  bool isExitingLatch =
-      (L.getExitingBlock() == L.getLoopLatch()) ? true : false;
+  bool isExitingLatch = L.getExitingBlock() == L.getLoopLatch();
   Value *ExitingCondLCSSAPhi = nullptr;
   for (PHINode &PN : L.getHeader()->phis()) {
     // Create LCSSA phi node in preheader of post-loop.
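The LICM and JumpTableToSwitch hunks above apply the same cleanup: instead of
reopening "namespace llvm" around a definition, a variable declared in the
namespace is defined once with a qualified name. A minimal self-contained
sketch of the idiom, using a hypothetical WidgetThreshold option that is not
part of this patch:

  #include "llvm/Support/CommandLine.h"

  namespace llvm {
  // Declaration, as it would appear in a header or at the top of the file.
  extern cl::opt<unsigned> WidgetThreshold;
  } // end namespace llvm

  // Definition with a qualified name instead of a reopened namespace. The
  // compiler verifies that a matching declaration exists, so a typo here is
  // a hard error rather than a silently introduced second global.
  cl::opt<unsigned> llvm::WidgetThreshold(
      "widget-threshold", cl::Hidden, cl::init(50),
      cl::desc("Hypothetical threshold, for illustration only."));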
@@ -472,8 +470,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
 PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
                                           LoopStandardAnalysisResults &AR,
                                           LPMUpdater &U) {
-  Function &F = *L.getHeader()->getParent();
-  (void)F;
+  [[maybe_unused]] Function &F = *L.getHeader()->getParent();
   LLVM_DEBUG(dbgs() << "Spliting bound of loop in " << F.getName() << ": " << L
                     << "\n");
 
@@ -486,5 +483,3 @@ PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
 
   return getLoopPassPreservedAnalyses();
 }
-
-} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 62a81ba..280eb20 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7957,9 +7957,9 @@ bool VPRecipeBuilder::getScaledReductions(
   auto CollectExtInfo = [this, &Exts, &ExtOpTypes,
                          &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool {
     for (const auto &[I, OpI] : enumerate(Ops)) {
-      auto *CI = dyn_cast<ConstantInt>(OpI);
-      if (I > 0 && CI &&
-          canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) {
+      const APInt *C;
+      if (I > 0 && match(OpI, m_APInt(C)) &&
+          canConstantBeExtended(C, ExtOpTypes[0], ExtKinds[0])) {
         ExtOpTypes[I] = ExtOpTypes[0];
         ExtKinds[I] = ExtKinds[0];
         continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 0101942..d167009 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1753,14 +1753,14 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
 }
 #endif
 
-bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType,
                                  TTI::PartialReductionExtendKind ExtKind) {
-  APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits());
-  unsigned WideSize = CI->getType()->getScalarSizeInBits();
+  APInt TruncatedVal = C->trunc(NarrowType->getScalarSizeInBits());
+  unsigned WideSize = C->getBitWidth();
   APInt ExtendedVal = ExtKind == TTI::PR_SignExtend
                           ? TruncatedVal.sext(WideSize)
                           : TruncatedVal.zext(WideSize);
-  return ExtendedVal == CI->getValue();
+  return ExtendedVal == *C;
 }
 
 TargetTransformInfo::OperandValueInfo
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3bcd7cc..0e0b042 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4267,12 +4267,14 @@ public:
     BackedgeTakenCount = new VPValue();
     return BackedgeTakenCount;
   }
+  VPValue *getBackedgeTakenCount() const { return BackedgeTakenCount; }
 
   /// The vector trip count.
   VPValue &getVectorTripCount() { return VectorTripCount; }
 
   /// Returns the VF of the vector loop region.
   VPValue &getVF() { return VF; };
+  const VPValue &getVF() const { return VF; };
 
   /// Returns VF * UF of the vector loop region.
   VPValue &getVFxUF() { return VFxUF; }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 1580a3b..2aaabd9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -474,7 +474,7 @@ public:
 /// Check if a constant \p CI can be safely treated as having been extended
 /// from a narrower type with the given extension kind.
-bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+bool canConstantBeExtended(const APInt *C, Type *NarrowType,
                            TTI::PartialReductionExtendKind ExtKind);
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index ecd5e96..d8203e2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -173,10 +173,10 @@ inline int_pred_ty<is_zero_int> m_ZeroInt() {
 /// For vectors, this includes constants with undefined elements.
 inline int_pred_ty<is_one> m_One() { return int_pred_ty<is_one>(); }
 
-struct bind_const_int {
-  uint64_t &Res;
+struct bind_apint {
+  const APInt *&Res;
 
-  bind_const_int(uint64_t &Res) : Res(Res) {}
+  bind_apint(const APInt *&Res) : Res(Res) {}
 
   bool match(VPValue *VPV) const {
     if (!VPV->isLiveIn())
@@ -188,7 +188,23 @@
     const auto *CI = dyn_cast<ConstantInt>(V);
     if (!CI)
       return false;
-    if (auto C = CI->getValue().tryZExtValue()) {
+    Res = &CI->getValue();
+    return true;
+  }
+};
+
+inline bind_apint m_APInt(const APInt *&C) { return C; }
+
+struct bind_const_int {
+  uint64_t &Res;
+
+  bind_const_int(uint64_t &Res) : Res(Res) {}
+
+  bool match(VPValue *VPV) const {
+    const APInt *APConst;
+    if (!bind_apint(APConst).match(VPV))
+      return false;
+    if (auto C = APConst->tryZExtValue()) {
       Res = *C;
       return true;
     }
@@ -400,6 +416,12 @@ m_AnyOf(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::AnyOf>(Op0);
 }
 
+template <typename Op0_t>
+inline VPInstruction_match<VPInstruction::FirstActiveLane, Op0_t>
+m_FirstActiveLane(const Op0_t &Op0) {
+  return m_VPInstruction<VPInstruction::FirstActiveLane>(Op0);
+}
+
 template <unsigned Opcode, typename Op0_t>
 inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) {
   return AllRecipe_match<Opcode, Op0_t>(Op0);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 775837f..7a98c75 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -341,12 +341,12 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
     ExtAType = GetExtendKind(ExtAR);
     ExtBType = GetExtendKind(ExtBR);
 
-    if (!ExtBR && Widen->getOperand(1)->isLiveIn()) {
-      auto *CI = cast<ConstantInt>(Widen->getOperand(1)->getLiveInIRValue());
-      if (canConstantBeExtended(CI, InputTypeA, ExtAType)) {
-        InputTypeB = InputTypeA;
-        ExtBType = ExtAType;
-      }
+    using namespace VPlanPatternMatch;
+    const APInt *C;
+    if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) &&
+        canConstantBeExtended(C, InputTypeA, ExtAType)) {
+      InputTypeB = InputTypeA;
+      ExtBType = ExtAType;
     }
   };
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 011466b..7f5a41c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -788,9 +788,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                                ScalarEvolution &SE) {
   VPValue *Incoming, *Mask;
   if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
-                     m_VPInstruction<VPInstruction::FirstActiveLane>(
-                         m_VPValue(Mask)),
-                     m_VPValue(Incoming))))
+                     m_FirstActiveLane(m_VPValue(Mask)), m_VPValue(Incoming))))
     return nullptr;
 
   auto *WideIV = getOptimizableIVOf(Incoming, SE);
@@ -2124,9 +2122,13 @@ static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
 
   // Return true if we do not know how to (mechanically) hoist a given recipe
-  // out of a loop region. Does not address legality concerns such as aliasing
-  // or speculation safety.
+  // out of a loop region.
   auto CannotHoistRecipe = [](VPRecipeBase &R) {
+    // TODO: Relax checks in the future, e.g. we could also hoist reads, if
+    // their memory location is not modified in the vector loop.
+    if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
+      return true;
+
     // Allocas cannot be hoisted.
     auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
     return RepR && RepR->getOpcode() == Instruction::Alloca;
@@ -2134,17 +2136,18 @@ static void licm(VPlan &Plan) {
 
   // Hoist any loop invariant recipes from the vector loop region to the
   // preheader. Preform a shallow traversal of the vector loop region, to
-  // exclude recipes in replicate regions.
+  // exclude recipes in replicate regions. Since the top-level blocks in the
+  // vector loop region are guaranteed to execute if the vector pre-header is,
+  // we don't need to check speculation safety.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  assert(Preheader->getSingleSuccessor() == LoopRegion &&
+         "Expected vector preheader's successor to be the vector loop region");
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(LoopRegion->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       if (CannotHoistRecipe(R))
         continue;
-      // TODO: Relax checks in the future, e.g. we could also hoist reads, if
-      // their memory location is not modified in the vector loop.
-      if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
-          any_of(R.operands(), [](VPValue *Op) {
+      if (any_of(R.operands(), [](VPValue *Op) {
             return !Op->isDefinedOutsideLoopRegions();
           }))
         continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 86a8b08..5aeda3e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -352,8 +352,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     // Compute*Result which combine all parts to compute the final value.
     VPValue *Op1;
     if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
-        match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
-                      m_VPValue(Op1))) ||
+        match(&R, m_FirstActiveLane(m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
                       m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 7240188..8b1b0e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -53,7 +53,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
   return Expanded;
 }
 
-bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
+bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
   if (isa<VPActiveLaneMaskPHIRecipe>(V))
     return true;
 
@@ -74,7 +74,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
                    IsWideCanonicalIV(A));
 
   return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) &&
-         B == Plan.getOrCreateBackedgeTakenCount();
+         B == Plan.getBackedgeTakenCount();
 }
 
 const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 0222b0a..cf95ac0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -90,7 +90,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
 }
 
 /// Return true if \p V is a header mask in \p Plan.
-bool isHeaderMask(const VPValue *V, VPlan &Plan);
+bool isHeaderMask(const VPValue *V, const VPlan &Plan);
 
 /// Checks if \p V is uniform across all VF lanes and UF parts. It is considered
 /// as such if it is either loop invariant (defined outside the vector region)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 5262af6..91734a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -298,11 +298,16 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
         return false;
       }
     }
-    if (const auto *EVL = dyn_cast<VPInstruction>(&R)) {
-      if (EVL->getOpcode() == VPInstruction::ExplicitVectorLength &&
-          !verifyEVLRecipe(*EVL)) {
-        errs() << "EVL VPValue is not used correctly\n";
-        return false;
+    if (const auto *VPI = dyn_cast<VPInstruction>(&R)) {
+      switch (VPI->getOpcode()) {
+      case VPInstruction::ExplicitVectorLength:
+        if (!verifyEVLRecipe(*VPI)) {
+          errs() << "EVL VPValue is not used correctly\n";
+          return false;
+        }
+        break;
+      default:
+        break;
       }
     }
   }
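As a usage note for the new m_APInt binder added to VPlanPatternMatch.h above:
it matches any live-in ConstantInt regardless of bit width and binds a pointer
to the underlying APInt, whereas the old uint64_t-based bind_const_int gave up
on constants wider than 64 bits. A minimal sketch of a caller, assuming the
definitions from this diff are in scope (isLiveInPowerOfTwo is a hypothetical
helper, not part of the patch):

  // Sketch only; relies on the VPlanPatternMatch.h additions shown above.
  using namespace llvm;
  using namespace llvm::VPlanPatternMatch;

  // Hypothetical helper: true if Op is a live-in integer constant whose
  // value is a power of two, at any bit width.
  static bool isLiveInPowerOfTwo(VPValue *Op) {
    const APInt *C;
    return match(Op, m_APInt(C)) && C->isPowerOf2();
  }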