Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Analysis/MemoryProfileInfo.cpp                 |  41
-rw-r--r-- | llvm/lib/CodeGen/RegAllocBasic.cpp                      |  97
-rw-r--r-- | llvm/lib/CodeGen/RegAllocBasic.h                        | 104
-rw-r--r-- | llvm/lib/MC/MCAssembler.cpp                             |  22
-rw-r--r-- | llvm/lib/MC/MCFragment.cpp                              |   7
-rw-r--r-- | llvm/lib/MC/MCMachOStreamer.cpp                         |   2
-rw-r--r-- | llvm/lib/MC/MCObjectStreamer.cpp                        |  10
-rw-r--r-- | llvm/lib/MC/MCPseudoProbe.cpp                           |   5
-rw-r--r-- | llvm/lib/MC/WasmObjectWriter.cpp                        |  25
-rw-r--r-- | llvm/lib/Support/CMakeLists.txt                         |   4
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.td             |  27
-rw-r--r-- | llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp  |   8
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXInstrInfo.td                 |   1
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXIntrinsics.td                | 171
-rw-r--r-- | llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp      |   2
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp |  99
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp       |   7
17 files changed, 436 insertions, 196 deletions
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index c08024a..b3c8a7d 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -157,6 +157,8 @@ void CallStackTrie::addCallStack( } void CallStackTrie::addCallStack(MDNode *MIB) { + // Note that we are building this from existing MD_memprof metadata. + BuiltFromExistingMetadata = true; MDNode *StackMD = getMIBStackNode(MIB); assert(StackMD); std::vector<uint64_t> CallStack; @@ -187,8 +189,9 @@ void CallStackTrie::addCallStack(MDNode *MIB) { static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack, AllocationType AllocType, ArrayRef<ContextTotalSize> ContextSizeInfo, - const uint64_t MaxColdSize, uint64_t &TotalBytes, - uint64_t &ColdBytes) { + const uint64_t MaxColdSize, + bool BuiltFromExistingMetadata, + uint64_t &TotalBytes, uint64_t &ColdBytes) { SmallVector<Metadata *> MIBPayload( {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back( @@ -197,8 +200,9 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack, if (ContextSizeInfo.empty()) { // The profile matcher should have provided context size info if there was a // MinCallsiteColdBytePercent < 100. Here we check >=100 to gracefully - // handle a user-provided percent larger than 100. - assert(MinCallsiteColdBytePercent >= 100); + // handle a user-provided percent larger than 100. However, we may not have + // this information if we built the Trie from existing MD_memprof metadata. + assert(BuiltFromExistingMetadata || MinCallsiteColdBytePercent >= 100); return MDNode::get(Ctx, MIBPayload); } @@ -252,9 +256,19 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) { static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes, std::vector<Metadata *> &SavedMIBNodes, unsigned CallerContextLength, - uint64_t TotalBytes, uint64_t ColdBytes) { + uint64_t TotalBytes, uint64_t ColdBytes, + bool BuiltFromExistingMetadata) { const bool MostlyCold = - MinCallsiteColdBytePercent < 100 && + // If we have built the Trie from existing MD_memprof metadata, we may or + // may not have context size information (in which case ColdBytes and + // TotalBytes are 0, which is not also guarded against below). Even if we + // do have some context size information from the the metadata, we have + // already gone through a round of discarding of small non-cold contexts + // during matching, and it would be overly aggressive to do it again, and + // we also want to maintain the same behavior with and without reporting + // of hinted bytes enabled. + !BuiltFromExistingMetadata && MinCallsiteColdBytePercent < 100 && + ColdBytes > 0 && ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes; // In the simplest case, with pruning disabled, keep all the new MIB nodes. 
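Reviewer note on the saveFilteredNewMIBNodes() hunk above: the new gate can be read as a single predicate. The following free-standing C++ sketch is illustrative only (the isMostlyCold name and parameter form are not from the patch; in the patch MinCallsiteColdBytePercent is a command-line option), restating the condition so the two new terms stand out.

#include <cstdint>

// Pruning of small non-cold contexts is skipped entirely when the trie was
// rebuilt from existing MD_memprof metadata, and the ColdBytes > 0 term keeps
// a 0/0 byte pair (i.e. no context size info at all) from counting as
// "mostly cold".
static bool isMostlyCold(bool BuiltFromExistingMetadata,
                         uint64_t MinCallsiteColdBytePercent, // >= 100 disables
                         uint64_t TotalBytes, uint64_t ColdBytes) {
  return !BuiltFromExistingMetadata && MinCallsiteColdBytePercent < 100 &&
         ColdBytes > 0 &&
         ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes;
}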
@@ -386,9 +400,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, if (hasSingleAllocType(Node->AllocTypes)) { std::vector<ContextTotalSize> ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back( - createMIBNode(Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, - ContextSizeInfo, MaxColdSize, TotalBytes, ColdBytes)); + MIBNodes.push_back(createMIBNode( + Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo, + MaxColdSize, BuiltFromExistingMetadata, TotalBytes, ColdBytes)); return true; } @@ -416,7 +430,8 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, // Pass in the stack length of the MIB nodes added for the immediate caller, // which is the current stack length plus 1. saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1, - CallerTotalBytes, CallerColdBytes); + CallerTotalBytes, CallerColdBytes, + BuiltFromExistingMetadata); TotalBytes += CallerTotalBytes; ColdBytes += CallerColdBytes; @@ -441,9 +456,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, return false; std::vector<ContextTotalSize> ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold, - ContextSizeInfo, MaxColdSize, TotalBytes, - ColdBytes)); + MIBNodes.push_back(createMIBNode( + Ctx, MIBCallStack, AllocationType::NotCold, ContextSizeInfo, MaxColdSize, + BuiltFromExistingMetadata, TotalBytes, ColdBytes)); return true; } diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index 381249e..0b2a73b 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -5,35 +5,31 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// This file defines the RABasic function pass, which provides a minimal -// implementation of the basic register allocator. -// +/// +/// \file +/// This file defines the RABasic function pass, which provides a minimal +/// implementation of the basic register allocator. 
+/// //===----------------------------------------------------------------------===// +#include "RegAllocBasic.h" #include "AllocationOrder.h" -#include "RegAllocBase.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" -#include "llvm/CodeGen/Spiller.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <queue> using namespace llvm; @@ -42,89 +38,8 @@ using namespace llvm; static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator", createBasicRegisterAllocator); -namespace { - struct CompSpillWeight { - bool operator()(const LiveInterval *A, const LiveInterval *B) const { - return A->weight() < B->weight(); - } - }; -} - -namespace { -/// RABasic provides a minimal implementation of the basic register allocation -/// algorithm. It prioritizes live virtual registers by spill weight and spills -/// whenever a register is unavailable. This is not practical in production but -/// provides a useful baseline both for measuring other allocators and comparing -/// the speed of the basic algorithm against other styles of allocators. -class RABasic : public MachineFunctionPass, - public RegAllocBase, - private LiveRangeEdit::Delegate { - // context - MachineFunction *MF = nullptr; - - // state - std::unique_ptr<Spiller> SpillerInstance; - std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>, - CompSpillWeight> - Queue; - - // Scratch space. Allocated here to avoid repeated malloc calls in - // selectOrSplit(). - BitVector UsableRegs; - - bool LRE_CanEraseVirtReg(Register) override; - void LRE_WillShrinkVirtReg(Register) override; - -public: - RABasic(const RegAllocFilterFunc F = nullptr); - - /// Return the pass name. - StringRef getPassName() const override { return "Basic Register Allocator"; } - - /// RABasic analysis usage. - void getAnalysisUsage(AnalysisUsage &AU) const override; - - void releaseMemory() override; - - Spiller &spiller() override { return *SpillerInstance; } - - void enqueueImpl(const LiveInterval *LI) override { Queue.push(LI); } - - const LiveInterval *dequeue() override { - if (Queue.empty()) - return nullptr; - const LiveInterval *LI = Queue.top(); - Queue.pop(); - return LI; - } - - MCRegister selectOrSplit(const LiveInterval &VirtReg, - SmallVectorImpl<Register> &SplitVRegs) override; - - /// Perform register allocation. - bool runOnMachineFunction(MachineFunction &mf) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().setNoPHIs(); - } - - MachineFunctionProperties getClearedProperties() const override { - return MachineFunctionProperties().setIsSSA(); - } - - // Helper for spilling all live virtual registers currently unified under preg - // that interfere with the most recently queried lvr. Return true if spilling - // was successful, and append any new spilled/split intervals to splitLVRs. 
- bool spillInterferences(const LiveInterval &VirtReg, MCRegister PhysReg, - SmallVectorImpl<Register> &SplitVRegs); - - static char ID; -}; - char RABasic::ID = 0; -} // end anonymous namespace - char &llvm::RABasicID = RABasic::ID; INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator", diff --git a/llvm/lib/CodeGen/RegAllocBasic.h b/llvm/lib/CodeGen/RegAllocBasic.h new file mode 100644 index 0000000..004bc1a --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocBasic.h @@ -0,0 +1,104 @@ +//===-- RegAllocBasic.h - Basic Register Allocator Header -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares the RABasic class, which provides a minimal +/// implementation of the basic register allocator. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGALLOCBASIC_H +#define LLVM_CODEGEN_REGALLOCBASIC_H + +#include "RegAllocBase.h" +#include "llvm/CodeGen/LiveRangeEdit.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Spiller.h" +#include <queue> + +namespace llvm { + +struct CompSpillWeight { + bool operator()(const LiveInterval *A, const LiveInterval *B) const { + return A->weight() < B->weight(); + } +}; + +/// RABasic provides a minimal implementation of the basic register allocation +/// algorithm. It prioritizes live virtual registers by spill weight and spills +/// whenever a register is unavailable. This is not practical in production but +/// provides a useful baseline both for measuring other allocators and comparing +/// the speed of the basic algorithm against other styles of allocators. +class LLVM_LIBRARY_VISIBILITY RABasic : public MachineFunctionPass, + public RegAllocBase, + private LiveRangeEdit::Delegate { + // context + MachineFunction *MF = nullptr; + + // state + std::unique_ptr<Spiller> SpillerInstance; + std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>, + CompSpillWeight> + Queue; + + // Scratch space. Allocated here to avoid repeated malloc calls in + // selectOrSplit(). + BitVector UsableRegs; + + bool LRE_CanEraseVirtReg(Register) override; + void LRE_WillShrinkVirtReg(Register) override; + +public: + RABasic(const RegAllocFilterFunc F = nullptr); + + /// Return the pass name. + StringRef getPassName() const override { return "Basic Register Allocator"; } + + /// RABasic analysis usage. + void getAnalysisUsage(AnalysisUsage &AU) const override; + + void releaseMemory() override; + + Spiller &spiller() override { return *SpillerInstance; } + + void enqueueImpl(const LiveInterval *LI) override { Queue.push(LI); } + + const LiveInterval *dequeue() override { + if (Queue.empty()) + return nullptr; + const LiveInterval *LI = Queue.top(); + Queue.pop(); + return LI; + } + + MCRegister selectOrSplit(const LiveInterval &VirtReg, + SmallVectorImpl<Register> &SplitVRegs) override; + + /// Perform register allocation. 
+ bool runOnMachineFunction(MachineFunction &mf) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } + + MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } + + // Helper for spilling all live virtual registers currently unified under preg + // that interfere with the most recently queried lvr. Return true if spilling + // was successful, and append any new spilled/split intervals to splitLVRs. + bool spillInterferences(const LiveInterval &VirtReg, MCRegister PhysReg, + SmallVectorImpl<Register> &SplitVRegs); + + static char ID; +}; +} // namespace llvm +#endif diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 3e96bdf..9420924 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -201,7 +201,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const { case MCFragment::FT_DwarfFrame: case MCFragment::FT_CVInlineLines: case MCFragment::FT_CVDefRange: - case MCFragment::FT_PseudoProbe: return F.getSize(); case MCFragment::FT_Fill: { auto &FF = cast<MCFillFragment>(F); @@ -424,8 +423,7 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, case MCFragment::FT_Dwarf: case MCFragment::FT_DwarfFrame: case MCFragment::FT_CVInlineLines: - case MCFragment::FT_CVDefRange: - case MCFragment::FT_PseudoProbe: { + case MCFragment::FT_CVDefRange: { if (F.getKind() == MCFragment::FT_Data) ++stats::EmittedDataFragments; else if (F.getKind() == MCFragment::FT_Relaxable) @@ -974,22 +972,6 @@ bool MCAssembler::relaxFill(MCFillFragment &F) { return true; } -bool MCAssembler::relaxPseudoProbeAddr(MCPseudoProbeAddrFragment &PF) { - uint64_t OldSize = PF.getContents().size(); - int64_t AddrDelta; - bool Abs = PF.getAddrDelta().evaluateKnownAbsolute(AddrDelta, *this); - assert(Abs && "We created a pseudo probe with an invalid expression"); - (void)Abs; - SmallVector<char, 8> Data; - raw_svector_ostream OSE(Data); - - // AddrDelta is a signed integer - encodeSLEB128(AddrDelta, OSE, OldSize); - PF.setContents(Data); - PF.clearFixups(); - return OldSize != Data.size(); -} - bool MCAssembler::relaxFragment(MCFragment &F) { switch(F.getKind()) { default: @@ -1011,8 +993,6 @@ bool MCAssembler::relaxFragment(MCFragment &F) { return relaxCVDefRange(cast<MCCVDefRangeFragment>(F)); case MCFragment::FT_Fill: return relaxFill(cast<MCFillFragment>(F)); - case MCFragment::FT_PseudoProbe: - return relaxPseudoProbeAddr(cast<MCPseudoProbeAddrFragment>(F)); } } diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index bfe045a..fe7afd4 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -58,7 +58,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { case MCFragment::FT_SymbolId: OS << "SymbolId"; break; case MCFragment::FT_CVInlineLines: OS << "CVInlineLineTable"; break; case MCFragment::FT_CVDefRange: OS << "CVDefRangeTable"; break; - case MCFragment::FT_PseudoProbe: OS << "PseudoProbe"; break; // clang-format on } @@ -182,12 +181,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { } break; } - case MCFragment::FT_PseudoProbe: { - const auto *OF = cast<MCPseudoProbeAddrFragment>(this); - OS << " AddrDelta:"; - OF->getAddrDelta().print(OS, nullptr); - break; - } } } #endif diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 43598ef..08d2b93 100644 --- 
a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -161,7 +161,7 @@ void MCMachOStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { // We have to create a new fragment if this is an atom defining symbol, // fragments cannot span atoms. if (cast<MCSymbolMachO>(Symbol)->isSymbolLinkerVisible()) - insert(getContext().allocFragment<MCFragment>()); + newFragment(); MCObjectStreamer::emitLabel(Symbol, Loc); diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index d5b8f22..f61dda6 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -215,9 +215,8 @@ void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) { return; } auto *F = getOrCreateDataFragment(); - F->Kind = MCFragment::FT_LEB; - F->setLEBSigned(false); - F->setLEBValue(Value); + F->makeLEB(false, Value); + newFragment(); } void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) { @@ -227,9 +226,8 @@ void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) { return; } auto *F = getOrCreateDataFragment(); - F->Kind = MCFragment::FT_LEB; - F->setLEBSigned(true); - F->setLEBValue(Value); + F->makeLEB(true, Value); + newFragment(); } void MCObjectStreamer::emitWeakReference(MCSymbol *Alias, diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index f87d27f..b493337 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -81,8 +81,9 @@ void MCPseudoProbe::emit(MCObjectStreamer *MCOS, if (AddrDelta->evaluateAsAbsolute(Delta, MCOS->getAssemblerPtr())) { MCOS->emitSLEB128IntValue(Delta); } else { - MCOS->insert(MCOS->getContext().allocFragment<MCPseudoProbeAddrFragment>( - AddrDelta)); + auto *F = MCOS->getCurrentFragment(); + F->makeLEB(true, AddrDelta); + MCOS->newFragment(); } } else { // Emit the GUID of the split function that the sentinel probe represents. diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 7af240a..da6dbf3 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -1858,23 +1858,9 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, auto IT = WS.begin(); if (IT == WS.end()) continue; - const MCFragment &EmptyFrag = *IT; - if (EmptyFrag.getKind() != MCFragment::FT_Data) - report_fatal_error(".init_array section should be aligned"); - - const MCFragment *nextFrag = EmptyFrag.getNext(); - while (nextFrag != nullptr) { - const MCFragment &AlignFrag = *nextFrag; - if (AlignFrag.getKind() != MCFragment::FT_Align) - report_fatal_error(".init_array section should be aligned"); - if (cast<MCAlignFragment>(AlignFrag).getAlignment() != - Align(is64Bit() ? 
8 : 4)) - report_fatal_error( - ".init_array section should be aligned for pointers"); - - const MCFragment &Frag = *AlignFrag.getNext(); - nextFrag = Frag.getNext(); - if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data) + for (auto *Frag = &*IT; Frag; Frag = Frag->getNext()) { + if (Frag->hasInstructions() || (Frag->getKind() != MCFragment::FT_Align && + Frag->getKind() != MCFragment::FT_Data)) report_fatal_error("only data supported in .init_array section"); uint16_t Priority = UINT16_MAX; @@ -1886,9 +1872,8 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, if (WS.getName().substr(PrefixLength + 1).getAsInteger(10, Priority)) report_fatal_error("invalid .init_array section priority"); } - const auto &DataFrag = Frag; - assert(llvm::all_of(DataFrag.getContents(), [](char C) { return !C; })); - for (const MCFixup &Fixup : DataFrag.getFixups()) { + assert(llvm::all_of(Frag->getContents(), [](char C) { return !C; })); + for (const MCFixup &Fixup : Frag->getFixups()) { assert(Fixup.getKind() == MCFixup::getDataKindForSize(is64Bit() ? 8 : 4)); const MCExpr *Expr = Fixup.getValue(); diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index a579eaf..10b6101 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -380,7 +380,7 @@ if(LLVM_WITH_Z3) ) endif() -target_include_directories(LLVMSupport SYSTEM +target_include_directories(LLVMSupport PRIVATE ${LLVM_THIRD_PARTY_DIR}/siphash/include - ) +) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 6c46b18..9f8a257 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1053,13 +1053,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; def AArch64uaddlv : SDNode<"AArch64ISD::UADDLV", SDT_AArch64uaddlp>; def AArch64saddlv : SDNode<"AArch64ISD::SADDLV", SDT_AArch64uaddlp>; -def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs), - [(abdu node:$lhs, node:$rhs), - (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>; -def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), - [(abds node:$lhs, node:$rhs), - (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; - // Add Pairwise of two vectors def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>; // Add Long Pairwise @@ -5667,8 +5660,7 @@ let Predicates = [HasFullFP16] in { // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - AArch64uabd>; +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>; // Match UABDL in log2-shuffle patterns. 
def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))))), @@ -6018,8 +6010,8 @@ defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", - TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >; -defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>; + TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", abds>; defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; @@ -6037,8 +6029,8 @@ defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", - TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >; -defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>; + TriOpFrag<(add node:$LHS, (abdu node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", abdu>; defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; @@ -6759,10 +6751,8 @@ defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn> defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; -defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", - AArch64sabd>; -defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", - AArch64sabd>; +defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>; +defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>; defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", @@ -6780,8 +6770,7 @@ defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; -defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", - AArch64uabd>; +defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 473ba5e..bb0f667b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -287,6 +287,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .moreElementsToNextPow2(0) .lower(); + getActionDefinitionsBuilder({G_ABDS, G_ABDU}) + .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + .lower(); + getActionDefinitionsBuilder( {G_SADDE, 
G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO}) .legalFor({{s32, s32}, {s64, s32}}) @@ -1794,6 +1798,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(AArch64::G_SMULL); case Intrinsic::aarch64_neon_umull: return LowerBinOp(AArch64::G_UMULL); + case Intrinsic::aarch64_neon_sabd: + return LowerBinOp(TargetOpcode::G_ABDS); + case Intrinsic::aarch64_neon_uabd: + return LowerBinOp(TargetOpcode::G_ABDU); case Intrinsic::aarch64_neon_abs: { // Lower the intrinsic to G_ABS. MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)}); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index a5bb83d..b5df4c6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -131,6 +131,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; +def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">; def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 70150bd..f329f48 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -600,12 +600,23 @@ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1>; // TMA Async Bulk Tensor Copy Functions //------------------------------------- -class TMA_DIMS_UTIL<int dim> { +class TMA_DIMS_UTIL<int dim, string mode = ""> { // For example, when 'dim' is 3, this generates: // an ins_dag: B32:$d0, B32:$d1, B32:$d2 // with base_str: $d0, $d1, $d2 dag ins_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i)); string base_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); + + // Tile::Gather4/scatter4 actually operate on a 2D tensor, + // though they take 5 co-ordinates. + // + // The scatter-gather happens over 4 rows with a fixed + // column-index. The first co-ordinate represents the + // col-index followed by four row-indices. + int num_dims = !cond( + !eq(mode, "tile_scatter4") : 2, + !eq(mode, "tile_gather4") : 2, + true : dim); // for all other modes } class TMA_IM2COL_UTIL<int dim, string mode> { @@ -692,14 +703,138 @@ foreach dim = [1, 2, 3, 4, 5] in { } } +multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> { + defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; + defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; + defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; + + defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag; + defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str; + defvar asm_str = !if(!empty(im2col_str), + asm_str_base, + asm_str_base # ", {{" # im2col_str # "}}"); + + defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims; + defvar inst_name = "cp.async.bulk.tensor" + # "." # dim_val # "d" + # "." # "shared::cluster.global" + # "." # !subst("_", "::", mode) + # "." 
# "mbarrier::complete_tx::bytes"; + defvar intr = !cast<Intrinsic>( + "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim_val # "d"); + + defvar ins_dag = !con( + (ins ADDR:$dst, ADDR:$mbar, B64:$tmap), + dims_dag, im2col_dag, + (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)); + + defvar intr_dag_base = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B16:$mc, B64:$ch)); + defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg)); + defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg)); + defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg)); + defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg)); + + def "" : NVPTXInst<(outs), ins_dag, + inst_name # asm_str # ";", + [intr_dag_no_hints]>, + Requires<pred>; + def _MC : NVPTXInst<(outs), ins_dag, + inst_name # ".multicast::cluster" # asm_str # ", $mc;", + [intr_dag_with_mc]>, + Requires<pred>; + def _CH : NVPTXInst<(outs), ins_dag, + inst_name # ".L2::cache_hint" # asm_str # ", $ch;", + [intr_dag_with_ch]>, + Requires<pred>; + def _MC_CH : NVPTXInst<(outs), ins_dag, + inst_name # ".multicast::cluster.L2::cache_hint" # asm_str # ", $mc, $ch;", + [intr_dag_with_mc_ch]>, + Requires<pred>; +} +foreach dim = 3...5 in { + foreach mode = ["im2col_w", "im2col_w_128"] in { + defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>; + } +} +defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4", + [hasTMACTAGroupSupport]>; + +multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> { + defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; + defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; + defvar asm_str_base = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; + + defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag; + defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str; + defvar asm_str = !if(!empty(im2col_str), + asm_str_base, + asm_str_base # ", {{" # im2col_str # "}}"); + + defvar ins_dag = !con( + (ins ADDR:$dst, ADDR:$mbar, B64:$tmap), + dims_dag, im2col_dag, + (ins B64:$ch)); + + defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims; + defvar intr = !cast<Intrinsic>( + "int_nvvm_cp_async_bulk_tensor_g2s_cta_" # mode # "_" # dim_val # "d"); + defvar intr_dag = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B64:$ch, 0)); + defvar intr_dag_with_ch = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B64:$ch, -1)); + defvar inst_name = "cp.async.bulk.tensor" + # "." # dim_val # "d" + # "." # "shared::cta.global" + # "." # !subst("_", "::", mode) + # "." 
# "mbarrier::complete_tx::bytes"; + + def "" : NVPTXInst<(outs), ins_dag, + inst_name # asm_str # ";", + [intr_dag]>, + Requires<pred>; + def _CH : NVPTXInst<(outs), ins_dag, + inst_name # ".L2::cache_hint" # asm_str # ", $ch;", + [intr_dag_with_ch]>, + Requires<pred>; +} +foreach dim = 1...5 in { + defm TMA_G2S_CTA_TILE_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR<dim, "tile", [hasPTX<86>, hasSM<90>]>; +} +foreach dim = 3...5 in { + defm TMA_G2S_CTA_IM2COL_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col", [hasPTX<86>, hasSM<90>]>; + + defm TMA_G2S_CTA_IM2COL_W_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>; + + defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>; +} +defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4", + [hasPTX<86>, hasSM<100>]>; + multiclass TMA_TENSOR_S2G_INTR<int dim, string mode, list<Predicate> pred = [hasPTX<80>, hasSM<90>]> { defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]"; + defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims; defvar intr = !cast<Intrinsic>( - "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # d); + "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim_val # "d"); + defvar intr_dag = !con((intr addr:$src, B64:$tmap), !setdagop(dims_dag, intr), (intr B64:$ch, 0)); @@ -707,11 +842,13 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode, !setdagop(dims_dag, intr), (intr B64:$ch, -1)); - // For im2col mode, the actual asm_str is "im2col_no_offs" - defvar mode_asm_str = !if(!eq(mode, "im2col"), - "im2col_no_offs", mode); + // Fix-up the asm_str when it is im2col/scatter4. + defvar mode_asm_str = !cond( + !eq(mode, "im2col") : "im2col_no_offs", + !eq(mode, "tile_scatter4") : "tile::scatter4", + true : mode); defvar prefix = "cp.async.bulk.tensor" - # "." # dim # "d" + # "." # dim_val # "d" # ".global.shared::cta" # "." # mode_asm_str # ".bulk_group"; @@ -729,10 +866,12 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode, } foreach dim = 1...5 in { foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defvar suffix = !toupper(mode) # "_" # dim # D; + defvar suffix = !toupper(mode) # "_" # dim # "D"; defm TMA_TENSOR_S2G_ # suffix : TMA_TENSOR_S2G_INTR<dim, mode>; } } +defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4", + [hasTMACTAGroupSupport]>; def TMAReductionFlags : Operand<i32> { let PrintMethod = "printTmaReductionMode"; @@ -786,13 +925,14 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode, asm_str_base, asm_str_base # ", {{" # im2col_str # "}}"); + defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims; defvar inst_name = "cp.async.bulk.prefetch.tensor" - # "." # dim # "d" + # "." # dim_val # "d" # "." # "L2.global" - # "." # mode; + # "." 
# !subst("_", "::", mode); defvar intr = !cast<Intrinsic>( - "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # d); + "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim_val # "d"); defvar ins_dag = !con((ins B64:$tmap), dims_dag, @@ -818,10 +958,19 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode, } foreach dim = 1...5 in { foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defvar suffix = !toupper(mode) # "_" # dim # D; + defvar suffix = !toupper(mode) # "_" # dim # "D"; defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode>; } } +foreach dim = 3...5 in { + foreach mode = ["im2col_w", "im2col_w_128"] in { + defvar suffix = !toupper(mode) # "_" # dim # "D"; + defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode, + [hasTMACTAGroupSupport]>; + } +} +defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", + [hasTMACTAGroupSupport]>; //Prefetch and Prefetchu diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3d060c6..387d289 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -567,7 +567,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, // DataFragment, so that we can get the size of instructions later in // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty // DataFragment. - OS.insert(OS.getContext().allocFragment<MCFragment>()); + OS.newFragment(); // Update the maximum alignment on the current section if necessary. MCSection *Sec = OS.getCurrentSectionOnly(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 9df0855..c90ff2a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -21,8 +22,10 @@ #include "llvm/Analysis/Utils/Local.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" @@ -8222,6 +8225,98 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI, return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I); } +// Transform 'fptrunc(x) cmp C' to 'x cmp ext(C)' if possible. +// Patterns include: +// fptrunc(x) < C --> x < ext(C) +// fptrunc(x) <= C --> x <= ext(C) +// fptrunc(x) > C --> x > ext(C) +// fptrunc(x) >= C --> x >= ext(C) +// where 'ext(C)' is the extension of 'C' to the type of 'x' with a small bias +// due to precision loss. 
+static Instruction *foldFCmpFpTrunc(FCmpInst &I, const Instruction &FPTrunc,
+                                    const Constant &C) {
+  FCmpInst::Predicate Pred = I.getPredicate();
+  bool RoundDown = false;
+
+  if (Pred == FCmpInst::FCMP_OGE || Pred == FCmpInst::FCMP_UGE ||
+      Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_ULT)
+    RoundDown = true;
+  else if (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT ||
+           Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)
+    RoundDown = false;
+  else
+    return nullptr;
+
+  const APFloat *CValue;
+  if (!match(&C, m_APFloat(CValue)))
+    return nullptr;
+
+  if (CValue->isNaN() || CValue->isInfinity())
+    return nullptr;
+
+  auto ConvertFltSema = [](const APFloat &Src, const fltSemantics &Sema) {
+    bool LosesInfo;
+    APFloat Dest = Src;
+    Dest.convert(Sema, APFloat::rmNearestTiesToEven, &LosesInfo);
+    return Dest;
+  };
+
+  auto NextValue = [](const APFloat &Value, bool RoundDown) {
+    APFloat NextValue = Value;
+    NextValue.next(RoundDown);
+    return NextValue;
+  };
+
+  APFloat NextCValue = NextValue(*CValue, RoundDown);
+
+  Type *DestType = FPTrunc.getOperand(0)->getType();
+  const fltSemantics &DestFltSema =
+      DestType->getScalarType()->getFltSemantics();
+
+  APFloat ExtCValue = ConvertFltSema(*CValue, DestFltSema);
+  APFloat ExtNextCValue = ConvertFltSema(NextCValue, DestFltSema);
+
+  // When 'NextCValue' is infinity, use an imagined 'NextCValue' that equals
+  // 'CValue + bias' to avoid the infinity after conversion. The bias is
+  // estimated as 'CValue - PrevCValue', where 'PrevCValue' is the previous
+  // value of 'CValue'.
+  if (NextCValue.isInfinity()) {
+    APFloat PrevCValue = NextValue(*CValue, !RoundDown);
+    APFloat Bias = ConvertFltSema(*CValue - PrevCValue, DestFltSema);
+
+    ExtNextCValue = ExtCValue + Bias;
+  }
+
+  APFloat ExtMidValue =
+      scalbn(ExtCValue + ExtNextCValue, -1, APFloat::rmNearestTiesToEven);
+
+  const fltSemantics &SrcFltSema =
+      C.getType()->getScalarType()->getFltSemantics();
+
+  // 'MidValue' might be rounded to 'NextCValue'. Correct it here.
+  APFloat MidValue = ConvertFltSema(ExtMidValue, SrcFltSema);
+  if (MidValue != *CValue)
+    ExtMidValue.next(!RoundDown);
+
+  // Check whether 'ExtMidValue' is a valid result, since the assumption on
+  // the imagined 'NextCValue' might not hold for new float types.
+  // ppc_fp128 can't pass here when converting from max float because of
+  // the APFloat implementation.
+  if (NextCValue.isInfinity()) {
+    // ExtMidValue --- narrowed ---> Finite
+    if (ConvertFltSema(ExtMidValue, SrcFltSema).isInfinity())
+      return nullptr;
+
+    // NextExtMidValue --- narrowed ---> Infinity
+    APFloat NextExtMidValue = NextValue(ExtMidValue, RoundDown);
+    if (ConvertFltSema(NextExtMidValue, SrcFltSema).isFinite())
+      return nullptr;
+  }
+
+  return new FCmpInst(Pred, FPTrunc.getOperand(0),
+                      ConstantFP::get(DestType, ExtMidValue), "", &I);
+}
+
 /// Optimize fabs(X) compared with zero.
 static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) {
   Value *X;
@@ -8712,6 +8807,10 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
                   cast<LoadInst>(LHSI), GEP, GV, I))
             return Res;
       break;
+    case Instruction::FPTrunc:
+      if (Instruction *NV = foldFCmpFpTrunc(I, *LHSI, *RHSC))
+        return NV;
+      break;
     }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6a3b3e6..2a92083 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3275,10 +3275,13 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     }
     auto *WideLoad = cast<VPWidenLoadRecipe>(R);
+    VPValue *PtrOp = WideLoad->getAddr();
+    if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
+      PtrOp = VecPtr->getOperand(0);
     // Narrow wide load to uniform scalar load, as transformed VPlan will only
     // process one original iteration.
-    auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
-                                    WideLoad->operands(), /*IsUniform*/ true,
+    auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
+                                    /*IsUniform*/ true,
                                     /*Mask*/ nullptr, *WideLoad);
     N->insertBefore(WideLoad);
     return N;
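As a sanity check on what the foldFCmpFpTrunc() fold above relies on, here is a small self-contained C++ sketch using plain host floating point rather than APFloat (the constant 1.0f and the probe values are arbitrary choices for illustration): for float(x) < 1.0f over a double x, the equivalent double-typed bound is the midpoint between 1.0f and the float just below it, because every double below that midpoint truncates to a float smaller than 1.0f while the midpoint itself rounds up to 1.0f under ties-to-even.

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const float C = 1.0f;
  // Largest float strictly below C, held exactly in a double.
  const double PrevC = std::nextafterf(C, 0.0f);
  // Midpoint of [PrevC, C]; exactly representable as a double.
  const double Mid = (static_cast<double>(C) + PrevC) / 2.0;

  for (double X : {Mid, std::nextafter(Mid, 0.0), 0.75, 2.5}) {
    bool TruncThenCmp = static_cast<float>(X) < C; // original: fptrunc + fcmp
    bool WideCmp = X < Mid;                        // folded: fcmp in double
    assert(TruncThenCmp == WideCmp);
    std::printf("x=%.17g  fptrunc-cmp=%d  wide-cmp=%d\n", X, TruncThenCmp,
                WideCmp);
  }
  return 0;
}

Any C++11 compiler can build this; the assertions confirm the equivalence at the boundary value, just below it, and at two ordinary values.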