Diffstat (limited to 'llvm/lib')
44 files changed, 619 insertions, 167 deletions
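
Many of the C++ changes below are the same mechanical cleanup: a destructor (or other member function) that overrides a virtual function of its base class is marked 'override' instead of repeating 'virtual'. A minimal sketch of the pattern, with illustrative class names that are not taken from the patch:

struct Base {
  virtual ~Base() = default;
};

struct Derived : Base {
  // Before: virtual ~Derived() = default;
  ~Derived() override = default;
};

Spelling 'override' lets the compiler reject the declaration if Base ever stops providing a matching virtual function, which repeating 'virtual' alone would not catch.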
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index c4fee39..5169b43 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -1242,7 +1242,7 @@ public: return std::nullopt; } - virtual ~InlineCostCallAnalyzer() = default; + ~InlineCostCallAnalyzer() override = default; int getThreshold() const { return Threshold; } int getCost() const { return Cost; } int getStaticBonusApplied() const { return StaticBonusApplied; } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 8da51d0..2a0a6a2 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4866,6 +4866,89 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, return nullptr; } +/// Look for the following pattern and simplify %to_fold to %identicalPhi. +/// Here %phi, %to_fold and %phi.next perform the same functionality as +/// %identicalPhi and hence the select instruction %to_fold can be folded +/// into %identicalPhi. +/// +/// BB1: +/// %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ] +/// %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ] +/// ... +/// %identicalPhi.next = select %cmp, %val, %identicalPhi +/// (or select %cmp, %identicalPhi, %val) +/// %to_fold = select %cmp2, %identicalPhi, %phi +/// %phi.next = select %cmp, %val, %to_fold +/// (or select %cmp, %to_fold, %val) +/// +/// Prove that %phi and %identicalPhi are the same by induction: +/// +/// Base case: Both %phi and %identicalPhi are equal on entry to the loop. +/// Inductive case: +/// Suppose %phi and %identicalPhi are equal at iteration i. +/// We look at their values at iteration i+1 which are %phi.next and +/// %identicalPhi.next. They would have become different only when %cmp is +/// false and the corresponding values %to_fold and %identicalPhi differ +/// (similar reason for the other "or" case in parentheses). +/// +/// The only condition when %to_fold and %identicalPhi could differ is when %cmp2 +/// is false and %to_fold is %phi, which contradicts our inductive hypothesis +/// that %phi and %identicalPhi are equal. Thus %phi and %identicalPhi are +/// always equal at iteration i+1. +bool isSelectWithIdenticalPHI(PHINode &PN, PHINode &IdenticalPN) { + if (PN.getParent() != IdenticalPN.getParent()) + return false; + if (PN.getNumIncomingValues() != 2) + return false; + + // Check that only the backedge incoming value is different. + unsigned DiffVals = 0; + BasicBlock *DiffValBB = nullptr; + for (unsigned i = 0; i < 2; i++) { + BasicBlock *PredBB = PN.getIncomingBlock(i); + if (PN.getIncomingValue(i) != + IdenticalPN.getIncomingValueForBlock(PredBB)) { + DiffVals++; + DiffValBB = PredBB; + } + } + if (DiffVals != 1) + return false; + // Now check that the backedge incoming values are two select + // instructions with the same condition. Either their true + // values are the same, or their false values are the same.
+ auto *SI = dyn_cast<SelectInst>(PN.getIncomingValueForBlock(DiffValBB)); + auto *IdenticalSI = + dyn_cast<SelectInst>(IdenticalPN.getIncomingValueForBlock(DiffValBB)); + if (!SI || !IdenticalSI) + return false; + if (SI->getCondition() != IdenticalSI->getCondition()) + return false; + + SelectInst *SIOtherVal = nullptr; + Value *IdenticalSIOtherVal = nullptr; + if (SI->getTrueValue() == IdenticalSI->getTrueValue()) { + SIOtherVal = dyn_cast<SelectInst>(SI->getFalseValue()); + IdenticalSIOtherVal = IdenticalSI->getFalseValue(); + } else if (SI->getFalseValue() == IdenticalSI->getFalseValue()) { + SIOtherVal = dyn_cast<SelectInst>(SI->getTrueValue()); + IdenticalSIOtherVal = IdenticalSI->getTrueValue(); + } else { + return false; + } + + // Now check that the other values in select, i.e., %to_fold and + // %identicalPhi, are essentially the same value. + if (!SIOtherVal || IdenticalSIOtherVal != &IdenticalPN) + return false; + if (!(SIOtherVal->getTrueValue() == &IdenticalPN && + SIOtherVal->getFalseValue() == &PN) && + !(SIOtherVal->getTrueValue() == &PN && + SIOtherVal->getFalseValue() == &IdenticalPN)) + return false; + return true; +} + /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, @@ -5041,7 +5124,14 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, std::optional<bool> Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL); if (Imp) return *Imp ? TrueVal : FalseVal; - + // Look for same PHIs in the true and false values. + if (auto *TruePHI = dyn_cast<PHINode>(TrueVal)) + if (auto *FalsePHI = dyn_cast<PHINode>(FalseVal)) { + if (isSelectWithIdenticalPHI(*TruePHI, *FalsePHI)) + return FalseVal; + if (isSelectWithIdenticalPHI(*FalsePHI, *TruePHI)) + return TrueVal; + } return nullptr; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7adb25d..e27a9b1 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2982,6 +2982,10 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { if (!StrideExpr) return; + if (auto *Unknown = dyn_cast<SCEVUnknown>(StrideExpr)) + if (isa<UndefValue>(Unknown->getValue())) + return; + LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for " "versioning:"); LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n"); diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp index 12a784e..11ca48d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp @@ -18,8 +18,7 @@ using namespace llvm; unsigned AddressPool::getIndex(const MCSymbol *Sym, bool TLS) { resetUsedFlag(true); - auto IterBool = - Pool.insert(std::make_pair(Sym, AddressPoolEntry(Pool.size(), TLS))); + auto IterBool = Pool.try_emplace(Sym, Pool.size(), TLS); return IterBool.first->second.Number; } diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index 93ae548..7bef3a8 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -86,10 +86,7 @@ template <> struct llvm::DenseMapInfo<VariableID> { using VarLocInsertPt = PointerUnion<const Instruction *, const DbgRecord *>; template <> struct std::hash<VarLocInsertPt> { - using argument_type = VarLocInsertPt; - using result_type = 
std::size_t; - - result_type operator()(const argument_type &Arg) const { + std::size_t operator()(const VarLocInsertPt &Arg) const { return std::hash<void *>()(Arg.getOpaqueValue()); } }; diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 8e48d19..109444b 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -363,7 +363,7 @@ void TailDuplicator::processPHI( Register SrcReg = MI->getOperand(SrcOpIdx).getReg(); unsigned SrcSubReg = MI->getOperand(SrcOpIdx).getSubReg(); const TargetRegisterClass *RC = MRI->getRegClass(DefReg); - LocalVRMap.insert(std::make_pair(DefReg, RegSubRegPair(SrcReg, SrcSubReg))); + LocalVRMap.try_emplace(DefReg, SrcReg, SrcSubReg); // Insert a copy from source to the end of the block. The def register is the // available value liveout of the block. @@ -411,7 +411,7 @@ void TailDuplicator::duplicateInstruction( const TargetRegisterClass *RC = MRI->getRegClass(Reg); Register NewReg = MRI->createVirtualRegister(RC); MO.setReg(NewReg); - LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0))); + LocalVRMap.try_emplace(Reg, NewReg, 0); if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg)) addSSAUpdateEntry(Reg, NewReg, PredBB); continue; @@ -463,7 +463,7 @@ void TailDuplicator::duplicateInstruction( NewReg) .addReg(VI->second.Reg, 0, VI->second.SubReg); LocalVRMap.erase(VI); - LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0))); + LocalVRMap.try_emplace(Reg, NewReg, 0); MO.setReg(NewReg); // The composed VI.Reg:VI.SubReg is replaced with NewReg, which // is equivalent to the whole register Reg. Hence, Reg:subreg diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 1096e57..3c222f5 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -843,7 +843,7 @@ public: SlotTracker(const SlotTracker &) = delete; SlotTracker &operator=(const SlotTracker &) = delete; - ~SlotTracker() = default; + ~SlotTracker() override = default; void setProcessHook( std::function<void(AbstractSlotTrackerStorage *, const Module *, bool)>); @@ -5323,7 +5323,7 @@ struct MDTreeAsmWriterContext : public AsmWriterContext { --Level; } - ~MDTreeAsmWriterContext() { + ~MDTreeAsmWriterContext() override { for (const auto &Entry : Buffer) { MainOS << "\n"; unsigned NumIndent = Entry.first * 2U; diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 4bc2a18..b618222 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1711,7 +1711,7 @@ public: /*ShouldEmitImportsFiles=*/false), IRFiles(std::move(IRFiles)), CombinedCGDataHash(CombinedCGDataHash) {} - virtual Error runThinLTOBackendThread( + Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, @@ -2271,8 +2271,8 @@ public: RemoteCompilerPrependArgs(RemoteCompilerPrependArgs), RemoteCompilerArgs(RemoteCompilerArgs), SaveTemps(SaveTemps) {} - virtual void setup(unsigned ThinLTONumTasks, unsigned ThinLTOTaskOffset, - llvm::Triple Triple) override { + void setup(unsigned ThinLTONumTasks, unsigned ThinLTOTaskOffset, + llvm::Triple Triple) override { UID = itostr(sys::Process::getProcessId()); Jobs.resize((size_t)ThinLTONumTasks); this->ThinLTOTaskOffset = ThinLTOTaskOffset; diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index d68f4af..71bd397 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -440,7 
+440,7 @@ public: SetBuffer(Buffer, sizeof(Buffer)); } - ~TextStream() { flush(); } + ~TextStream() override { flush(); } }; } // namespace diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index be8c022..885fa55 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -295,7 +295,7 @@ public: unsigned Flags, unsigned Isa, unsigned Discriminator, StringRef FileName, StringRef Location = {}) override; - virtual void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) override; + void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) override; MCSymbol *getDwarfLineTableSymbol(unsigned CUID) override; diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index 968ccf7..a6188f0 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -835,7 +835,14 @@ AsmToken AsmLexer::LexToken() { } if (isAtStartOfComment(TokStart)) { - CurPtr += MAI.getCommentString().size() - 1; + StringRef CommentString = MAI.getCommentString(); + // For multi-char comment strings, advance CurPtr only if we matched the + // full string. This stops us from accidentally eating the newline if the + // current line ends in a single comment char. + if (CommentString.size() > 1 && + StringRef(TokStart, CommentString.size()) == CommentString) { + CurPtr += CommentString.size() - 1; + } return LexLineComment(); } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index acea3ab..dd1bc2b 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -726,7 +726,7 @@ public: Lexer.setLexHLASMStrings(true); } - ~HLASMAsmParser() { Lexer.setSkipSpace(true); } + ~HLASMAsmParser() override { Lexer.setSkipSpace(true); } bool parseStatement(ParseStatementInfo &Info, MCAsmParserSemaCallback *SI) override; diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index fce6b2a..d466009 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -184,7 +184,7 @@ struct CsectSectionEntry : public SectionEntry { Group->clear(); } - virtual ~CsectSectionEntry() = default; + ~CsectSectionEntry() override = default; }; struct DwarfSectionEntry : public SectionEntry { @@ -220,7 +220,7 @@ struct DwarfSectionEntry : public SectionEntry { DwarfSectionEntry(DwarfSectionEntry &&s) = default; - virtual ~DwarfSectionEntry() = default; + ~DwarfSectionEntry() override = default; }; struct ExceptionTableEntry { @@ -249,7 +249,7 @@ struct ExceptionSectionEntry : public SectionEntry { memcpy(Name, N.data(), N.size()); } - virtual ~ExceptionSectionEntry() = default; + ~ExceptionSectionEntry() override = default; }; struct CInfoSymInfo { @@ -276,7 +276,7 @@ struct CInfoSymSectionEntry : public SectionEntry { std::unique_ptr<CInfoSymInfo> Entry; CInfoSymSectionEntry(StringRef N, int32_t Flags) : SectionEntry(N, Flags) {} - virtual ~CInfoSymSectionEntry() = default; + ~CInfoSymSectionEntry() override = default; void addEntry(std::unique_ptr<CInfoSymInfo> NewEntry) { Entry = std::move(NewEntry); Entry->Offset = sizeof(uint32_t); diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index 7ec0e9b..4f6473f 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -109,7 +109,7 @@ protected: WritableMemoryBuffer &Out; public: - virtual ~SectionWriter() = default; + ~SectionWriter() override = default; Error visit(const Section &Sec) override; Error visit(const OwnedDataSection 
&Sec) override; @@ -134,7 +134,7 @@ private: using Elf_Sym = typename ELFT::Sym; public: - virtual ~ELFSectionWriter() {} + ~ELFSectionWriter() override {} Error visit(const SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; Error visit(const GnuDebugLinkSection &Sec) override; @@ -180,7 +180,7 @@ public: class BinarySectionWriter : public SectionWriter { public: - virtual ~BinarySectionWriter() {} + ~BinarySectionWriter() override {} Error visit(const SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; @@ -346,7 +346,7 @@ private: size_t totalSize() const; public: - virtual ~ELFWriter() {} + ~ELFWriter() override {} bool WriteSectionHeaders; // For --only-keep-debug, select an alternative section/segment layout @@ -367,7 +367,7 @@ private: uint64_t TotalSize = 0; public: - ~BinaryWriter() {} + ~BinaryWriter() override {} Error finalize() override; Error write() override; BinaryWriter(Object &Obj, raw_ostream &Out, const CommonConfig &Config) @@ -784,7 +784,7 @@ private: SymbolTableSection *Symbols = nullptr; public: - virtual ~SectionIndexSection() {} + ~SectionIndexSection() override {} void addIndex(uint32_t Index) { assert(Size > 0); Indexes.push_back(Index); diff --git a/llvm/lib/ObjectYAML/GOFFEmitter.cpp b/llvm/lib/ObjectYAML/GOFFEmitter.cpp index c26893c..82800b1 100644 --- a/llvm/lib/ObjectYAML/GOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/GOFFEmitter.cpp @@ -71,7 +71,7 @@ public: SetBufferSize(GOFF::PayloadLength); } - ~GOFFOstream() { finalize(); } + ~GOFFOstream() override { finalize(); } void makeNewRecord(GOFF::RecordType Type, size_t Size) { fillRecord(); diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 4787604..e21cf8e 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -5354,7 +5354,7 @@ APInt DoubleAPFloat::bitcastToAPInt() const { Floats[0].bitcastToAPInt().getRawData()[0], Floats[1].bitcastToAPInt().getRawData()[0], }; - return APInt(128, 2, Data); + return APInt(128, Data); } Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S, @@ -5643,8 +5643,7 @@ APFloat::opStatus DoubleAPFloat::convertFromUnsignedParts( // Create a minimally-sized APInt to represent the source value. const unsigned SrcBitWidth = SrcMSB + 1; - APSInt SrcInt{APInt{/*numBits=*/SrcBitWidth, - /*numWords=*/SrcCount, Src}, + APSInt SrcInt{APInt{/*numBits=*/SrcBitWidth, ArrayRef(Src, SrcCount)}, /*isUnsigned=*/true}; // Stage 1: Initial Approximation. 
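
The APFloat.cpp hunks above switch from the (numBits, numWords, pointer) form of the APInt constructor to the ArrayRef-based overload. A minimal sketch of the new form, assuming an illustrative helper name makeWide128 that is not part of the patch:

#include "llvm/ADT/APInt.h"
#include <cstdint>
using namespace llvm;

// Build a 128-bit APInt from two 64-bit words. Words[0] is the least
// significant word, matching the convention of the ArrayRef constructor.
APInt makeWide128(uint64_t Lo, uint64_t Hi) {
  uint64_t Words[2] = {Lo, Hi};
  return APInt(/*numBits=*/128, Words); // ArrayRef<uint64_t> deduced from the array
}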
diff --git a/llvm/lib/Support/LSP/Protocol.cpp b/llvm/lib/Support/LSP/Protocol.cpp index f221263..f8eeb32 100644 --- a/llvm/lib/Support/LSP/Protocol.cpp +++ b/llvm/lib/Support/LSP/Protocol.cpp @@ -96,10 +96,6 @@ static void percentEncode(StringRef Content, std::string &Out) { static std::string percentDecode(StringRef Content) { std::string Result; for (auto I = Content.begin(), E = Content.end(); I != E; ++I) { - if (*I != '%') { - Result += *I; - continue; - } if (*I == '%' && I + 2 < Content.end() && llvm::isHexDigit(*(I + 1)) && llvm::isHexDigit(*(I + 2))) { Result.push_back(llvm::hexFromNibbles(*(I + 1), *(I + 2))); diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 3a97185..246d90c 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -94,6 +94,19 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { StringRef Prefix = G.Pattern.prefix(); StringRef Suffix = G.Pattern.suffix(); + if (Suffix.empty() && Prefix.empty()) { + // If both prefix and suffix are empty put into special tree to search by + // substring in a middle. + StringRef Substr = G.Pattern.longest_substr(); + if (!Substr.empty()) { + // But only if substring is not empty. Searching this tree is more + // expensive. + auto &V = SubstrToGlob.emplace(Substr).first->second; + V.emplace_back(&G); + continue; + } + } + auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; auto &V = SToGlob.emplace(reverse(Suffix)).first->second; V.emplace_back(&G); @@ -119,6 +132,25 @@ void SpecialCaseList::GlobMatcher::match( } } } + + if (!SubstrToGlob.empty()) { + // As we don't know when substring exactly starts, we will try all + // possibilities. In most cases search will fail on first characters. + for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) { + for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) { + for (const auto *G : V) { + if (G->Pattern.match(Query)) { + Cb(G->Name, G->LineNo); + // As soon as we find a match in the vector, we can break for this + // vector, since the globs are already sorted by priority within the + // prefix group. However, we continue searching other prefix groups + // in the map, as they may contain a better match overall. + break; + } + } + } + } + } } SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp index 137ff89..f13554f 100644 --- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp +++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp @@ -47,6 +47,8 @@ public: StringRef getPassName() const override { return AARCH64_BRANCH_TARGETS_NAME; } private: + const AArch64Subtarget *Subtarget; + void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump, bool NeedsWinCFI); }; @@ -75,6 +77,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { << "********** Function: " << MF.getName() << '\n'); const Function &F = MF.getFunction(); + Subtarget = &MF.getSubtarget<AArch64Subtarget>(); + // LLVM does not consider basic blocks which are the targets of jump tables // to be address-taken (the address can't escape anywhere else), but they are // used for indirect branches, so need BTI instructions. @@ -100,9 +104,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { // a BTI, and pointing the indirect branch at that. 
For non-ELF targets we // can't rely on that, so we assume that `CouldCall` is _always_ true due // to the risk of long-branch thunks at link time. - if (&MBB == &*MF.begin() && - (!MF.getSubtarget<AArch64Subtarget>().isTargetELF() || - (F.hasAddressTaken() || !F.hasLocalLinkage()))) + if (&MBB == &*MF.begin() && (!Subtarget->isTargetELF() || + (F.hasAddressTaken() || !F.hasLocalLinkage()))) CouldCall = true; // If the block itself is address-taken, it could be indirectly branched @@ -132,9 +135,6 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall, << (CouldCall ? "c" : "") << " to " << MBB.getName() << "\n"); - const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>( - MBB.getParent()->getSubtarget().getInstrInfo()); - unsigned HintNum = 32; if (CouldCall) HintNum |= 2; @@ -162,6 +162,8 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall, MBBI->getOpcode() == AArch64::PACIBSP)) return; + const AArch64InstrInfo *TII = Subtarget->getInstrInfo(); + // Insert BTI exactly at the first executable instruction. const DebugLoc DL = MBB.findDebugLoc(MBBI); MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT)) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 1e607f4..f63981b 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1871,7 +1871,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { } bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); bool Modified = false; for (auto &MBB : MF) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index c76689f..0f7b34c 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -644,10 +644,10 @@ bool AArch64FrameLowering::hasReservedCallFrame( MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); - const AArch64TargetLowering *TLI = - MF.getSubtarget<AArch64Subtarget>().getTargetLowering(); + + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64InstrInfo *TII = Subtarget.getInstrInfo(); + const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo(); DebugLoc DL = I->getDebugLoc(); unsigned Opc = I->getOpcode(); @@ -1319,8 +1319,8 @@ StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF, // TODO: This function currently does not work for scalable vectors. int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const { - const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const AArch64RegisterInfo *RegInfo = + MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); return RegInfo->getLocalAddressRegister(MF) == AArch64::FP ? 
getFPOffset(MF, ObjectOffset).getFixed() @@ -1343,10 +1343,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( TargetStackID::Value StackID, Register &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); - const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed(); int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed(); @@ -1466,7 +1465,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( return FPOffset; } FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() - : (unsigned)AArch64::SP; + : MCRegister(AArch64::SP); return SPOffset; } @@ -1589,8 +1588,8 @@ static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, namespace { struct RegPairInfo { - unsigned Reg1 = AArch64::NoRegister; - unsigned Reg2 = AArch64::NoRegister; + Register Reg1; + Register Reg2; int FrameIdx; int Offset; enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; @@ -1598,21 +1597,21 @@ struct RegPairInfo { RegPairInfo() = default; - bool isPaired() const { return Reg2 != AArch64::NoRegister; } + bool isPaired() const { return Reg2.isValid(); } bool isScalable() const { return Type == PPR || Type == ZPR; } }; } // end anonymous namespace -unsigned findFreePredicateReg(BitVector &SavedRegs) { +MCRegister findFreePredicateReg(BitVector &SavedRegs) { for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) { if (SavedRegs.test(PReg)) { unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0; - return PNReg; + return MCRegister(PNReg); } } - return AArch64::NoRegister; + return MCRegister(); } // The multivector LD/ST are available only for SME or SVE2p1 targets @@ -1930,8 +1929,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } bool PTrueCreated = false; for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) { - unsigned Reg1 = RPI.Reg1; - unsigned Reg2 = RPI.Reg2; + Register Reg1 = RPI.Reg1; + Register Reg2 = RPI.Reg2; unsigned StrOpc; // Issue sequence of spills for cs regs. 
The first spill may be converted @@ -1967,7 +1966,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( break; } - unsigned X0Scratch = AArch64::NoRegister; + Register X0Scratch; auto RestoreX0 = make_scope_exit([&] { if (X0Scratch != AArch64::NoRegister) BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), AArch64::X0) @@ -2009,11 +2008,15 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } } - LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); - if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); - dbgs() << ") -> fi#(" << RPI.FrameIdx; - if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1; - dbgs() << ")\n"); + LLVM_DEBUG({ + dbgs() << "CSR spill: (" << printReg(Reg1, TRI); + if (RPI.isPaired()) + dbgs() << ", " << printReg(Reg2, TRI); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx + 1; + dbgs() << ")\n"; + }); assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) && "Windows unwdinding requires a consecutive (FP,LR) pair"); @@ -2143,8 +2146,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( bool PTrueCreated = false; for (const RegPairInfo &RPI : RegPairs) { - unsigned Reg1 = RPI.Reg1; - unsigned Reg2 = RPI.Reg2; + Register Reg1 = RPI.Reg1; + Register Reg2 = RPI.Reg2; // Issue sequence of restores for cs regs. The last restore may be converted // to a post-increment load later by emitEpilogue if the callee-save stack @@ -2176,11 +2179,15 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( case RegPairInfo::VG: continue; } - LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); - if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); - dbgs() << ") -> fi#(" << RPI.FrameIdx; - if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1; - dbgs() << ")\n"); + LLVM_DEBUG({ + dbgs() << "CSR restore: (" << printReg(Reg1, TRI); + if (RPI.isPaired()) + dbgs() << ", " << printReg(Reg2, TRI); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx + 1; + dbgs() << ")\n"; + }); // Windows unwind codes require consecutive registers if registers are // paired. Make the switch here, so that the code below will save (x,x+1) @@ -2435,8 +2442,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; @@ -2444,9 +2450,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); - unsigned BasePointerReg = RegInfo->hasBasePointer(MF) - ? RegInfo->getBaseRegister() - : (unsigned)AArch64::NoRegister; + MCRegister BasePointerReg = + RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : MCRegister(); unsigned ExtraCSSpill = 0; bool HasUnpairedGPR64 = false; @@ -2456,7 +2461,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // Figure out which callee-saved registers to save/restore. 
for (unsigned i = 0; CSRegs[i]; ++i) { - const unsigned Reg = CSRegs[i]; + const MCRegister Reg = CSRegs[i]; // Add the base pointer register to SavedRegs if it is callee-save. if (Reg == BasePointerReg) @@ -2470,7 +2475,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, } bool RegUsed = SavedRegs.test(Reg); - unsigned PairedReg = AArch64::NoRegister; + MCRegister PairedReg; const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg); if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) || AArch64::FPR128RegClass.contains(Reg)) { @@ -2522,8 +2527,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // Find a suitable predicate register for the multi-vector spill/fill // instructions. - unsigned PnReg = findFreePredicateReg(SavedRegs); - if (PnReg != AArch64::NoRegister) + MCRegister PnReg = findFreePredicateReg(SavedRegs); + if (PnReg.isValid()) AFI->setPredicateRegForFillSpill(PnReg); // If no free callee-save has been found assign one. if (!AFI->getPredicateRegForFillSpill() && @@ -2558,7 +2563,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned PPRCSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (unsigned Reg : SavedRegs.set_bits()) { - auto *RC = TRI->getMinimalPhysRegClass(Reg); + auto *RC = TRI->getMinimalPhysRegClass(MCRegister(Reg)); assert(RC && "expected register class!"); auto SpillSize = TRI->getSpillSize(*RC); bool IsZPR = AArch64::ZPRRegClass.contains(Reg); @@ -2600,7 +2605,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, LLVM_DEBUG({ dbgs() << "*** determineCalleeSaves\nSaved CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) - dbgs() << ' ' << printReg(Reg, RegInfo); + dbgs() << ' ' << printReg(MCRegister(Reg), RegInfo); dbgs() << "\n"; }); diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp index d67182d..03dd1cd 100644 --- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp +++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp @@ -649,7 +649,7 @@ bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) { } bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); bool Modified = false; for (auto &MBB : MF) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 79975b0..5bfb19d9 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -620,7 +620,7 @@ AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { return RC; } -unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } +MCRegister AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 47d76f3..3b0f4f6 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -124,7 +124,7 @@ public: bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; bool 
hasBasePointer(const MachineFunction &MF) const; - unsigned getBaseRegister() const; + MCRegister getBaseRegister() const; bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const override; diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index d695f26..b4a4f4c 100644 --- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -33,6 +33,7 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -49,8 +50,8 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSchedule.h" #include "llvm/Pass.h" -#include <unordered_map> #include <map> +#include <unordered_map> using namespace llvm; @@ -67,7 +68,7 @@ namespace { struct AArch64SIMDInstrOpt : public MachineFunctionPass { static char ID; - const TargetInstrInfo *TII; + const AArch64InstrInfo *TII; MachineRegisterInfo *MRI; TargetSchedModel SchedModel; @@ -694,13 +695,9 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - TII = MF.getSubtarget().getInstrInfo(); MRI = &MF.getRegInfo(); - const TargetSubtargetInfo &ST = MF.getSubtarget(); - const AArch64InstrInfo *AAII = - static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); - if (!AAII) - return false; + const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>(); + TII = ST.getInstrInfo(); SchedModel.init(&ST); if (!SchedModel.hasInstrSchedModel()) return false; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 5c3e26e..4cd51d6 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1114,7 +1114,6 @@ bool AArch64InstPrinter::printSyslAlias(const MCInst *MI, } else return false; - std::string Str; llvm::transform(Name, Name.begin(), ::tolower); O << '\t' << Ins << '\t' << Reg.str() << ", " << Name; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 02c5390..6214f4d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -740,7 +740,7 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { return "r600"; } -static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { +static Reloc::Model getEffectiveRelocModel() { // The AMDGPU toolchain only supports generating shared objects, so we // must always use PIC. 
return Reloc::PIC_; @@ -754,8 +754,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOptLevel OptLevel) : CodeGenTargetMachineImpl( T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options, - getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), + getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small), + OptLevel), TLOF(createTLOF(getTargetTriple())) { initAsmInfo(); if (TT.isAMDGCN()) { diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td index 7453727..b60569e 100644 --- a/llvm/lib/Target/ARM/ARMProcessors.td +++ b/llvm/lib/Target/ARM/ARMProcessors.td @@ -421,6 +421,17 @@ def : ProcessorModel<"cortex-m52", CortexM55Model, [ARMv81mMainline, FeatureMVEVectorCostFactor1, HasMVEFloatOps]>; +def : ProcessorModel<"star-mc3", CortexM55Model, [ARMv81mMainline, + FeatureDSP, + FeatureFPARMv8_D16, + FeatureHasNoBranchPredictor, + FeaturePACBTI, + FeatureUseMISched, + FeaturePreferBranchAlign32, + FeatureHasSlowFPVMLx, + FeatureMVEVectorCostFactor1, + HasMVEFloatOps]>; + def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, FeatureHWDivARM, diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index ca4a655..80c96c6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -1701,6 +1701,43 @@ lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, DAG.getConstant(Imm, DL, GRLenVT)); } +/// Lower VECTOR_SHUFFLE whose result is the reversed source vector. +/// +/// It is possible to do optimization for VECTOR_SHUFFLE performing vector +/// reverse whose mask likes: +/// <7, 6, 5, 4, 3, 2, 1, 0> +/// +/// When undef's appear in the mask they are treated as if they were whatever +/// value is necessary in order to fit the above forms. +static SDValue +lowerVECTOR_SHUFFLE_IsReverse(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { + // Only vectors with i8/i16 elements which cannot match other patterns + // directly needs to do this. + if (VT != MVT::v16i8 && VT != MVT::v8i16 && VT != MVT::v32i8 && + VT != MVT::v16i16) + return SDValue(); + + if (!ShuffleVectorInst::isReverseMask(Mask, Mask.size())) + return SDValue(); + + int WidenNumElts = VT.getVectorNumElements() / 4; + SmallVector<int, 16> WidenMask(WidenNumElts, -1); + for (int i = 0; i < WidenNumElts; ++i) + WidenMask[i] = WidenNumElts - 1 - i; + + MVT WidenVT = MVT::getVectorVT( + VT.getVectorElementType() == MVT::i8 ? MVT::i32 : MVT::i64, WidenNumElts); + SDValue NewV1 = DAG.getBitcast(WidenVT, V1); + SDValue WidenRev = DAG.getVectorShuffle(WidenVT, DL, NewV1, + DAG.getUNDEF(WidenVT), WidenMask); + + return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, + DAG.getBitcast(VT, WidenRev), + DAG.getConstant(27, DL, Subtarget.getGRLenVT())); +} + /// Lower VECTOR_SHUFFLE into VPACKEV (if possible). /// /// VPACKEV interleaves the even elements from each vector. @@ -2004,6 +2041,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; + if ((Result = + lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget))) + return Result; // TODO: This comment may be enabled in the future to better match the // pattern for instruction selection. 
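
The lowerVECTOR_SHUFFLE_IsReverse hunk above splits a full reverse of i8/i16 elements into a reverse of 4x-wider lanes followed by a VSHUF4I swizzle with immediate 27 (0b00011011, i.e. sub-element order 3,2,1,0). A small standalone check of that index decomposition, purely illustrative and independent of the SelectionDAG code:

#include <cassert>

int main() {
  const int NumElts = 16;               // e.g. a v16i8 shuffle
  const int WidenNumElts = NumElts / 4; // viewed as four i32 lanes

  for (int i = 0; i < NumElts; ++i) {
    int Lane = i / 4, Sub = i % 4;
    int RevLane = (WidenNumElts - 1) - Lane; // wide-lane reverse (WidenMask)
    int RevSub = 3 - Sub;                    // VSHUF4I imm 27 reverses within a lane
    // The element starting at index i ends up here after both steps...
    int Result = RevLane * 4 + RevSub;
    // ...which is exactly where a full vector reverse would place it.
    assert(Result == (NumElts - 1) - i);
  }
  return 0;
}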
@@ -2622,6 +2662,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, return Result; if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG, Subtarget))) return Result; + if ((Result = + lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget))) + return Result; // TODO: This comment may be enabled in the future to better match the // pattern for instruction selection. diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index c5e26c1..9de4c9d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -62,8 +62,7 @@ static cl::opt<bool> cl::desc("Enable the merge base offset pass"), cl::init(true), cl::Hidden); -static Reloc::Model getEffectiveRelocModel(const Triple &TT, - std::optional<Reloc::Model> RM) { +static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { return RM.value_or(Reloc::Static); } @@ -92,7 +91,7 @@ LoongArchTargetMachine::LoongArchTargetMachine( const TargetOptions &Options, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, - getEffectiveRelocModel(TT, RM), + getEffectiveRelocModel(RM), getEffectiveLoongArchCodeModel(TT, CM), OL), TLOF(std::make_unique<TargetLoweringObjectFileELF>()) { initAsmInfo(); diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/llvm/lib/Target/M68k/M68kTargetMachine.cpp index 847c27ba..f525d43 100644 --- a/llvm/lib/Target/M68k/M68kTargetMachine.cpp +++ b/llvm/lib/Target/M68k/M68kTargetMachine.cpp @@ -46,13 +46,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kTarget() { namespace { -Reloc::Model getEffectiveRelocModel(const Triple &TT, - std::optional<Reloc::Model> RM) { +Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { // If not defined we default to static - if (!RM.has_value()) - return Reloc::Static; - - return *RM; + return RM.value_or(Reloc::Static); } CodeModel::Model getEffectiveCodeModel(std::optional<CodeModel::Model> CM, @@ -73,7 +69,7 @@ M68kTargetMachine::M68kTargetMachine(const Target &T, const Triple &TT, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, - getEffectiveRelocModel(TT, RM), + getEffectiveRelocModel(RM), ::getEffectiveCodeModel(CM, JIT), OL), TLOF(std::make_unique<M68kELFTargetObjectFile>()), Subtarget(TT, CPU, FS, *this) { diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 598735f..c923f0e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1082,6 +1082,161 @@ let Predicates = [hasPTX<70>, hasSM<80>] in { "mbarrier.pending_count.b64", [(set i32:$res, (int_nvvm_mbarrier_pending_count i64:$state))]>; } + +class MBAR_UTIL<string op, string scope, + string space = "", string sem = "", + bit tl = 0, bit parity = 0> { + // The mbarrier instructions in PTX ISA are of the general form: + // mbarrier.op.semantics.scope.space.b64 arg1, arg2 ... + // where: + // op -> arrive, expect_tx, complete_tx, arrive.expect_tx etc. + // semantics -> acquire, release, relaxed (default depends on the op) + // scope -> cta or cluster (default is cta-scope) + // space -> shared::cta or shared::cluster (default is shared::cta) + // + // The 'semantics' and 'scope' go together. 
If one is specified, + // then the other _must_ be specified. For example: + // (A) mbarrier.arrive <args> (valid, release and cta are default) + // (B) mbarrier.arrive.release.cta <args> (valid, sem/scope mentioned explicitly) + // (C) mbarrier.arrive.release <args> (invalid, needs scope) + // (D) mbarrier.arrive.cta <args> (invalid, needs order) + // + // Wherever possible, we prefer form (A) to (B) since it is available + // from early PTX versions. In most cases, explicitly specifying the + // scope requires a later version of PTX. + string _scope_asm = !cond( + !eq(scope, "scope_cluster") : "cluster", + !eq(scope, "scope_cta") : !if(!empty(sem), "", "cta"), + true : scope); + string _space_asm = !cond( + !eq(space, "space_cta") : "shared", + !eq(space, "space_cluster") : "shared::cluster", + true : space); + + string _parity = !if(parity, "parity", ""); + string asm_str = StrJoin<".", ["mbarrier", op, _parity, + sem, _scope_asm, _space_asm, "b64"]>.ret; + + string _intr_suffix = StrJoin<"_", [!subst(".", "_", op), _parity, + !if(tl, "tl", ""), + sem, scope, space]>.ret; + string intr_name = "int_nvvm_mbarrier_" # _intr_suffix; + + // Predicate checks: + // These are used only for the "test_wait/try_wait" variants as they + // have evolved since sm80 and are complex. The predicates for the + // remaining instructions are straightforward and have already been + // applied directly. + Predicate _sm_pred = !cond(!or( + !eq(op, "try_wait"), + !eq(scope, "scope_cluster"), + !eq(sem, "relaxed")) : hasSM<90>, + true : hasSM<80>); + Predicate _ptx_pred = !cond( + !eq(sem, "relaxed") : hasPTX<86>, + !ne(_scope_asm, "") : hasPTX<80>, + !eq(op, "try_wait") : hasPTX<78>, + parity : hasPTX<71>, + true : hasPTX<70>); + list<Predicate> preds = [_ptx_pred, _sm_pred]; +} + +foreach op = ["expect_tx", "complete_tx"] in { + foreach scope = ["scope_cta", "scope_cluster"] in { + foreach space = ["space_cta", "space_cluster"] in { + defvar intr = !cast<Intrinsic>(MBAR_UTIL<op, scope, space>.intr_name); + defvar suffix = StrJoin<"_", [op, scope, space]>.ret; + def mbar_ # suffix : BasicNVPTXInst<(outs), (ins ADDR:$addr, B32:$tx_count), + MBAR_UTIL<op, scope, space, "relaxed">.asm_str, + [(intr addr:$addr, i32:$tx_count)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + } // space + } // scope +} // op + +multiclass MBAR_ARR_INTR<string op, string scope, string sem, + list<Predicate> pred = []> { + // When either of sem or scope is non-default, both have to + // be explicitly specified. So, explicitly state that + // sem is `release` when scope is `cluster`. 
+ defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")), + "release", sem); + + defvar asm_cta = MBAR_UTIL<op, scope, "space_cta", asm_sem>.asm_str; + defvar intr_cta = !cast<Intrinsic>(MBAR_UTIL<op, scope, + "space_cta", sem>.intr_name); + + defvar asm_cluster = MBAR_UTIL<op, scope, "space_cluster", asm_sem>.asm_str; + defvar intr_cluster = !cast<Intrinsic>(MBAR_UTIL<op, scope, + "space_cluster", sem>.intr_name); + + def _CTA : NVPTXInst<(outs B64:$state), + (ins ADDR:$addr, B32:$tx_count), + asm_cta # " $state, [$addr], $tx_count;", + [(set i64:$state, (intr_cta addr:$addr, i32:$tx_count))]>, + Requires<pred>; + def _CLUSTER : NVPTXInst<(outs), + (ins ADDR:$addr, B32:$tx_count), + asm_cluster # " _, [$addr], $tx_count;", + [(intr_cluster addr:$addr, i32:$tx_count)]>, + Requires<pred>; +} +foreach op = ["arrive", "arrive.expect_tx", + "arrive_drop", "arrive_drop.expect_tx"] in { + foreach scope = ["scope_cta", "scope_cluster"] in { + defvar suffix = !subst(".", "_", op) # scope; + defm mbar_ # suffix # _release : MBAR_ARR_INTR<op, scope, "", [hasPTX<80>, hasSM<90>]>; + defm mbar_ # suffix # _relaxed : MBAR_ARR_INTR<op, scope, "relaxed", [hasPTX<86>, hasSM<90>]>; + } // scope +} // op + +multiclass MBAR_WAIT_INTR<string op, string scope, string sem, bit time_limit> { + // When either of sem or scope is non-default, both have to + // be explicitly specified. So, explicitly state that the + // semantics is `acquire` when the scope is `cluster`. + defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")), + "acquire", sem); + + defvar asm_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem, + time_limit, 1>.asm_str; + defvar pred_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem, + time_limit, 1>.preds; + defvar intr_parity = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta", + sem, time_limit, 1>.intr_name); + + defvar asm_state = MBAR_UTIL<op, scope, "space_cta", asm_sem, + time_limit>.asm_str; + defvar pred_state = MBAR_UTIL<op, scope, "space_cta", asm_sem, + time_limit>.preds; + defvar intr_state = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta", + sem, time_limit>.intr_name); + + defvar ins_tl_dag = !if(time_limit, (ins B32:$tl), (ins)); + defvar tl_suffix = !if(time_limit, ", $tl;", ";"); + defvar intr_state_dag = !con((intr_state addr:$addr, i64:$state), + !if(time_limit, (intr_state i32:$tl), (intr_state))); + defvar intr_parity_dag = !con((intr_parity addr:$addr, i32:$phase), + !if(time_limit, (intr_parity i32:$tl), (intr_parity))); + + def _STATE : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B64:$state), ins_tl_dag), + asm_state # " $res, [$addr], $state" # tl_suffix, + [(set i1:$res, intr_state_dag)]>, + Requires<pred_state>; + def _PARITY : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B32:$phase), ins_tl_dag), + asm_parity # " $res, [$addr], $phase" # tl_suffix, + [(set i1:$res, intr_parity_dag)]>, + Requires<pred_parity>; +} +foreach op = ["test_wait", "try_wait"] in { + foreach scope = ["scope_cta", "scope_cluster"] in { + foreach time_limit = !if(!eq(op, "try_wait"), [true, false], [false]) in { + defvar suffix = StrJoin<"_", [op, scope, !if(time_limit, "tl", "")]>.ret; + defm mbar_ # suffix # "_acquire" : MBAR_WAIT_INTR<op, scope, "", time_limit>; + defm mbar_ # suffix # "_relaxed" : MBAR_WAIT_INTR<op, scope, "relaxed", time_limit>; + } // time_limit + } // scope +} // op + //----------------------------------- // Math Functions //----------------------------------- diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp 
b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 53633ea..8198173 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -92,6 +92,8 @@ private: void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, MachineIRBuilder &MIB) const; bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; + bool selectIntrinsicWithSideEffects(MachineInstr &I, + MachineIRBuilder &MIB) const; ComplexRendererFns selectShiftMask(MachineOperand &Root, unsigned ShiftWidth) const; @@ -714,6 +716,88 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { return GenericOpc; } +bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( + MachineInstr &I, MachineIRBuilder &MIB) const { + // Find the intrinsic ID. + unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID(); + // Select the instruction. + switch (IntrinID) { + default: + return false; + case Intrinsic::riscv_vlm: + case Intrinsic::riscv_vle: + case Intrinsic::riscv_vle_mask: + case Intrinsic::riscv_vlse: + case Intrinsic::riscv_vlse_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vle_mask || + IntrinID == Intrinsic::riscv_vlse_mask; + bool IsStrided = IntrinID == Intrinsic::riscv_vlse || + IntrinID == Intrinsic::riscv_vlse_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + SrcOps.push_back(Register(RISCV::NoRegister)); + } + + // Base Pointer + auto PtrReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PtrReg); + + // Stride + if (IsStrided) { + auto StrideReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(StrideReg); + } + + // Mask + if (IsMasked) { + auto MaskReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(MaskReg); + } + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + const RISCV::VLEPseudo *P = + RISCV::getVLEPseudo(IsMasked, IsStrided, /*FF*/ false, Log2SEW, + static_cast<unsigned>(LMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } + } +} + bool RISCVInstructionSelector::select(MachineInstr &MI) { MachineIRBuilder MIB(MI); @@ -984,6 +1068,8 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI); } + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + return selectIntrinsicWithSideEffects(MI, MIB); default: return false; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index f81b1e12..ae54ff1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -141,8 +141,7 @@ extern "C" 
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVAsmPrinterPass(*PR); } -static Reloc::Model getEffectiveRelocModel(const Triple &TT, - std::optional<Reloc::Model> RM) { +static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { return RM.value_or(Reloc::Static); } @@ -154,7 +153,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, CodeGenOptLevel OL, bool JIT) : CodeGenTargetMachineImpl( T, TT.computeDataLayout(Options.MCOptions.getABIName()), TT, CPU, FS, - Options, getEffectiveRelocModel(TT, RM), + Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique<RISCVELFTargetObjectFile>()) { initAsmInfo(); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index a9c638c..621640c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -127,16 +127,11 @@ LLVMInitializeWebAssemblyTarget() { // WebAssembly Lowering public interface. //===----------------------------------------------------------------------===// -static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM, - const Triple &TT) { - if (!RM) { - // Default to static relocation model. This should always be more optimial - // than PIC since the static linker can determine all global addresses and - // assume direct function calls. - return Reloc::Static; - } - - return *RM; +static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { + // Default to static relocation model. This should always be more optimial + // than PIC since the static linker can determine all global addresses and + // assume direct function calls. + return RM.value_or(Reloc::Static); } using WebAssembly::WasmEnableEH; @@ -197,7 +192,7 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine( const TargetOptions &Options, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, FS, Options, - getEffectiveRelocModel(RM, TT), + getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Large), OL), TLOF(new WebAssemblyTargetObjectFile()), UsesMultivalueABI(Options.MCOptions.getABIName() == "experimental-mv") { diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 6065575..c8d1938 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -369,6 +369,7 @@ getHostCPUNameForARMFromComponents(StringRef Implementer, StringRef Hardware, if (Implementer == "0x63") { // Arm China. 
return StringSwitch<const char *>(Part) .Case("0x132", "star-mc1") + .Case("0xd25", "star-mc3") .Default("generic"); } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 6d16599..5048561 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -1044,15 +1044,13 @@ struct AAPointerInfoImpl return AAPointerInfo::manifest(A); } - virtual const_bin_iterator begin() const override { return State::begin(); } - virtual const_bin_iterator end() const override { return State::end(); } - virtual int64_t numOffsetBins() const override { - return State::numOffsetBins(); - } - virtual bool reachesReturn() const override { + const_bin_iterator begin() const override { return State::begin(); } + const_bin_iterator end() const override { return State::end(); } + int64_t numOffsetBins() const override { return State::numOffsetBins(); } + bool reachesReturn() const override { return !ReturnedOffsets.isUnassigned(); } - virtual void addReturnedOffsetsTo(OffsetInfo &OI) const override { + void addReturnedOffsetsTo(OffsetInfo &OI) const override { if (ReturnedOffsets.isUnknown()) { OI.setUnknown(); return; @@ -6653,7 +6651,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack { AAHeapToStackFunction(const IRPosition &IRP, Attributor &A) : AAHeapToStack(IRP, A) {} - ~AAHeapToStackFunction() { + ~AAHeapToStackFunction() override { // Ensure we call the destructor so we release any memory allocated in the // sets. for (auto &It : AllocationInfos) @@ -8374,7 +8372,7 @@ struct AAMemoryLocationImpl : public AAMemoryLocation { AccessKind2Accesses.fill(nullptr); } - ~AAMemoryLocationImpl() { + ~AAMemoryLocationImpl() override { // The AccessSets are allocated via a BumpPtrAllocator, we call // the destructor manually. for (AccessSet *AS : AccessKind2Accesses) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 5e2247f..d7eb745 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2693,7 +2693,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A) : AAExecutionDomain(IRP, A) {} - ~AAExecutionDomainFunction() { delete RPOT; } + ~AAExecutionDomainFunction() override { delete RPOT; } void initialize(Attributor &A) override { Function *F = getAnchorScope(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index ede73f8..9c75d9a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -72,7 +72,7 @@ public: : InstCombiner(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI, BPI, PSI, DL, RPOT) {} - virtual ~InstCombinerImpl() = default; + ~InstCombinerImpl() override = default; /// Perform early cleanup and prepare the InstCombine worklist. 
   bool prepareWorklist(Function &F);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9c8de45..67f837c 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3358,21 +3358,21 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
 
     if (TyAllocSize == 1) {
       // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X)) to (bitcast Y),
-      // but only if the result pointer is only used as if it were an integer,
-      // or both point to the same underlying object (otherwise provenance is
-      // not necessarily retained).
+      // but only if the result pointer is only used as if it were an integer.
+      // (The case where the underlying object is the same is handled by
+      // InstSimplify.)
       Value *X = GEP.getPointerOperand();
       Value *Y;
-      if (match(GEP.getOperand(1),
-                m_Sub(m_PtrToInt(m_Value(Y)), m_PtrToInt(m_Specific(X)))) &&
+      if (match(GEP.getOperand(1), m_Sub(m_PtrToIntOrAddr(m_Value(Y)),
+                                         m_PtrToIntOrAddr(m_Specific(X)))) &&
           GEPType == Y->getType()) {
-        bool HasSameUnderlyingObject =
-            getUnderlyingObject(X) == getUnderlyingObject(Y);
+        bool HasNonAddressBits =
+            DL.getAddressSizeInBits(AS) != DL.getPointerSizeInBits(AS);
         bool Changed = false;
         GEP.replaceUsesWithIf(Y, [&](Use &U) {
-          bool ShouldReplace = HasSameUnderlyingObject ||
-                               isa<ICmpInst>(U.getUser()) ||
-                               isa<PtrToIntInst>(U.getUser());
+          bool ShouldReplace = isa<PtrToAddrInst>(U.getUser()) ||
+                               (!HasNonAddressBits &&
+                                isa<ICmpInst, PtrToIntInst>(U.getUser()));
           Changed |= ShouldReplace;
           return ShouldReplace;
         });
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index d831c27..c537be5c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -7551,6 +7551,7 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
 /// log2(C)-indexed value table (instead of traditionally emitting a load of the
 /// address of the jump target, and indirectly jump to it).
 static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
+                                        DomTreeUpdater *DTU,
                                         const DataLayout &DL,
                                         const TargetTransformInfo &TTI) {
   Value *Condition = SI->getCondition();
@@ -7573,12 +7574,6 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
   if (SI->getNumCases() < 4)
     return false;
 
-  // We perform this optimization only for switches with
-  // unreachable default case.
-  // This assumption will save us from checking if `Condition` is a power of two.
-  if (!SI->defaultDestUnreachable())
-    return false;
-
   // Check that switch cases are powers of two.
   SmallVector<uint64_t, 4> Values;
   for (const auto &Case : SI->cases()) {
@@ -7598,6 +7593,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
 
   Builder.SetInsertPoint(SI);
 
+  if (!SI->defaultDestUnreachable()) {
+    // Let non-power-of-two inputs jump to the default case, when the latter is
+    // reachable.
+    auto *PopC = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, Condition);
+    auto *IsPow2 = Builder.CreateICmpEQ(PopC, ConstantInt::get(CondTy, 1));
+
+    auto *OrigBB = SI->getParent();
+    auto *DefaultCaseBB = SI->getDefaultDest();
+    BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
+    auto It = OrigBB->getTerminator()->getIterator();
+    BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+    It->eraseFromParent();
+
+    addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB);
+    if (DTU)
+      DTU->applyUpdates({{DominatorTree::Insert, OrigBB, DefaultCaseBB}});
+  }
+
   // Replace each case with its trailing zeros number.
   for (auto &Case : SI->cases()) {
     auto *OrigValue = Case.getCaseValue();
@@ -7953,7 +7966,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
                             Options.ConvertSwitchToLookupTable))
       return requestResimplify();
 
-  if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI))
+  if (simplifySwitchOfPowersOfTwo(SI, Builder, DTU, DL, TTI))
     return requestResimplify();
 
   if (reduceSwitchRange(SI, Builder, DL, TTI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3356516..facb0fa 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4378,8 +4378,21 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
   const SCEV *TC =
       vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
   assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
-  RemainingIterations =
-      SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+  const SCEV *KnownMinTC;
+  bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
+  // Use versions of TC and VF in which both are either scalable or fixed.
+  if (ScalableTC == MainLoopVF.isScalable())
+    RemainingIterations =
+        SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+  else if (ScalableTC) {
+    const SCEV *EstimatedTC = SE.getMulExpr(
+        KnownMinTC,
+        SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1)));
+    RemainingIterations = SE.getURemExpr(
+        EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
+  } else
+    RemainingIterations =
+        SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
 
   // No iterations left to process in the epilogue.
   if (RemainingIterations->isZero())
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cdb9e7e..4fcaf6d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17641,12 +17641,28 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
                   [](Value *V) {
                     return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                   })) ||
-      all_of(E->Scalars, [&](Value *V) {
-        return isa<PoisonValue>(V) ||
-               (E->Idx == 0 && isa<InsertElementInst>(V)) ||
-               E->isCopyableElement(V) ||
-               (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
-      }))
+      (all_of(E->Scalars,
+              [&](Value *V) {
+                return isa<PoisonValue>(V) ||
+                       (E->Idx == 0 && isa<InsertElementInst>(V)) ||
+                       E->isCopyableElement(V) ||
+                       (!isVectorLikeInstWithConstOps(V) &&
+                        isUsedOutsideBlock(V));
+              }) &&
+       (!E->doesNotNeedToSchedule() ||
+        any_of(E->Scalars,
+               [&](Value *V) {
+                 if (!isa<Instruction>(V) ||
+                     (E->hasCopyableElements() && E->isCopyableElement(V)))
+                   return false;
+                 return !areAllOperandsNonInsts(V);
+               }) ||
+        none_of(E->Scalars, [&](Value *V) {
+          if (!isa<Instruction>(V) ||
+              (E->hasCopyableElements() && E->isCopyableElement(V)))
+            return false;
+          return MustGather.contains(V);
+        }))))
     Res = FindLastInst();
   else
     Res = FindFirstInst();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2591df8..5b9f005 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -398,7 +398,7 @@ public:
                DebugLoc DL = DebugLoc::getUnknown())
       : VPDef(SC), VPUser(Operands), DL(DL) {}
 
-  virtual ~VPRecipeBase() = default;
+  ~VPRecipeBase() override = default;
 
   /// Clone the current recipe.
   virtual VPRecipeBase *clone() = 0;
@@ -576,7 +576,7 @@ public:
     return R && classof(R);
   }
 
-  virtual VPSingleDefRecipe *clone() override = 0;
+  VPSingleDefRecipe *clone() override = 0;
 
   /// Returns the underlying instruction.
   Instruction *getUnderlyingInstr() {
@@ -907,7 +907,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
     return R && classof(R);
  }
 
-  virtual VPRecipeWithIRFlags *clone() override = 0;
+  VPRecipeWithIRFlags *clone() override = 0;
 
   static inline bool classof(const VPSingleDefRecipe *U) {
     auto *R = dyn_cast<VPRecipeBase>(U);
@@ -2068,7 +2068,7 @@ public:
     return classof(static_cast<const VPRecipeBase *>(R));
   }
 
-  virtual void execute(VPTransformState &State) override = 0;
+  void execute(VPTransformState &State) override = 0;
 
   /// Returns the step value of the induction.
   VPValue *getStepValue() { return getOperand(1); }
@@ -2557,7 +2557,7 @@ public:
                       VPCostContext &Ctx) const override;
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+  bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
 
   /// Returns the number of stored operands of this interleave group. Returns 0
  /// for load interleave groups.
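
Note on the RISCV and WebAssembly target-machine hunks above: both drop an unused Triple parameter and collapse the explicit "if (!RM) return Reloc::Static; return *RM;" pattern into std::optional::value_or. A minimal standalone sketch of the idiom, with hypothetical names rather than LLVM code:

#include <cassert>
#include <optional>

enum class RelocModel { Static, PIC };

// Stand-in for the simplified getEffectiveRelocModel() helpers: if no model
// was requested, fall back to static relocation.
static RelocModel effectiveRelocModel(std::optional<RelocModel> RM) {
  return RM.value_or(RelocModel::Static);
}

int main() {
  assert(effectiveRelocModel(std::nullopt) == RelocModel::Static);
  assert(effectiveRelocModel(RelocModel::PIC) == RelocModel::PIC);
  return 0;
}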
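
The destructor and pure-virtual hunks in AttributorAttributes.cpp, OpenMPOpt.cpp, InstCombineInternal.h and VPlan.h all apply the same cleanup: a member that carries override does not need the redundant virtual keyword, and an overriding destructor is better spelled with override than with virtual. A minimal sketch of the idiom with illustrative types, not LLVM code:

struct Recipe {
  virtual ~Recipe() = default; // The base class declares the virtual dtor once.
  virtual Recipe *clone() const = 0;
};

struct WidenRecipe : Recipe {
  // Before: virtual ~WidenRecipe() = default;
  // `override` already implies `virtual`, and the compiler now diagnoses the
  // declaration if it ever stops overriding a base-class member.
  ~WidenRecipe() override = default;
  Recipe *clone() const override { return new WidenRecipe(*this); }
};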
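
In the InstructionCombining.cpp hunk, the rewrite of (gep i8* X, (ptrtoint Y)-(ptrtoint X)) users no longer relies on getUnderlyingObject; it instead checks whether pointers in the address space carry non-address bits. As an illustrative, made-up data layout: if pointers in AS are 128 bits wide but only 64 of those bits are address, getAddressSizeInBits(AS) differs from getPointerSizeInBits(AS), so only ptrtoaddr users are replaced with Y; when the two sizes match, icmp and ptrtoint users are rewritten as well.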
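
The SimplifyCFG change extends simplifySwitchOfPowersOfTwo to switches whose default destination is reachable: it emits a ctpop(Condition) == 1 guard, splits the block, and routes non-power-of-two inputs to the original default before the switch is rewritten to index by trailing-zero count. A rough source-level analogue of the resulting control flow, using hypothetical values and GCC/Clang builtins (an illustration, not code from the patch):

unsigned lookup_before(unsigned x) {
  switch (x) {              // case values are powers of two
  case 1:  return 10;
  case 2:  return 20;
  case 4:  return 30;
  case 8:  return 40;
  default: return 0;        // reachable default
  }
}

unsigned lookup_after(unsigned x) {
  if (__builtin_popcount(x) != 1) // new guard: non-powers-of-two take the
    return 0;                     // original default edge
  switch (__builtin_ctz(x)) {     // switch on log2(x) instead of x
  case 0:  return 10;
  case 1:  return 20;
  case 2:  return 30;
  case 3:  return 40;
  default: return 0;              // still taken by powers of two >= 16
  }
}

The DomTreeUpdater parameter threaded through simplifySwitch() is what keeps the dominator tree consistent across the new block split.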
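
The LoopVectorize.cpp hunk only changes how RemainingIterations is estimated when the trip count and the main loop VF disagree on scalability. With made-up numbers for illustration: for TC = 512 * vscale, a tuning value of vscale = 2, a fixed main VF of 8 and IC = 2, the estimated trip count is 1024 and RemainingIterations = 1024 urem (8 * 2) = 0, so the "no iterations left" check below would treat the epilogue as having nothing to process.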
