Diffstat (limited to 'llvm/lib')
23 files changed, 408 insertions, 228 deletions
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index a0fe7f9..22229d9 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -320,8 +320,12 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) { return CaptureComponents::None; case Instruction::Store: // Stored the pointer - conservatively assume it may be captured. + if (U.getOperandNo() == 0) + return MDNode::toCaptureComponents( + I->getMetadata(LLVMContext::MD_captures)); + // Volatile stores make the address observable. - if (U.getOperandNo() == 0 || cast<StoreInst>(I)->isVolatile()) + if (cast<StoreInst>(I)->isVolatile()) return CaptureComponents::All; return CaptureComponents::None; case Instruction::AtomicRMW: { diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 6fb2807..0e5bc48 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1632,19 +1632,25 @@ LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, *getValueFromCondition(Usr->getOperand(0), Condition, isTrueDest, /*UseBlockValue*/ false); - if (!OpLatticeVal.isConstantRange()) - return OpLatticeVal; + if (OpLatticeVal.isConstantRange()) { + const unsigned ResultBitWidth = + Usr->getType()->getScalarSizeInBits(); + if (auto *Trunc = dyn_cast<TruncInst>(Usr)) + return ValueLatticeElement::getRange( + OpLatticeVal.getConstantRange().truncate( + ResultBitWidth, Trunc->getNoWrapKind())); - const unsigned ResultBitWidth = - Usr->getType()->getScalarSizeInBits(); - if (auto *Trunc = dyn_cast<TruncInst>(Usr)) return ValueLatticeElement::getRange( - OpLatticeVal.getConstantRange().truncate( - ResultBitWidth, Trunc->getNoWrapKind())); - - return ValueLatticeElement::getRange( - OpLatticeVal.getConstantRange().castOp( - cast<CastInst>(Usr)->getOpcode(), ResultBitWidth)); + OpLatticeVal.getConstantRange().castOp( + cast<CastInst>(Usr)->getOpcode(), ResultBitWidth)); + } + if (OpLatticeVal.isConstant()) { + Constant *C = OpLatticeVal.getConstant(); + if (auto *CastC = ConstantFoldCastOperand( + cast<CastInst>(Usr)->getOpcode(), C, Usr->getType(), DL)) + return ValueLatticeElement::get(CastC); + } + return ValueLatticeElement::getOverdefined(); } else { // If one of Val's operand has an inferred value, we may be able to // infer the value of Val. 
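For context on the LazyValueInfo hunk above: when the operand's lattice value is a single Constant rather than a constant range, the new branch folds it through the user's cast and otherwise falls back to overdefined. The following is a minimal standalone sketch of just that constant case; the helper name is hypothetical and this is not code from the patch.

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Hypothetical helper mirroring the new constant branch in getEdgeValueLocal().
static ValueLatticeElement castConstantLattice(const ValueLatticeElement &Op,
                                               const CastInst *CI,
                                               const DataLayout &DL) {
  if (!Op.isConstant())
    return ValueLatticeElement::getOverdefined();
  // Fold e.g. trunc/zext/sext of a plain Constant; give up (overdefined) if
  // the constant cannot be folded through this cast.
  if (Constant *Folded = ConstantFoldCastOperand(
          CI->getOpcode(), Op.getConstant(), CI->getType(), DL))
    return ValueLatticeElement::get(Folded);
  return ValueLatticeElement::getOverdefined();
}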
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index 09ac0f1..f794780 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -599,8 +599,7 @@ Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromMachOObject_arm64( } static Error applyPACSigningToModInitPointers(LinkGraph &G) { - assert(G.getTargetTriple().getSubArch() == Triple::AArch64SubArch_arm64e && - "PAC signing only valid for arm64e"); + assert(G.getTargetTriple().isArm64e() && "PAC signing only valid for arm64e"); if (auto *ModInitSec = G.findSectionByName("__DATA,__mod_init_func")) { for (auto *B : ModInitSec->blocks()) { diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 9cfb0ff..1add0c7 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ModRef.h" #include <cassert> #include <cstddef> #include <cstdint> @@ -1435,6 +1436,40 @@ MDNode *MDNode::getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B) { return B; } +CaptureComponents MDNode::toCaptureComponents(const MDNode *MD) { + if (!MD) + return CaptureComponents::All; + + CaptureComponents CC = CaptureComponents::None; + for (Metadata *Op : MD->operands()) { + CaptureComponents Component = + StringSwitch<CaptureComponents>(cast<MDString>(Op)->getString()) + .Case("address", CaptureComponents::Address) + .Case("address_is_null", CaptureComponents::AddressIsNull) + .Case("provenance", CaptureComponents::Provenance) + .Case("read_provenance", CaptureComponents::ReadProvenance); + CC |= Component; + } + return CC; +} + +MDNode *MDNode::fromCaptureComponents(LLVMContext &Ctx, CaptureComponents CC) { + assert(!capturesNothing(CC) && "Can't encode captures(none)"); + if (capturesAll(CC)) + return nullptr; + + SmallVector<Metadata *> Components; + if (capturesAddressIsNullOnly(CC)) + Components.push_back(MDString::get(Ctx, "address_is_null")); + else if (capturesAddress(CC)) + Components.push_back(MDString::get(Ctx, "address")); + if (capturesReadProvenanceOnly(CC)) + Components.push_back(MDString::get(Ctx, "read_provenance")); + else if (capturesFullProvenance(CC)) + Components.push_back(MDString::get(Ctx, "provenance")); + return MDNode::get(Ctx, Components); +} + //===----------------------------------------------------------------------===// // NamedMDNode implementation. 
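As a usage note for the MDNode helpers added in the Metadata.cpp hunk above, here is a minimal sketch of how a capture set round-trips through !captures metadata. The function name is hypothetical and purely illustrative; only fromCaptureComponents/toCaptureComponents come from the patch.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/ModRef.h"
#include <cassert>

using namespace llvm;

static void capturesRoundTrip() {
  LLVMContext Ctx;
  // Encode "only the address may be captured" as a !captures node; the node
  // carries a single MDString operand, "address".
  MDNode *MD = MDNode::fromCaptureComponents(Ctx, CaptureComponents::Address);
  // Decoding recovers the same components; a null node decodes to All,
  // i.e. no useful information.
  CaptureComponents CC = MDNode::toCaptureComponents(MD);
  assert(CC == CaptureComponents::Address);
  (void)CC;
}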
// diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 8c03d6f..6b3cd27 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -542,6 +542,7 @@ private: void visitAliasScopeMetadata(const MDNode *MD); void visitAliasScopeListMetadata(const MDNode *MD); void visitAccessGroupMetadata(const MDNode *MD); + void visitCapturesMetadata(Instruction &I, const MDNode *Captures); template <class Ty> bool isValidMetadataArray(const MDTuple &N); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N); @@ -5373,6 +5374,27 @@ void Verifier::visitAccessGroupMetadata(const MDNode *MD) { } } +void Verifier::visitCapturesMetadata(Instruction &I, const MDNode *Captures) { + static const char *ValidArgs[] = {"address_is_null", "address", + "read_provenance", "provenance"}; + + auto *SI = dyn_cast<StoreInst>(&I); + Check(SI, "!captures metadata can only be applied to store instructions", &I); + Check(SI->getValueOperand()->getType()->isPointerTy(), + "!captures metadata can only be applied to store with value operand of " + "pointer type", + &I); + Check(Captures->getNumOperands() != 0, "!captures metadata cannot be empty", + &I); + + for (Metadata *Op : Captures->operands()) { + auto *Str = dyn_cast<MDString>(Op); + Check(Str, "!captures metadata must be a list of strings", &I); + Check(is_contained(ValidArgs, Str->getString()), + "invalid entry in !captures metadata", &I, Str); + } +} + /// verifyInstruction - Verify that an instruction is well formed. /// void Verifier::visitInstruction(Instruction &I) { @@ -5600,6 +5622,9 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *Annotation = I.getMetadata(LLVMContext::MD_annotation)) visitAnnotationMetadata(Annotation); + if (MDNode *Captures = I.getMetadata(LLVMContext::MD_captures)) + visitCapturesMetadata(I, Captures); + if (MDNode *N = I.getDebugLoc().getAsMDNode()) { CheckDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N, AreDebugLocsAllowed::Yes); diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp index a6a9628a..0dd378e 100644 --- a/llvm/lib/Object/OffloadBundle.cpp +++ b/llvm/lib/Object/OffloadBundle.cpp @@ -120,15 +120,14 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset, if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle) return errorCodeToError(object_error::parse_failed); - std::unique_ptr<OffloadBundleFatBin> TheBundle( - new OffloadBundleFatBin(Buf, FileName)); + OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName); // Read the Bundle Entries Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset); if (Err) return Err; - return TheBundle; + return std::unique_ptr<OffloadBundleFatBin>(TheBundle); } Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) { diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 6275e5e..47860c0 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -329,6 +329,36 @@ struct Tag { size_t StartPosition = StringRef::npos; }; +[[maybe_unused]] static const char *tagKindToString(Tag::Kind K) { + switch (K) { + case Tag::Kind::None: + return "None"; + case Tag::Kind::Normal: + return "Normal"; + case Tag::Kind::Triple: + return "Triple"; + } + llvm_unreachable("Unknown Tag::Kind"); +} + +[[maybe_unused]] static const char *jsonKindToString(json::Value::Kind K) { + switch (K) { + case json::Value::Kind::Null: + return "JSON_KIND_NULL"; + case 
json::Value::Kind::Boolean: + return "JSON_KIND_BOOLEAN"; + case json::Value::Kind::Number: + return "JSON_KIND_NUMBER"; + case json::Value::Kind::String: + return "JSON_KIND_STRING"; + case json::Value::Kind::Array: + return "JSON_KIND_ARRAY"; + case json::Value::Kind::Object: + return "JSON_KIND_OBJECT"; + } + llvm_unreachable("Unknown json::Value::Kind"); +} + static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, StringRef Close) { const StringLiteral TripleOpen("{{{"); @@ -373,11 +403,10 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, static std::optional<std::pair<StringRef, StringRef>> processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { - LLVM_DEBUG(dbgs() << " Found tag: \"" << T.FullMatch << "\", Content: \"" - << T.Content << "\"\n"); + LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content + << ", Kind: " << tagKindToString(T.TagKind) << "\n"); if (T.TagKind == Tag::Kind::Triple) { Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&'); - LLVM_DEBUG(dbgs() << " Created UnescapeVariable token.\n"); return std::nullopt; } StringRef Interpolated = T.Content; @@ -385,7 +414,6 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { if (!Interpolated.trim().starts_with("=")) { char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); Tokens.emplace_back(RawBody, Interpolated.str(), Front); - LLVM_DEBUG(dbgs() << " Created tag token of type '" << Front << "'\n"); return std::nullopt; } Tokens.emplace_back(RawBody, Interpolated.str(), '='); @@ -395,8 +423,8 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { DelimSpec = DelimSpec.trim(); std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << " Found Set Delimiter tag. NewOpen='" << Ret.first - << "', NewClose='" << Ret.second << "'\n"); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first + << ", NewClose: " << Ret.second << "\n"); return Ret; } @@ -405,15 +433,15 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) { // but we don't support that here. An unescape variable // is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template) { - LLVM_DEBUG(dbgs() << "Tokenizing template: \"" << Template << "\"\n"); + LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); size_t Start = 0; while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "Loop start. 
Start=" << Start << ", Open='" << Open - << "', Close='" << Close << "'\n"); + LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open + << "', Close:'" << Close << "'\n"); Tag T = findNextTag(Template, Start, Open, Close); if (T.TagKind == Tag::Kind::None) { @@ -428,7 +456,6 @@ static SmallVector<Token> tokenize(StringRef Template) { if (T.StartPosition > Start) { StringRef Text = Template.substr(Start, T.StartPosition - Start); Tokens.emplace_back(Text.str()); - LLVM_DEBUG(dbgs() << " Created Text token: \"" << Text << "\"\n"); } if (auto NewDelims = processTag(T, Tokens)) { @@ -479,7 +506,6 @@ static SmallVector<Token> tokenize(StringRef Template) { if ((!HasTextBehind && !HasTextAhead) || (!HasTextBehind && Idx == LastIdx)) stripTokenBefore(Tokens, Idx, CurrentToken, CurrentType); } - LLVM_DEBUG(dbgs() << "Tokenizing finished.\n"); return Tokens; } @@ -545,8 +571,8 @@ protected: Indent.resize(Indentation, ' '); for (char C : Data) { - LLVM_DEBUG(dbgs() << "IndentationStream: NeedsIndent=" << NeedsIndent - << ", C='" << C << "', Indentation=" << Indentation + LLVM_DEBUG(dbgs() << "[Indentation Stream] NeedsIndent:" << NeedsIndent + << ", C:'" << C << "', Indentation:" << Indentation << "\n"); if (NeedsIndent && C != '\n') { WrappedStream << Indent; @@ -654,7 +680,9 @@ void Parser::parseMustache(ASTNode *Parent) { } } static void toMustacheString(const json::Value &Data, raw_ostream &OS) { - LLVM_DEBUG(dbgs() << "toMustacheString: kind=" << (int)Data.kind() << "\n"); + LLVM_DEBUG(dbgs() << "[To Mustache String] Kind: " + << jsonKindToString(Data.kind()) << ", Data: " << Data + << "\n"); switch (Data.kind()) { case json::Value::Null: return; @@ -667,7 +695,6 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) { } case json::Value::String: { auto Str = *Data.getAsString(); - LLVM_DEBUG(dbgs() << " --> writing string: \"" << Str << "\"\n"); OS << Str.str(); return; } @@ -696,8 +723,8 @@ void ASTNode::renderText(MustacheOutputStream &OS) { OS << Body; } void ASTNode::renderPartial(const json::Value &CurrentCtx, MustacheOutputStream &OS) { - LLVM_DEBUG(dbgs() << "renderPartial: Accessor=" << AccessorValue[0] - << ", Indentation=" << Indentation << "\n"); + LLVM_DEBUG(dbgs() << "[Render Partial] Accessor:" << AccessorValue[0] + << ", Indentation:" << Indentation << "\n"); auto Partial = Ctx.Partials.find(AccessorValue[0]); if (Partial != Ctx.Partials.end()) renderPartial(CurrentCtx, OS, Partial->getValue().get()); @@ -716,13 +743,12 @@ void ASTNode::renderVariable(const json::Value &CurrentCtx, void ASTNode::renderUnescapeVariable(const json::Value &CurrentCtx, MustacheOutputStream &OS) { - LLVM_DEBUG(dbgs() << "renderUnescapeVariable: Accessor=" << AccessorValue[0] + LLVM_DEBUG(dbgs() << "[Render UnescapeVariable] Accessor:" << AccessorValue[0] << "\n"); auto Lambda = Ctx.Lambdas.find(AccessorValue[0]); if (Lambda != Ctx.Lambdas.end()) { renderLambdas(CurrentCtx, OS, Lambda->getValue()); } else if (const json::Value *ContextPtr = findContext()) { - LLVM_DEBUG(dbgs() << " --> Found context value, writing to stream.\n"); OS.suspendIndentation(); toMustacheString(*ContextPtr, OS); OS.resumeIndentation(); @@ -792,8 +818,6 @@ void ASTNode::render(const llvm::json::Value &Data, MustacheOutputStream &OS) { } const json::Value *ASTNode::findContext() { - LLVM_DEBUG(dbgs() << "findContext: AccessorValue[0]=" << AccessorValue[0] - << "\n"); // The mustache spec allows for dot notation to access nested values // a single dot refers to the current context. 
// We attempt to find the JSON context in the current node, if it is not @@ -808,22 +832,12 @@ const json::Value *ASTNode::findContext() { StringRef CurrentAccessor = AccessorValue[0]; ASTNode *CurrentParent = Parent; - LLVM_DEBUG(dbgs() << "findContext: ParentContext: "; - if (ParentContext) ParentContext->print(dbgs()); - else dbgs() << "nullptr"; dbgs() << "\n"); - while (!CurrentContext || !CurrentContext->get(CurrentAccessor)) { - LLVM_DEBUG(dbgs() << "findContext: climbing parent\n"); if (CurrentParent->Ty != Root) { CurrentContext = CurrentParent->ParentContext->getAsObject(); CurrentParent = CurrentParent->Parent; - LLVM_DEBUG(dbgs() << "findContext: new ParentContext: "; - if (CurrentParent->ParentContext) - CurrentParent->ParentContext->print(dbgs()); - else dbgs() << "nullptr"; dbgs() << "\n"); continue; } - LLVM_DEBUG(dbgs() << "findContext: reached root, not found\n"); return nullptr; } const json::Value *Context = nullptr; @@ -839,9 +853,6 @@ const json::Value *ASTNode::findContext() { Context = CurrentValue; } } - LLVM_DEBUG(dbgs() << "findContext: found value: "; - if (Context) Context->print(dbgs()); else dbgs() << "nullptr"; - dbgs() << "\n"); return Context; } @@ -853,8 +864,7 @@ void ASTNode::renderChild(const json::Value &Contexts, void ASTNode::renderPartial(const json::Value &Contexts, MustacheOutputStream &OS, ASTNode *Partial) { - LLVM_DEBUG(dbgs() << "renderPartial (helper): Indentation=" << Indentation - << "\n"); + LLVM_DEBUG(dbgs() << "[Render Partial Indentation] Indentation: " << Indentation << "\n"); AddIndentationStringStream IS(OS, Indentation); Partial->render(Contexts, IS); } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f291191..3f9a1f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -495,13 +495,6 @@ public: bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool run(MachineFunction &MF); - bool isForceEmitWaitcnt() const { - for (auto T : inst_counter_types()) - if (ForceEmitWaitcnt[T]) - return true; - return false; - } - void setForceEmitWaitcnt() { // For non-debug builds, ForceEmitWaitcnt has been initialized to false; // For debug builds, get the debug counter info and adjust if need be @@ -570,10 +563,6 @@ public: return VmemReadMapping[getVmemType(Inst)]; } - bool hasXcnt() const { return ST->hasWaitXCnt(); } - - bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; - bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -591,7 +580,6 @@ public: WaitcntBrackets &ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); - static bool asynchronouslyWritesSCC(unsigned Opcode); }; // This objects maintains the current score brackets of each wait counter, and @@ -1109,7 +1097,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(FIRST_LDS_VGPR, T, CurrScore); } - if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) { + if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { setRegScore(SCC, T, CurrScore); PendingSCCWrite = &Inst; } @@ -1831,12 +1819,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( return Modified; } -static bool readsVCCZ(const MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && - 
!MI.getOperand(1).isUndef(); -} - /// \returns true if the callee inserts an s_waitcnt 0 on function entry. static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { // Currently all conventions wait, but this may not always be the case. @@ -1871,26 +1853,24 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, assert(!MI.isMetaInstruction()); AMDGPU::Waitcnt Wait; + const unsigned Opc = MI.getOpcode(); // FIXME: This should have already been handled by the memory legalizer. // Removing this currently doesn't affect any lit tests, but we need to // verify that nothing was relying on this. The number of buffer invalidates // being handled here should not be expanded. - if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || - MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || - MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { + if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC || + Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV || + Opc == AMDGPU::BUFFER_GL1_INV) { Wait.LoadCnt = 0; } // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. - if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || - MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return || + if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN || + Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || + Opc == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } @@ -1902,8 +1882,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // send a message to explicitly release all VGPRs before the stores have // completed, but it is only safe to do this if there are no outstanding // scratch stores. - else if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) { if (!WCG->isOptNone() && (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() || (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && @@ -1912,8 +1891,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ReleaseVGPRInsts.insert(&MI); } // Resolve vm waits before gs-done. - else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || - MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && + else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) && ST->hasLegacyGeometry() && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { @@ -1938,7 +1916,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Wait for any pending GDS instruction to complete before any // "Always GDS" instruction. - if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS()) + if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS()) addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait()); if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { @@ -1964,7 +1942,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait); } } - } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) { + } else if (Opc == AMDGPU::S_BARRIER_WAIT) { ScoreBrackets.tryClearSCCWriteEvent(&MI); } else { // FIXME: Should not be relying on memoperands. 
@@ -2061,7 +2039,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); } - if (hasXcnt() && Op.isDef()) + if (ST->hasWaitXCnt() && Op.isDef()) ScoreBrackets.determineWait(X_CNT, Interval, Wait); } } @@ -2079,18 +2057,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // // In all other cases, ensure safety by ensuring that there are no outstanding // memory operations. - if (MI.getOpcode() == AMDGPU::S_BARRIER && - !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { + if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && + !ST->supportsBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { - if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - Wait.DsCnt = 0; - } + if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + Wait.DsCnt = 0; } // Verify that the wait is actually needed. @@ -2165,19 +2142,19 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, } // XCnt may be already consumed by a load wait. - if (Wait.KmCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.XCnt != ~0u) { + if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) + Wait.XCnt = ~0u; - if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) + Wait.XCnt = ~0u; - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory dependency - // with another VMEM instruction in flight. - if (Wait.XCnt != ~0u && isVmemAccess(*It)) - Wait.XCnt = ~0u; + // Since the translation for VMEM addresses occur in-order, we can skip the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (isVmemAccess(*It)) + Wait.XCnt = ~0u; + } if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; @@ -2185,75 +2162,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, return Modified; } -// This is a flat memory operation. Check to see if it has memory tokens other -// than LDS. Other address spaces supported by flat memory operations involve -// global memory. -bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // All flat instructions use the VMEM counter except prefetch. - if (!TII->usesVM_CNT(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access VMEM. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves VMEM. - // Flat operations only supported FLAT, LOCAL (LDS), or address spaces - // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION - // (GDS) address space is not supported by flat operations. Therefore, simply - // return true unless only the LDS address space is found. 
- for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - assert(AS != AMDGPUAS::REGION_ADDRESS); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - return true; - } - - return false; -} - -// This is a flat memory operation. Check to see if it has memory tokens for -// either LDS or FLAT. -bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. - if (!TII->usesLGKM_CNT(MI)) - return false; - - // If in tgsplit mode then there can be no use of LDS. - if (ST->isTgSplitEnabled()) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access LDS. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves LDS. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) - return true; - } - - return false; -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { - return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || + return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); } -static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) { - auto Opc = Inst.getOpcode(); - return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || - Opc == AMDGPU::GLOBAL_WBINV; -} - // Return true if the next instruction is S_ENDPGM, following fallthrough // blocks if necessary. bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It, @@ -2331,7 +2244,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } } else if (TII->isFLAT(Inst)) { - if (isGFX12CacheInvOrWBInst(Inst)) { + if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) { ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); return; @@ -2341,14 +2254,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, int FlatASCount = 0; - if (mayAccessVMEMThroughFlat(Inst)) { + if (TII->mayAccessVMEMThroughFlat(Inst)) { ++FlatASCount; IsVMEMAccess = true; ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); } - if (mayAccessLDSThroughFlat(Inst)) { + if (TII->mayAccessLDSThroughFlat(Inst)) { ++FlatASCount; ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } @@ -2394,7 +2307,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); else ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); - } else if (asynchronouslyWritesSCC(Inst.getOpcode())) { + } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst); } else { switch (Inst.getOpcode()) { @@ -2413,7 +2326,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } - if (!hasXcnt()) + if (!ST->hasWaitXCnt()) return; if (IsVMEMAccess) @@ -2478,9 +2391,8 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE); if (!OldEventsHasSCCWrite) { PendingSCCWrite = Other.PendingSCCWrite; - } else { - if (PendingSCCWrite != Other.PendingSCCWrite) - PendingSCCWrite = nullptr; + } else if (PendingSCCWrite != Other.PendingSCCWrite) { 
+ PendingSCCWrite = nullptr; } } } @@ -2516,12 +2428,6 @@ static bool isWaitInstr(MachineInstr &Inst) { counterTypeForInstr(Opcode).has_value(); } -bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) { - return Opcode == AMDGPU::S_BARRIER_LEAVE || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; -} - // Generate s_waitcnt instructions where needed. bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, @@ -2578,7 +2484,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. - bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst); + bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst); // Don't examine operands unless we need to track vccz correctness. if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { @@ -2701,7 +2607,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush( bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { if (SIInstrInfo::isFLAT(MI)) - return mayAccessVMEMThroughFlat(MI); + return TII->mayAccessVMEMThroughFlat(MI); return SIInstrInfo::isVMEM(MI); } @@ -2724,11 +2630,10 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { - if (MI.mayLoad()) - HasVMemLoad = true; - if (MI.mayStore()) - HasVMemStore = true; + HasVMemLoad |= MI.mayLoad(); + HasVMemStore |= MI.mayStore(); } + for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 044ea86..56435a5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { }); } +bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // All flat instructions use the VMEM counter except prefetch. + if (!usesVM_CNT(MI)) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access VMEM. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves VMEM. + // Flat operations only supported FLAT, LOCAL (LDS), or address spaces + // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION + // (GDS) address space is not supported by flat operations. Therefore, simply + // return true unless only the LDS address space is found. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + assert(AS != AMDGPUAS::REGION_ADDRESS); + if (AS != AMDGPUAS::LOCAL_ADDRESS) + return true; + } + + return false; +} + +bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. + if (!usesLGKM_CNT(MI)) + return false; + + // If in tgsplit mode then there can be no use of LDS. + if (ST.isTgSplitEnabled()) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access LDS. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves LDS. 
+  for (const MachineMemOperand *Memop : MI.memoperands()) { +    unsigned AS = Memop->getAddrSpace(); +    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) +      return true; +  } + +  return false; +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c2252af..754f52a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -688,6 +688,12 @@ public: /// to not hit scratch. bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + /// \returns true for FLAT instructions that can access VMEM. + bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; + + /// \returns true for FLAT instructions that can access LDS. + bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: @@ -748,6 +754,18 @@ public: return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; } + static bool isSBarrierSCCWrite(unsigned Opcode) { + return Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; + } + + static bool isCBranchVCCZRead(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && + !MI.getOperand(1).isUndef(); + } + static bool isWQM(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::WQM; } @@ -1010,6 +1028,11 @@ public: Opcode == AMDGPU::DS_GWS_BARRIER; } + static bool isGFX12CacheInvOrWBInst(unsigned Opc) { + return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || + Opc == AMDGPU::GLOBAL_WBINV; + } + static bool isF16PseudoScalarTrans(unsigned Opcode) { return Opcode == AMDGPU::V_S_EXP_F16_e64 || Opcode == AMDGPU::V_S_LOG_F16_e64 || diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 3329bea..58bc338 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -225,7 +225,11 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { (isTargetDarwin() || DM == DenormalMode::getPreserveSign())) HasNEONForFP = true; - if (isRWPI()) + const ARM::ArchKind Arch = ARM::parseArch(TargetTriple.getArchName()); + if (isRWPI() || + (isTargetIOS() && + (Arch == ARM::ArchKind::ARMV6K || Arch == ARM::ArchKind::ARMV6) && + TargetTriple.isOSVersionLT(3, 0))) ReserveR9 = true; // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2 diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 70b6c7e..1e6b04f8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3793,6 +3793,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, return false; // Operands 1 and 2 are commutable, if we switch the opcode. 
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + case RISCV::QC_SELECTIEQ: + case RISCV::QC_SELECTINE: + case RISCV::QC_SELECTIIEQ: + case RISCV::QC_SELECTIINE: + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); case RISCV::QC_MVEQ: case RISCV::QC_MVNE: case RISCV::QC_MVLT: @@ -4018,6 +4023,11 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, OpIdx2); } + case RISCV::QC_SELECTIEQ: + case RISCV::QC_SELECTINE: + case RISCV::QC_SELECTIIEQ: + case RISCV::QC_SELECTIINE: + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); case RISCV::QC_MVEQ: case RISCV::QC_MVNE: case RISCV::QC_MVLT: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 4eb9a3be..d998316 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -345,7 +345,7 @@ defset list<VTypeInfo> AllVectors = { } } - defset list<VTypeInfo> AllFloatAndBFloatVectors = { + defset list<VTypeInfo> AllFloatAndBF16Vectors = { defset list<VTypeInfo> AllFloatVectors = { defset list<VTypeInfo> NoGroupFloatVectors = { defset list<VTypeInfo> FractionalGroupFloatVectors = { @@ -382,16 +382,16 @@ defset list<VTypeInfo> AllVectors = { } } - defset list<VTypeInfo> AllBFloatVectors = { - defset list<VTypeInfo> NoGroupBFloatVectors = { - defset list<VTypeInfo> FractionalGroupBFloatVectors = { + defset list<VTypeInfo> AllBF16Vectors = { + defset list<VTypeInfo> NoGroupBF16Vectors = { + defset list<VTypeInfo> FractionalGroupBF16Vectors = { def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, V_MF4, bf16, FPR16>; def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, V_MF2, bf16, FPR16>; } def VBF16M1: VTypeInfo<vbfloat16m1_t, vbool16_t, 16, V_M1, bf16, FPR16>; } - defset list<GroupVTypeInfo> GroupBFloatVectors = { + defset list<GroupVTypeInfo> GroupBF16Vectors = { def VBF16M2: GroupVTypeInfo<vbfloat16m2_t, vbfloat16m1_t, vbool8_t, 16, V_M2, bf16, FPR16>; def VBF16M4: GroupVTypeInfo<vbfloat16m4_t, vbfloat16m1_t, vbool4_t, 16, @@ -542,7 +542,7 @@ defset list<VTypeInfoToWide> AllWidenableIntToFloatVectors = { def : VTypeInfoToWide<VI32M4, VF64M8>; } -defset list<VTypeInfoToWide> AllWidenableBFloatToFloatVectors = { +defset list<VTypeInfoToWide> AllWidenableBF16ToFloatVectors = { def : VTypeInfoToWide<VBF16MF4, VF32MF2>; def : VTypeInfoToWide<VBF16MF2, VF32M1>; def : VTypeInfoToWide<VBF16M1, VF32M2>; @@ -5870,7 +5870,7 @@ multiclass VPatConversionWF_VF<string intrinsic, string instruction, multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction, bit isSEWAware = 0> { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; @@ -5977,7 +5977,7 @@ multiclass VPatConversionVF_WF_RTZ<string intrinsic, string instruction, multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction, bit isSEWAware = 0> { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates, @@ -7154,7 +7154,7 @@ defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">; // We can use vmerge.vvm to support vector-vector vfmerge. 
// NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses // int_riscv_vmerge. Support both for compatibility. -foreach vti = AllFloatAndBFloatVectors in { +foreach vti = AllFloatAndBF16Vectors in { let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM", vti.Vector, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index dc61361..139ff92 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -1388,7 +1388,7 @@ defm : VPatFPSetCCSDNode_VV_VF_FV<SETOLE, "PseudoVMFLE", "PseudoVMFGE">; // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. Vector Floating-Point Merge Instruction -foreach fvti = AllFloatAndBFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo<fvti>.Vti; let Predicates = GetVTypePredicates<ivti>.Predicates in { def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 1511f1b..cf904ea 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2426,7 +2426,7 @@ foreach vti = AllFloatVectors in { // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. Vector Floating-Point Merge Instruction -foreach fvti = AllFloatAndBFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo<fvti>.Vti; let Predicates = GetVTypePredicates<ivti>.Predicates in { def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), @@ -2770,7 +2770,7 @@ foreach vti = NoGroupFloatVectors in { } } -foreach vti = AllFloatAndBFloatVectors in { +foreach vti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo<vti>.Vti; let Predicates = GetVTypePredicates<ivti>.Predicates in { def : Pat<(vti.Vector diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 9835c03..b683e89 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -560,7 +560,7 @@ multiclass VPseudoVNCVT_BF16_S { } multiclass VPatConversionS_BF16<string intrinsic, string instruction> { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVendorXAndesVBFHCvt] in @@ -572,7 +572,7 @@ multiclass VPatConversionS_BF16<string intrinsic, string instruction> { } multiclass VPatConversionBF16_S<string intrinsic, string instruction> { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVendorXAndesVBFHCvt] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index b546339..557d873 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -770,7 +770,7 @@ multiclass VPatVQMACCQOQ<string intrinsic, string instruction, string kind> : VPatVMACC<intrinsic, instruction, kind, VQMACCQOQInfoPairs, vint8m1_t>; multiclass VPatVFWMACC<string intrinsic, string instruction, string kind> - : VPatVMACC<intrinsic, 
instruction, kind, AllWidenableBFloatToFloatVectors, + : VPatVMACC<intrinsic, instruction, kind, AllWidenableBF16ToFloatVectors, vbfloat16m1_t>; defset list<VTypeInfoToWide> VFNRCLIPInfoPairs = { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index ff4a040..5407868 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -524,7 +524,7 @@ class QCIRVInstRI<bits<1> funct1, DAGOperand InTyImm11, let Inst{30-20} = imm11; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCISELECTIICC<bits<3> funct3, string opcodestr> : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2), @@ -537,7 +537,7 @@ class QCISELECTIICC<bits<3> funct3, string opcodestr> let rs2 = simm1; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCISELECTICC<bits<3> funct3, string opcodestr> : RVInstR4<0b01, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm2), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index 6d8672b..0be9eab 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -53,7 +53,7 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w", "PseudoVFNCVTBF16_F_F", isSEWAware=1>; - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVInstructionsBF16Minimal] in @@ -91,9 +91,9 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { let Predicates = [HasStdExtZvfbfwma] in { defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmaccbf16", "PseudoVFWMACCBF16", - AllWidenableBFloatToFloatVectors, isSEWAware=1>; + AllWidenableBF16ToFloatVectors, isSEWAware=1>; defm : VPatWidenFPMulAccVL_VV_VF_RM<riscv_vfwmadd_vl, "PseudoVFWMACCBF16", - AllWidenableBFloatToFloatVectors>; + AllWidenableBF16ToFloatVectors>; defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16", - AllWidenableBFloatToFloatVectors>; + AllWidenableBF16ToFloatVectors>; } diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 95f8a87..17a7948 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -347,16 +347,58 @@ defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll, TunePostRAScheduler]; def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, - !listconcat(RVA22U64Features, - [FeatureStdExtZifencei, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), + FeatureUnalignedVectorMem], SiFiveP400TuneFeatures>; def SIFIVE_P470 : 
RISCVProcessorModel<"sifive-p470", SiFiveP400Model, - !listconcat(RVA22U64Features, - [FeatureStdExtV, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtV, FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureStdExtZvl128b, @@ -368,7 +410,7 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model, FeatureVendorXSiFivecdiscarddlone, FeatureVendorXSiFivecflushdlone, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), + FeatureUnalignedVectorMem], !listconcat(SiFiveP400TuneFeatures, [TuneNoSinkSplatOperands, TuneVXRMPipelineFlush])>; @@ -397,8 +439,29 @@ def SIFIVE_P550 : RISCVProcessorModel<"sifive-p550", SiFiveP500Model, } def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, - !listconcat(RVA22U64Features, - [FeatureStdExtV, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtV, FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureStdExtZvl128b, @@ -408,7 +471,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, FeatureStdExtZvksc, FeatureStdExtZvksg, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), + FeatureUnalignedVectorMem], [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index f88d51f..99c4982 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1680,7 +1680,9 @@ processGlobal(GlobalValue &GV, /// FastCC. 
static void ChangeCalleesToFastCall(Function *F) { for (User *U : F->users()) - cast<CallBase>(U)->setCallingConv(CallingConv::Fast); + if (auto *Call = dyn_cast<CallBase>(U)) + if (Call->getCalledOperand() == F) + Call->setCallingConv(CallingConv::Fast); } static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs, @@ -1766,10 +1768,12 @@ isValidCandidateForColdCC(Function &F, return false; for (User *U : F.users()) { - CallBase &CB = cast<CallBase>(*U); - Function *CallerFunc = CB.getParent()->getParent(); + CallBase *CB = dyn_cast<CallBase>(U); + if (!CB || CB->getCalledOperand() != &F) + continue; + Function *CallerFunc = CB->getParent()->getParent(); BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc); - if (!isColdCallSite(CB, CallerBFI)) + if (!isColdCallSite(*CB, CallerBFI)) return false; if (!llvm::is_contained(AllCallsCold, CallerFunc)) return false; @@ -1779,7 +1783,9 @@ isValidCandidateForColdCC(Function &F, static void changeCallSitesToColdCC(Function *F) { for (User *U : F->users()) - cast<CallBase>(U)->setCallingConv(CallingConv::Cold); + if (auto *Call = dyn_cast<CallBase>(U)) + if (Call->getCalledOperand() == F) + Call->setCallingConv(CallingConv::Cold); } // This function iterates over all the call instructions in the input Function diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 8fbaf68..ff063f9 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -5169,6 +5169,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { // - or: pick -1 // - select's condition: if the true value is constant, choose it by making // the condition true. + // - phi: pick the common constant across operands // - default: pick 0 // // Note that this transform is intentionally done here rather than @@ -5179,9 +5180,32 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { // TODO: This could use getBinopAbsorber() / getBinopIdentity() to avoid // duplicating logic for binops at least. auto getUndefReplacement = [&](Type *Ty) { - Value *BestValue = nullptr; + auto pickCommonConstantFromPHI = [](PHINode &PN) -> Value * { + // phi(freeze(undef), C, C). Choose C for freeze so the PHI can be + // removed. 
+ Constant *BestValue = nullptr; + for (Value *V : PN.incoming_values()) { + if (match(V, m_Freeze(m_Undef()))) + continue; + + Constant *C = dyn_cast<Constant>(V); + if (!C) + return nullptr; + + if (!isGuaranteedNotToBeUndefOrPoison(C)) + return nullptr; + + if (BestValue && BestValue != C) + return nullptr; + + BestValue = C; + } + return BestValue; + }; + Value *NullValue = Constant::getNullValue(Ty); - for (const auto *U : I.users()) { + Value *BestValue = nullptr; + for (auto *U : I.users()) { Value *V = NullValue; if (match(U, m_Or(m_Value(), m_Value()))) V = ConstantInt::getAllOnesValue(Ty); @@ -5190,6 +5214,9 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { else if (match(U, m_c_Select(m_Specific(&I), m_Value(V)))) { if (!isGuaranteedNotToBeUndefOrPoison(V, &AC, &I, &DT)) V = NullValue; + } else if (auto *PHI = dyn_cast<PHINode>(U)) { + if (Value *MaybeV = pickCommonConstantFromPHI(*PHI)) + V = MaybeV; } if (!BestValue) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 123881e..21b2652 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3025,6 +3025,12 @@ static void combineMetadata(Instruction *K, const Instruction *J, // Preserve !nosanitize if both K and J have it. K->setMetadata(Kind, JMD); break; + case LLVMContext::MD_captures: + K->setMetadata( + Kind, MDNode::fromCaptureComponents( + K->getContext(), MDNode::toCaptureComponents(JMD) | + MDNode::toCaptureComponents(KMD))); + break; } } // Set !invariant.group from J if J has it. If both instructions have it |
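A closing note on the Local.cpp hunk: when two stores are combined, the new MD_captures case keeps the union of both capture sets, which is the conservative choice. Below is a rough standalone sketch of that merge step; the function name is hypothetical, and the real logic lives inside combineMetadata() as shown above.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/ModRef.h"

using namespace llvm;

// Hypothetical helper: merge the !captures nodes of two stores being combined.
static MDNode *mergeCapturesMetadata(LLVMContext &Ctx, MDNode *JMD,
                                     MDNode *KMD) {
  // A missing node decodes to CaptureComponents::All, so the union is always
  // a safe over-approximation for the surviving store.
  CaptureComponents Merged =
      MDNode::toCaptureComponents(JMD) | MDNode::toCaptureComponents(KMD);
  // fromCaptureComponents() returns nullptr for All, which simply drops the
  // metadata when nothing better can be said.
  return MDNode::fromCaptureComponents(Ctx, Merged);
}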