Diffstat (limited to 'llvm/lib')
105 files changed, 1954 insertions, 1011 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 2d52f34..dd98b62 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2679,11 +2679,12 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::nvvm_round_ftz_f: case Intrinsic::nvvm_round_f: case Intrinsic::nvvm_round_d: { - // Use APFloat implementation instead of native libm call, as some - // implementations (e.g. on PPC) do not preserve the sign of negative 0. + // nvvm_round is lowered to PTX cvt.rni, which rounds to the nearest + // integer, choosing the even integer if the source is equidistant between + // two integers, so the semantics are closer to "rint" than to "round". bool IsFTZ = nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID); auto V = IsFTZ ? FTZPreserveSign(APF) : APF; - V.roundToIntegral(APFloat::rmNearestTiesToAway); + V.roundToIntegral(APFloat::rmNearestTiesToEven); return ConstantFP::get(Ty->getContext(), V); } diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 393f264..6fc81d787 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -342,7 +342,7 @@ bool llvm::isDereferenceableAndAlignedInLoop( : SE.getConstantMaxBackedgeTakenCount(L); } const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess( - L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr); + L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, &DT, AC); if (isa<SCEVCouldNotCompute>(AccessStart) || isa<SCEVCouldNotCompute>(AccessEnd)) return false; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 14be385..a553533 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -23,6 +23,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/AssumeBundleQueries.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h"
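For reference, the behavioral difference in the ConstantFolding.cpp change above is visible exactly at half-way values. A minimal standalone sketch, assuming only the public llvm::APFloat API (nothing from this patch):

#include "llvm/ADT/APFloat.h"
#include <cassert>

void roundingModeDemo() {
  using llvm::APFloat;
  APFloat A(2.5), B(2.5);
  // cvt.rni / "rint" semantics: ties go to the even integer, so 2.5 -> 2.0.
  (void)A.roundToIntegral(APFloat::rmNearestTiesToEven);
  // libm "round" semantics: ties go away from zero, so 2.5 -> 3.0.
  (void)B.roundToIntegral(APFloat::rmNearestTiesToAway);
  assert(A.convertToDouble() == 2.0 && B.convertToDouble() == 3.0);
}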
@@ -208,28 +210,46 @@ static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B, /// Return true if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at /// \p MaxBTC is guaranteed inbounds of the accessed object. -static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, - const SCEV *MaxBTC, - const SCEV *EltSize, - ScalarEvolution &SE, - const DataLayout &DL) { +static bool +evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, + const SCEV *MaxBTC, const SCEV *EltSize, + ScalarEvolution &SE, const DataLayout &DL, + DominatorTree *DT, AssumptionCache *AC) { auto *PointerBase = SE.getPointerBase(AR->getStart()); auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase); if (!StartPtr) return false; + const Loop *L = AR->getLoop(); bool CheckForNonNull, CheckForFreed; - uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes( + Value *StartPtrV = StartPtr->getValue(); + uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes( DL, CheckForNonNull, CheckForFreed); - if (CheckForNonNull || CheckForFreed) + if (DerefBytes && (CheckForNonNull || CheckForFreed)) return false; const SCEV *Step = AR->getStepRecurrence(SE); + Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType()); + const SCEV *DerefBytesSCEV = SE.getConstant(WiderTy, DerefBytes); + + // Check if we have a suitable dereferenceable assumption we can use. + if (!StartPtrV->canBeFreed()) { + RetainedKnowledge DerefRK = getKnowledgeValidInContext( + StartPtrV, {Attribute::Dereferenceable}, *AC, + L->getLoopPredecessor()->getTerminator(), DT); + if (DerefRK) { + DerefBytesSCEV = SE.getUMaxExpr( + DerefBytesSCEV, SE.getConstant(WiderTy, DerefRK.ArgValue)); + } + } + + if (DerefBytesSCEV->isZero()) + return false; + bool IsKnownNonNegative = SE.isKnownNonNegative(Step); if (!IsKnownNonNegative && !SE.isKnownNegative(Step)) return false; - Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType()); Step = SE.getNoopOrSignExtend(Step, WiderTy); MaxBTC = SE.getNoopOrZeroExtend(MaxBTC, WiderTy); @@ -256,8 +276,7 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE); if (!EndBytes) return false; - return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, - SE.getConstant(WiderTy, DerefBytes)); + return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV); } // For negative steps check if @@ -265,15 +284,15 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, // * StartOffset <= DerefBytes. assert(SE.isKnownNegative(Step) && "must be known negative"); return SE.isKnownPredicate(CmpInst::ICMP_SGE, StartOffset, OffsetEndBytes) && - SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset, - SE.getConstant(WiderTy, DerefBytes)); + SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset, DerefBytesSCEV); } std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess( const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE, DenseMap<std::pair<const SCEV *, Type *>, - std::pair<const SCEV *, const SCEV *>> *PointerBounds) { + std::pair<const SCEV *, const SCEV *>> *PointerBounds, + DominatorTree *DT, AssumptionCache *AC) { std::pair<const SCEV *, const SCEV *> *PtrBoundsPair; if (PointerBounds) { auto [Iter, Ins] = PointerBounds->insert( @@ -308,8 +327,8 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess( // sets ScEnd to the maximum unsigned value for the type. Note that LAA // separately checks that accesses cannot wrap, so unsigned max // represents an upper bound.
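The positive-step case of the check above reduces to a byte-range comparison: the last byte accessed at iteration MaxBTC must fall inside the known dereferenceable prefix of the object. A simplified sketch with plain integers (illustrative names, not the LAA API; the real code performs this on SCEV expressions and folds in the umax of the assumption-derived byte count):

#include <cstdint>
#include <optional>

// Mirrors the shape of the positive-step bound check: bail out on overflow,
// as addSCEVNoOverflow/mulSCEVOverflow do, otherwise compare the end of the
// access range against the dereferenceable byte count.
std::optional<bool> accessStaysInBounds(uint64_t StartOffset, uint64_t Step,
                                        uint64_t MaxBTC, uint64_t EltSize,
                                        uint64_t DerefBytes) {
  uint64_t End;
  if (__builtin_mul_overflow(MaxBTC, Step, &End) ||
      __builtin_add_overflow(End, EltSize, &End) ||
      __builtin_add_overflow(End, StartOffset, &End))
    return std::nullopt; // cannot prove anything
  return End <= DerefBytes;
}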
- if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, - DL)) { + if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL, + DT, AC)) { ScEnd = AR->evaluateAtIteration(MaxBTC, *SE); } else { ScEnd = SE->getAddExpr( @@ -356,9 +375,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, bool NeedsFreeze) { const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); const SCEV *BTC = PSE.getBackedgeTakenCount(); - const auto &[ScStart, ScEnd] = - getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, - PSE.getSE(), &DC.getPointerBounds()); + const auto &[ScStart, ScEnd] = getStartAndEndForAccess( + Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(), + &DC.getPointerBounds(), DC.getDT(), DC.getAC()); assert(!isa<SCEVCouldNotCompute>(ScStart) && !isa<SCEVCouldNotCompute>(ScEnd) && "must be able to compute both start and end expressions"); @@ -1961,13 +1980,15 @@ bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src, const SCEV *BTC = PSE.getBackedgeTakenCount(); const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); ScalarEvolution &SE = *PSE.getSE(); - const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess( - InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds); + const auto &[SrcStart_, SrcEnd_] = + getStartAndEndForAccess(InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC, + &SE, &PointerBounds, DT, AC); if (isa<SCEVCouldNotCompute>(SrcStart_) || isa<SCEVCouldNotCompute>(SrcEnd_)) return false; - const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess( - InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds); + const auto &[SinkStart_, SinkEnd_] = + getStartAndEndForAccess(InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC, + &SE, &PointerBounds, DT, AC); if (isa<SCEVCouldNotCompute>(SinkStart_) || isa<SCEVCouldNotCompute>(SinkEnd_)) return false; @@ -3002,7 +3023,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI, - bool AllowPartial) + AssumptionCache *AC, bool AllowPartial) : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)), PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) { unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max(); @@ -3012,8 +3033,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, MaxTargetVectorWidthInBits = TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2; - DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides, - MaxTargetVectorWidthInBits); + DepChecker = std::make_unique<MemoryDepChecker>( + *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits); PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE); if (canAnalyzeLoop()) CanVecMem = analyzeLoop(AA, LI, TLI, DT); @@ -3082,7 +3103,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L, // or if it was created with a different value of AllowPartial. 
if (Inserted || It->second->hasAllowPartial() != AllowPartial) It->second = std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT, - &LI, AllowPartial); + &LI, AC, AllowPartial); return *It->second; } @@ -3125,7 +3146,8 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, auto &LI = FAM.getResult<LoopAnalysis>(F); auto &TTI = FAM.getResult<TargetIRAnalysis>(F); auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); - return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI); + auto &AC = FAM.getResult<AssumptionAnalysis>(F); + return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC); } AnalysisKey LoopAccessAnalysis::Key; diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 9d5c39c..c6fa8f4 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -3676,8 +3676,8 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) { TopCand.SU = nullptr; BotCand.SU = nullptr; - TopCluster = nullptr; - BotCluster = nullptr; + TopClusterID = InvalidClusterId; + BotClusterID = InvalidClusterId; } /// Initialize the per-region scheduling policy. @@ -3988,10 +3988,14 @@ bool GenericScheduler::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? 
TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; @@ -4251,24 +4255,30 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) { void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { if (IsTopNode) { SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); - TopCluster = DAG->getCluster(SU->ParentClusterIdx); - LLVM_DEBUG(if (TopCluster) { - dbgs() << " Top Cluster: "; - for (auto *N : *TopCluster) - dbgs() << N->NodeNum << '\t'; - dbgs() << '\n'; + TopClusterID = SU->ParentClusterIdx; + LLVM_DEBUG({ + if (TopClusterID != InvalidClusterId) { + ClusterInfo *TopCluster = DAG->getCluster(TopClusterID); + dbgs() << " Top Cluster: "; + for (auto *N : *TopCluster) + dbgs() << N->NodeNum << '\t'; + dbgs() << '\n'; + } }); Top.bumpNode(SU); if (SU->hasPhysRegUses) reschedulePhysReg(SU, true); } else { SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); - BotCluster = DAG->getCluster(SU->ParentClusterIdx); - LLVM_DEBUG(if (BotCluster) { - dbgs() << " Bot Cluster: "; - for (auto *N : *BotCluster) - dbgs() << N->NodeNum << '\t'; - dbgs() << '\n'; + BotClusterID = SU->ParentClusterIdx; + LLVM_DEBUG({ + if (BotClusterID != InvalidClusterId) { + ClusterInfo *BotCluster = DAG->getCluster(BotClusterID); + dbgs() << " Bot Cluster: "; + for (auto *N : *BotCluster) + dbgs() << N->NodeNum << '\t'; + dbgs() << '\n'; + } }); Bot.bumpNode(SU); if (SU->hasPhysRegDefs) @@ -4306,8 +4316,8 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { if (!Bot.HazardRec) { Bot.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG); } - TopCluster = nullptr; - BotCluster = nullptr; + TopClusterID = InvalidClusterId; + BotClusterID = InvalidClusterId; } void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, @@ -4373,10 +4383,14 @@ bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand, return TryCand.Reason != NoCand; // Keep clustered nodes together. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; // Avoid critical resource consumption and balance the schedule. 
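Both tryCandidate hunks above replace a membership test on a cached ClusterInfo set with a comparison of cluster IDs. A plausible model of the predicate they rely on (a sketch only; the real helper ships with the scheduling-DAG side of this change, and the concrete InvalidClusterId value is an assumption here):

#include <limits>

constexpr unsigned InvalidClusterId = std::numeric_limits<unsigned>::max();

// A candidate continues the zone's current cluster only if the zone has a
// real cluster ID and the candidate's parent cluster matches it. Two nodes
// that both carry InvalidClusterId deliberately do not count as clustered.
inline bool isTheSameCluster(unsigned ZoneClusterID, unsigned NodeClusterID) {
  return ZoneClusterID != InvalidClusterId && ZoneClusterID == NodeClusterID;
}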
@@ -4575,11 +4589,11 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { if (IsTopNode) { SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); - TopCluster = DAG->getCluster(SU->ParentClusterIdx); + TopClusterID = SU->ParentClusterIdx; Top.bumpNode(SU); } else { SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); - BotCluster = DAG->getCluster(SU->ParentClusterIdx); + BotClusterID = SU->ParentClusterIdx; Bot.bumpNode(SU); } } diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp index 69b9291..2400a1f 100644 --- a/llvm/lib/CodeGen/RegAllocBase.cpp +++ b/llvm/lib/CodeGen/RegAllocBase.cpp @@ -178,10 +178,8 @@ void RegAllocBase::cleanupFailedVReg(Register FailedReg, MCRegister PhysReg, for (MCRegAliasIterator Aliases(PhysReg, TRI, true); Aliases.isValid(); ++Aliases) { for (MachineOperand &MO : MRI->reg_operands(*Aliases)) { - if (MO.readsReg()) { + if (MO.readsReg()) MO.setIsUndef(true); - LIS->removeAllRegUnitsForPhysReg(MO.getReg()); - } } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a43020e..11e869a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -331,6 +331,11 @@ namespace { return CombineTo(N, To, 2, AddTo); } + SDValue CombineTo(SDNode *N, SmallVectorImpl<SDValue> *To, + bool AddTo = true) { + return CombineTo(N, To->data(), To->size(), AddTo); + } + void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); private: @@ -541,6 +546,7 @@ namespace { SDValue visitEXTRACT_VECTOR_ELT(SDNode *N); SDValue visitBUILD_VECTOR(SDNode *N); SDValue visitCONCAT_VECTORS(SDNode *N); + SDValue visitVECTOR_INTERLEAVE(SDNode *N); SDValue visitEXTRACT_SUBVECTOR(SDNode *N); SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitSCALAR_TO_VECTOR(SDNode *N); @@ -2021,6 +2027,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N); case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N); case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N); + case ISD::VECTOR_INTERLEAVE: return visitVECTOR_INTERLEAVE(N); case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N); case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N); @@ -4100,18 +4107,17 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { // (sub x, ([v]select (uge x, y), y, 0)) -> (umin x, (sub x, y)) if (N1.hasOneUse() && hasUMin(VT)) { SDValue Y; - if (sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETULT)), - m_Zero(), m_Deferred(Y))) || - sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETUGE)), - m_Deferred(Y), m_Zero())) || - sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETULT)), - m_Zero(), m_Deferred(Y))) || - sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETUGE)), - m_Deferred(Y), m_Zero()))) + auto MS0 = m_Specific(N0); + auto MVY = m_Value(Y); + auto MZ = m_Zero(); + auto MCC1 = m_SpecificCondCode(ISD::SETULT); + auto MCC2 = m_SpecificCondCode(ISD::SETUGE); + + if (sd_match(N1, m_SelectCCLike(MS0, MVY, MZ, m_Deferred(Y), MCC1)) || + sd_match(N1, m_SelectCCLike(MS0, MVY, m_Deferred(Y), MZ, MCC2)) || + sd_match(N1, m_VSelect(m_SetCC(MS0, MVY, MCC1), MZ, m_Deferred(Y))) || + sd_match(N1, 
m_VSelect(m_SetCC(MS0, MVY, MCC2), m_Deferred(Y), MZ))) + return DAG.getNode(ISD::UMIN, DL, VT, N0, DAG.getNode(ISD::SUB, DL, VT, N0, Y)); } @@ -10616,6 +10622,19 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { return DAG.getVScale(DL, VT, C0 << C1); } + SDValue X; + APInt VS0; + + // fold (shl (X * vscale(VS0)), C1) -> (X * vscale(VS0 << C1)) + if (N1C && sd_match(N0, m_Mul(m_Value(X), m_VScale(m_ConstInt(VS0))))) { + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()); + + SDValue VScale = DAG.getVScale(DL, VT, VS0 << N1C->getAPIntValue()); + return DAG.getNode(ISD::MUL, DL, VT, X, VScale, Flags); + } + // Fold (shl step_vector(C0), C1) to (step_vector(C0 << C1)). APInt ShlVal; if (N0.getOpcode() == ISD::STEP_VECTOR && @@ -25282,6 +25301,28 @@ static SDValue combineConcatVectorOfShuffleAndItsOperands( return DAG.getVectorShuffle(VT, dl, ShufOps[0], ShufOps[1], Mask); } +static SDValue combineConcatVectorOfSplats(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalTypes, + bool LegalOperations) { + EVT VT = N->getValueType(0); + + // Post-legalization we can only create wider SPLAT_VECTOR operations if both + // the type and operation is legal. The Hexagon target has custom + // legalization for SPLAT_VECTOR that splits the operation into two parts and + // concatenates them. Therefore, custom lowering must also be rejected in + // order to avoid an infinite loop. + if ((LegalTypes && !TLI.isTypeLegal(VT)) || + (LegalOperations && !TLI.isOperationLegal(ISD::SPLAT_VECTOR, VT))) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + if (!llvm::all_equal(N->op_values()) || Op0.getOpcode() != ISD::SPLAT_VECTOR) + return SDValue(); + + return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, Op0.getOperand(0)); +} + SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // If we only have one input vector, we don't need to do any concatenation. if (N->getNumOperands() == 1) @@ -25405,6 +25446,10 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return DAG.getBuildVector(VT, SDLoc(N), Opnds); } + if (SDValue V = + combineConcatVectorOfSplats(N, DAG, TLI, LegalTypes, LegalOperations)) + return V; + // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR. // FIXME: Add support for concat_vectors(bitcast(vec0),bitcast(vec1),...). if (SDValue V = combineConcatVectorOfScalars(N, DAG)) @@ -25473,6 +25518,21 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitVECTOR_INTERLEAVE(SDNode *N) { + // Check to see if all operands are identical. + if (!llvm::all_equal(N->op_values())) + return SDValue(); + + // Check to see if the identical operand is a splat. + if (!DAG.isSplatValue(N->getOperand(0))) + return SDValue(); + + // interleave splat(X), splat(X).... --> splat(X), splat(X).... + SmallVector<SDValue, 4> Ops; + Ops.append(N->op_values().begin(), N->op_values().end()); + return CombineTo(N, &Ops); +} + // Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find // if the subvector can be sourced for free. 
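The (sub x, (select (uge x, y), y, 0)) -> (umin x, (sub x, y)) rewrite that the matcher aliases in visitSUB above express depends only on unsigned wraparound, so it can be sanity-checked in isolation (plain C++, independent of the sd_match machinery):

#include <algorithm>
#include <cassert>
#include <cstdint>

void uminFoldCheck(uint32_t X, uint32_t Y) {
  // Original form: subtract Y only when X >= Y (the select yields 0 below).
  uint32_t Before = X - (X >= Y ? Y : 0);
  // Folded form: when X < Y, X - Y wraps to a value greater than X, so the
  // unsigned min restores X; otherwise X - Y <= X is picked directly.
  uint32_t After = std::min(X, static_cast<uint32_t>(X - Y));
  assert(Before == After);
}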
static SDValue getSubVectorSrc(SDValue V, unsigned Index, EVT SubVT) { @@ -28965,13 +29025,27 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, ((N1C->isAllOnes() && CC == ISD::SETGT) || (N1C->isZero() && CC == ISD::SETLT)) && !TLI.shouldAvoidTransformToShift(VT, CmpOpVT.getScalarSizeInBits() - 1)) { - SDValue ASR = DAG.getNode( - ISD::SRA, DL, CmpOpVT, N0, - DAG.getConstant(CmpOpVT.getScalarSizeInBits() - 1, DL, CmpOpVT)); - return DAG.getNode(ISD::XOR, DL, VT, DAG.getSExtOrTrunc(ASR, DL, VT), + SDValue ASHR = + DAG.getNode(ISD::SRA, DL, CmpOpVT, N0, + DAG.getShiftAmountConstant( + CmpOpVT.getScalarSizeInBits() - 1, CmpOpVT, DL)); + return DAG.getNode(ISD::XOR, DL, VT, DAG.getSExtOrTrunc(ASHR, DL, VT), DAG.getSExtOrTrunc(CC == ISD::SETLT ? N3 : N2, DL, VT)); } + // Fold sign pattern select_cc setgt X, -1, 1, -1 -> or (ashr X, BW-1), 1 + if (CC == ISD::SETGT && N1C && N2C && N3C && N1C->isAllOnes() && + N2C->isOne() && N3C->isAllOnes() && + !TLI.shouldAvoidTransformToShift(CmpOpVT, + CmpOpVT.getScalarSizeInBits() - 1)) { + SDValue ASHR = + DAG.getNode(ISD::SRA, DL, CmpOpVT, N0, + DAG.getShiftAmountConstant( + CmpOpVT.getScalarSizeInBits() - 1, CmpOpVT, DL)); + return DAG.getNode(ISD::OR, DL, VT, DAG.getSExtOrTrunc(ASHR, DL, VT), + DAG.getConstant(1, DL, VT)); + } + if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG)) return S; if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N2, N3, CC, DAG)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 583a85a..a5bd97a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2217,8 +2217,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) { switch (getTypeAction(InVT)) { case TargetLowering::TypePromoteInteger: { - // TODO: Handle big endian - if (OutVT.isVector() && DAG.getDataLayout().isLittleEndian()) { + // TODO: Handle big endian & vector input type. + if (OutVT.isVector() && !InVT.isVector() && + DAG.getDataLayout().isLittleEndian()) { EVT EltVT = OutVT.getVectorElementType(); TypeSize EltSize = EltVT.getSizeInBits(); TypeSize NInSize = NInVT.getSizeInBits(); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 48d6b99..e3f2a193 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6482,8 +6482,8 @@ SDValue TargetLowering::buildSDIVPow2WithCMov( Created.push_back(CMov.getNode()); // Divide by pow2. - SDValue SRA = - DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, VT)); + SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CMov, + DAG.getShiftAmountConstant(Lg2, VT, DL)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index a8559e7..6de6cc7 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -891,7 +891,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj, // Align DataSize to stub alignment if we have any stubs (PaddingSize will // have been increased above to account for this). 
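The select_cc setgt X, -1, 1, -1 fold added to SimplifySelectCC above (the AArch64-specific copy of it is deleted later in this diff) is the classic branchless sign-of-x trick. A small scalar check of the identity (this assumes arithmetic right shift of signed values, which C++20 guarantees and earlier compilers implement in practice):

#include <cassert>
#include <cstdint>

void signPatternCheck(int32_t X) {
  int32_t Selected = X > -1 ? 1 : -1;
  // ashr X, 31 smears the sign bit into 0 or -1; OR-ing in 1 maps those to
  // +1 and -1 respectively, with no compare or select.
  int32_t Shifted = (X >> 31) | 1;
  assert(Selected == Shifted);
}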
if (StubBufSize > 0) - DataSize &= -(uint64_t)getStubAlignment().value(); + DataSize &= -getStubAlignment().value(); } LLVM_DEBUG(dbgs() << "emitSection SectionID: " << SectionID << " Name: " diff --git a/llvm/lib/Frontend/Offloading/CMakeLists.txt b/llvm/lib/Frontend/Offloading/CMakeLists.txt index 8e1ede9..9747dbd 100644 --- a/llvm/lib/Frontend/Offloading/CMakeLists.txt +++ b/llvm/lib/Frontend/Offloading/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_component_library(LLVMFrontendOffloading Utility.cpp OffloadWrapper.cpp + PropertySet.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend diff --git a/llvm/lib/Frontend/Offloading/PropertySet.cpp b/llvm/lib/Frontend/Offloading/PropertySet.cpp new file mode 100644 index 0000000..a70290d --- /dev/null +++ b/llvm/lib/Frontend/Offloading/PropertySet.cpp @@ -0,0 +1,102 @@ +///===- llvm/Frontend/Offloading/PropertySet.cpp --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Frontend/Offloading/PropertySet.h" +#include "llvm/Support/Base64.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/MemoryBufferRef.h" + +using namespace llvm; +using namespace llvm::offloading; + +void llvm::offloading::writePropertiesToJSON( + const PropertySetRegistry &PSRegistry, raw_ostream &Out) { + json::OStream J(Out); + J.object([&] { + for (const auto &[CategoryName, PropSet] : PSRegistry) { + auto PropSetCapture = PropSet; + J.attributeObject(CategoryName, [&] { + for (const auto &[PropName, PropVal] : PropSetCapture) { + switch (PropVal.index()) { + case 0: + J.attribute(PropName, std::get<uint32_t>(PropVal)); + break; + case 1: + J.attribute(PropName, encodeBase64(std::get<ByteArray>(PropVal))); + break; + default: + llvm_unreachable("unsupported property type"); + } + } + }); + } + }); +} + +// note: createStringError has an overload that takes a format string, +// but it uses llvm::format instead of llvm::formatv, which does +// not work with json::Value. This is a helper function to use +// llvm::formatv with createStringError. +template <typename... 
Ts> auto createStringErrorV(Ts &&...Args) { + return createStringError(formatv(std::forward<Ts>(Args)...)); +} + +Expected<PropertyValue> +readPropertyValueFromJSON(const json::Value &PropValueVal) { + if (std::optional<uint64_t> Val = PropValueVal.getAsUINT64()) + return PropertyValue(static_cast<uint32_t>(*Val)); + + if (std::optional<StringRef> Val = PropValueVal.getAsString()) { + std::vector<char> Decoded; + if (Error E = decodeBase64(*Val, Decoded)) + return createStringErrorV("unable to base64 decode the string {0}: {1}", + Val, toString(std::move(E))); + return PropertyValue(ByteArray(Decoded.begin(), Decoded.end())); + } + + return createStringErrorV("expected a uint64 or a string, got {0}", + PropValueVal); +} + +Expected<PropertySetRegistry> +llvm::offloading::readPropertiesFromJSON(MemoryBufferRef Buf) { + PropertySetRegistry Res; + Expected<json::Value> V = json::parse(Buf.getBuffer()); + if (Error E = V.takeError()) + return E; + + const json::Object *O = V->getAsObject(); + if (!O) + return createStringErrorV( + "error while deserializing property set registry: " + "expected JSON object, got {0}", + *V); + + for (const auto &[CategoryName, Value] : *O) { + const json::Object *PropSetVal = Value.getAsObject(); + if (!PropSetVal) + return createStringErrorV("error while deserializing property set {0}: " + "expected JSON object, got {1}", + CategoryName.str(), Value); + + PropertySet &PropSet = Res[CategoryName.str()]; + for (const auto &[PropName, PropValueVal] : *PropSetVal) { + Expected<PropertyValue> Prop = readPropertyValueFromJSON(PropValueVal); + if (Error E = Prop.takeError()) + return createStringErrorV( + "error while deserializing property {0} in property set {1}: {2}", + PropName.str(), CategoryName.str(), toString(std::move(E))); + + auto [It, Inserted] = + PropSet.try_emplace(PropName.str(), std::move(*Prop)); + assert(Inserted && "Property already exists in PropertySet"); + (void)Inserted; + } + } + return Res; +} diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index f16963d..f1d4549 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -1012,7 +1012,7 @@ DIDerivedType *DIDerivedType::getImpl( std::optional<DIDerivedType::PtrAuthData> DIDerivedType::getPtrAuthData() const { return getTag() == dwarf::DW_TAG_LLVM_ptrauth_type - ? std::optional<PtrAuthData>(PtrAuthData(SubclassData32)) + ?
std::make_optional<PtrAuthData>(SubclassData32) : std::nullopt; } diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 8500fd1..5c8e904 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -59,7 +59,8 @@ STATISTIC(EmittedFillFragments, "Number of emitted assembler fragments - fill"); STATISTIC(EmittedNopsFragments, "Number of emitted assembler fragments - nops"); STATISTIC(EmittedOrgFragments, "Number of emitted assembler fragments - org"); -STATISTIC(evaluateFixup, "Number of evaluated fixups"); +STATISTIC(Fixups, "Number of fixups"); +STATISTIC(FixupEvalForRelax, "Number of fixup evaluations for relaxation"); STATISTIC(ObjectBytes, "Number of emitted object file bytes"); STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps"); STATISTIC(RelaxedInstructions, "Number of relaxed instructions"); @@ -140,9 +141,9 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const { bool MCAssembler::evaluateFixup(const MCFragment &F, MCFixup &Fixup, MCValue &Target, uint64_t &Value, - bool RecordReloc, - MutableArrayRef<char> Contents) const { - ++stats::evaluateFixup; + bool RecordReloc, uint8_t *Data) const { + if (RecordReloc) + ++stats::Fixups; // FIXME: This code has some duplication with recordRelocation. We should // probably merge the two into a single callback that tries to evaluate a @@ -185,7 +186,7 @@ bool MCAssembler::evaluateFixup(const MCFragment &F, MCFixup &Fixup, if (IsResolved && mc::isRelocRelocation(Fixup.getKind())) IsResolved = false; - getBackend().applyFixup(F, Fixup, Target, Contents, Value, IsResolved); + getBackend().applyFixup(F, Fixup, Target, Data, Value, IsResolved); return true; } @@ -703,21 +704,25 @@ void MCAssembler::layout() { for (MCFixup &Fixup : F.getFixups()) { uint64_t FixedValue; MCValue Target; + assert(mc::isRelocRelocation(Fixup.getKind()) || + Fixup.getOffset() <= F.getFixedSize()); + auto *Data = + reinterpret_cast<uint8_t *>(Contents.data() + Fixup.getOffset()); evaluateFixup(F, Fixup, Target, FixedValue, - /*RecordReloc=*/true, Contents); + /*RecordReloc=*/true, Data); } - if (F.getVarFixups().size()) { - // In the variable part, fixup offsets are relative to the fixed part's - // start. Extend the variable contents to the left to account for the - // fixed part size. - Contents = MutableArrayRef(F.getParent()->ContentStorage) - .slice(F.VarContentStart - Contents.size(), F.getSize()); - for (MCFixup &Fixup : F.getVarFixups()) { - uint64_t FixedValue; - MCValue Target; - evaluateFixup(F, Fixup, Target, FixedValue, - /*RecordReloc=*/true, Contents); - } + // In the variable part, fixup offsets are relative to the fixed part's + // start. 
+ for (MCFixup &Fixup : F.getVarFixups()) { + uint64_t FixedValue; + MCValue Target; + assert(mc::isRelocRelocation(Fixup.getKind()) || + (Fixup.getOffset() >= F.getFixedSize() && + Fixup.getOffset() <= F.getSize())); + auto *Data = reinterpret_cast<uint8_t *>( + F.getVarContents().data() + (Fixup.getOffset() - F.getFixedSize())); + evaluateFixup(F, Fixup, Target, FixedValue, + /*RecordReloc=*/true, Data); } } } @@ -735,7 +740,7 @@ void MCAssembler::Finish() { bool MCAssembler::fixupNeedsRelaxation(const MCFragment &F, const MCFixup &Fixup) const { - assert(getBackendPtr() && "Expected assembler backend"); + ++stats::FixupEvalForRelax; MCValue Target; uint64_t Value; bool Resolved = evaluateFixup(F, const_cast<MCFixup &>(Fixup), Target, Value, diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp index 7d528a5..3a5f01c 100644 --- a/llvm/lib/MC/MCCodeView.cpp +++ b/llvm/lib/MC/MCCodeView.cpp @@ -695,5 +695,7 @@ void CodeViewContext::encodeDefRange(const MCAssembler &Asm, } Frag.setVarContents(Contents); + assert(Fixups.size() < 256 && "Store fixups outside of MCFragment's VarFixup " + "storage if the number ever exceeds 256"); Frag.setVarFixups(Fixups); } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index e277143..200e29a 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -46,27 +46,83 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() { return nullptr; } +constexpr size_t FragBlockSize = 16384; +// Ensure the new fragment can at least store a few bytes. +constexpr size_t NewFragHeadroom = 8; + +static_assert(NewFragHeadroom >= alignof(MCFragment)); +static_assert(FragBlockSize >= sizeof(MCFragment) + NewFragHeadroom); + +MCFragment *MCObjectStreamer::allocFragSpace(size_t Headroom) { + auto Size = std::max(FragBlockSize, sizeof(MCFragment) + Headroom); + FragSpace = Size - sizeof(MCFragment); + auto Block = std::unique_ptr<uint8_t[]>(new uint8_t[Size]); + auto *F = reinterpret_cast<MCFragment *>(Block.get()); + FragStorage.push_back(std::move(Block)); + return F; +} + void MCObjectStreamer::newFragment() { - addFragment(getContext().allocFragment<MCFragment>()); + MCFragment *F; + if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) { + auto End = reinterpret_cast<size_t>(getCurFragEnd()); + F = reinterpret_cast<MCFragment *>( + alignToPowerOf2(End, alignof(MCFragment))); + FragSpace -= size_t(F) - End + sizeof(MCFragment); + } else { + F = allocFragSpace(0); + } + new (F) MCFragment(); + addFragment(F); +} + +void MCObjectStreamer::ensureHeadroom(size_t Headroom) { + if (Headroom <= FragSpace) + return; + auto *F = allocFragSpace(Headroom); + new (F) MCFragment(); + addFragment(F); } -void MCObjectStreamer::insert(MCFragment *F) { - assert(F->getKind() != MCFragment::FT_Data && +void MCObjectStreamer::insert(MCFragment *Frag) { + assert(Frag->getKind() != MCFragment::FT_Data && "F should have a variable-size tail"); + // Frag is not connected to FragSpace. Before modifying CurFrag with + // addFragment(Frag), allocate an empty fragment to maintain FragSpace + // connectivity, potentially reusing CurFrag's associated space. 
+ MCFragment *F; + if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) { + auto End = reinterpret_cast<size_t>(getCurFragEnd()); + F = reinterpret_cast<MCFragment *>( + alignToPowerOf2(End, alignof(MCFragment))); + FragSpace -= size_t(F) - End + sizeof(MCFragment); + } else { + F = allocFragSpace(0); + } + new (F) MCFragment(); + + addFragment(Frag); addFragment(F); - newFragment(); } void MCObjectStreamer::appendContents(ArrayRef<char> Contents) { - CurFrag->appendContents(Contents); + ensureHeadroom(Contents.size()); + assert(FragSpace >= Contents.size()); + llvm::copy(Contents, getCurFragEnd()); + CurFrag->FixedSize += Contents.size(); + FragSpace -= Contents.size(); } -void MCObjectStreamer::appendContents(size_t Num, char Elt) { - CurFrag->appendContents(Num, Elt); +void MCObjectStreamer::appendContents(size_t Num, uint8_t Elt) { + ensureHeadroom(Num); + MutableArrayRef<uint8_t> Data(getCurFragEnd(), Num); + llvm::fill(Data, Elt); + CurFrag->FixedSize += Num; + FragSpace -= Num; } void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) { - CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind)); + CurFrag->addFixup(MCFixup::create(getCurFragSize(), Value, Kind)); } // As a compile-time optimization, avoid allocating and evaluating an MCExpr @@ -115,6 +171,8 @@ void MCObjectStreamer::reset() { } EmitEHFrame = true; EmitDebugFrame = false; + FragStorage.clear(); + FragSpace = 0; MCStreamer::reset(); } @@ -143,7 +201,6 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) { void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) { MCStreamer::emitValueImpl(Value, Size, Loc); - MCFragment *DF = getCurrentFragment(); MCDwarfLineEntry::make(this, getCurrentSectionOnly()); @@ -158,9 +215,9 @@ void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, emitIntValue(AbsValue, Size); return; } - DF->addFixup(MCFixup::create(DF->getContents().size(), Value, - MCFixup::getDataKindForSize(Size))); - DF->appendContents(Size, 0); + ensureHeadroom(Size); + addFixup(Value, MCFixup::getDataKindForSize(Size)); + appendContents(Size, 0); } MCSymbol *MCObjectStreamer::emitCFILabel() { @@ -194,7 +251,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { // section. MCFragment *F = CurFrag; Symbol->setFragment(F); - Symbol->setOffset(F->getContents().size()); + Symbol->setOffset(F->getFixedSize()); emitPendingAssignments(Symbol); } @@ -260,6 +317,21 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { F0 = CurFrag; } + // To maintain connectivity between CurFrag and FragSpace when CurFrag is + // modified, allocate an empty fragment and append it to the fragment list. + // (Subsections[I].second.Tail is not connected to FragSpace.) + MCFragment *F; + if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) { + auto End = reinterpret_cast<size_t>(getCurFragEnd()); + F = reinterpret_cast<MCFragment *>( + alignToPowerOf2(End, alignof(MCFragment))); + FragSpace -= size_t(F) - End + sizeof(MCFragment); + } else { + F = allocFragSpace(0); + } + new (F) MCFragment(); + F->setParent(Section); + auto &Subsections = Section->Subsections; size_t I = 0, E = Subsections.size(); while (I != E && Subsections[I].first < Subsection) @@ -267,13 +339,16 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { // If the subsection number is not in the sorted Subsections list, create a // new fragment list. 
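newFragment(), insert() and changeSection() above all repeat one bump-allocation step: if the current block still has FragSpace for a fragment header plus some headroom, carve the header out of it at the right alignment; otherwise grab a fresh block. A stripped-down sketch of that pattern (hypothetical stand-in types; the real code also threads each fragment into the section's list, and the headroom constant covers the worst-case alignment padding):

#include <cstddef>
#include <cstdint>
#include <memory>
#include <new>
#include <vector>

struct Header {}; // stand-in for the fixed-size MCFragment header
constexpr size_t BlockSize = 16384;
constexpr size_t Headroom = 8; // keep room for a few content bytes

struct BumpAlloc {
  std::vector<std::unique_ptr<uint8_t[]>> Blocks;
  uint8_t *End = nullptr; // one past the bytes used in the current block
  size_t Space = 0;       // bytes left in the current block

  Header *allocHeader() {
    if (sizeof(Header) + Headroom > Space) {
      Blocks.push_back(std::make_unique<uint8_t[]>(BlockSize));
      End = Blocks.back().get();
      Space = BlockSize;
    }
    // Round the cursor up to the header's alignment, then charge the
    // padding and the header itself against the remaining space.
    auto P = (reinterpret_cast<uintptr_t>(End) + alignof(Header) - 1) &
             ~uintptr_t(alignof(Header) - 1);
    Space -= (P - reinterpret_cast<uintptr_t>(End)) + sizeof(Header);
    End = reinterpret_cast<uint8_t *>(P + sizeof(Header));
    return new (reinterpret_cast<void *>(P)) Header();
  }
};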
if (I == E || Subsections[I].first != Subsection) { - auto *F = getContext().allocFragment<MCFragment>(); - F->setParent(Section); Subsections.insert(Subsections.begin() + I, {Subsection, MCSection::FragList{F, F}}); + Section->CurFragList = &Subsections[I].second; + CurFrag = F; + } else { + Section->CurFragList = &Subsections[I].second; + CurFrag = Subsections[I].second.Tail; + // Ensure CurFrag is associated with FragSpace. + addFragment(F); } - Section->CurFragList = &Subsections[I].second; - CurFrag = Section->CurFragList->Tail; // Define the section symbol at subsection 0's initial fragment if required. if (!NewSec) @@ -344,11 +419,15 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst, MCFragment *F = getCurrentFragment(); // Append the instruction to the data fragment. - size_t CodeOffset = F->getContents().size(); + size_t CodeOffset = getCurFragSize(); + SmallString<16> Content; SmallVector<MCFixup, 1> Fixups; - getAssembler().getEmitter().encodeInstruction( - Inst, F->getContentsForAppending(), Fixups, STI); - F->doneAppending(); + getAssembler().getEmitter().encodeInstruction(Inst, Content, Fixups, STI); + appendContents(Content); + if (CurFrag != F) { + F = CurFrag; + CodeOffset = 0; + } F->setHasInstructions(STI); if (Fixups.empty()) diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 4f28267..27ca131 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -83,12 +83,14 @@ void MCFragment::appendFixups(ArrayRef<MCFixup> Fixups) { } void MCFragment::setVarFixups(ArrayRef<MCFixup> Fixups) { + assert(Fixups.size() < 256 && + "variable-size tail cannot have more than 256 fixups"); auto &S = getParent()->FixupStorage; - if (VarFixupStart + Fixups.size() > VarFixupEnd) { + if (Fixups.size() > VarFixupSize) { VarFixupStart = S.size(); S.resize_for_overwrite(S.size() + Fixups.size()); } - VarFixupEnd = VarFixupStart + Fixups.size(); + VarFixupSize = Fixups.size(); // Source fixup offsets are relative to the variable part's start. Add the // fixed part size to make them relative to the fixed part's start. std::transform(Fixups.begin(), Fixups.end(), S.begin() + VarFixupStart, diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index 72a8dd7..a87648a 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -318,6 +318,9 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // Emit the epilog instructions. if (EnableUnwindV2) { + // Ensure the fixups and appended content apply to the same fragment. + OS->ensureHeadroom(info->EpilogMap.size() * 2); + bool IsLast = true; for (const auto &Epilog : llvm::reverse(info->EpilogMap)) { if (IsLast) { diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 1ffe25c..8be5054 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -280,6 +280,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) { void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) { visitUsedSymbol(*Symbol); const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); + ensureHeadroom(2); addFixup(SRE, FK_SecRel_2); appendContents(2, 0); } @@ -293,6 +294,7 @@ void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol, if (Offset) MCE = MCBinaryExpr::createAdd( MCE, MCConstantExpr::create(Offset, getContext()), getContext()); + ensureHeadroom(4); addFixup(MCE, FK_SecRel_4); // Emit 4 bytes (zeros) to the object file. 
appendContents(4, 0); @@ -308,6 +310,7 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, if (Offset) MCE = MCBinaryExpr::createAdd( MCE, MCConstantExpr::create(Offset, getContext()), getContext()); + ensureHeadroom(4); addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. appendContents(4, 0); @@ -318,6 +321,7 @@ void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { // Create Symbol for section number. const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create( *Symbol, this->getWriter(), getContext()); + ensureHeadroom(4); addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. appendContents(4, 0); @@ -328,6 +332,7 @@ void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { // Create Symbol for section offset. const MCExpr *MCE = MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext()); + ensureHeadroom(4); addFixup(MCE, FK_Data_4); // Emit 4 bytes (zeros) to the object file. appendContents(4, 0); diff --git a/llvm/lib/Remarks/RemarkLinker.cpp b/llvm/lib/Remarks/RemarkLinker.cpp index 0ca6217..b00419b 100644 --- a/llvm/lib/Remarks/RemarkLinker.cpp +++ b/llvm/lib/Remarks/RemarkLinker.cpp @@ -70,8 +70,8 @@ Error RemarkLinker::link(StringRef Buffer, Format RemarkFormat) { Expected<std::unique_ptr<RemarkParser>> MaybeParser = createRemarkParserFromMeta( RemarkFormat, Buffer, - PrependPath ? std::optional<StringRef>(StringRef(*PrependPath)) - : std::optional<StringRef>()); + PrependPath ? std::make_optional<StringRef>(*PrependPath) + : std::nullopt); if (!MaybeParser) return MaybeParser.takeError(); diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 5e0b29f..46084c5 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -900,6 +900,30 @@ writeSignedDecimal (char *dst, int value) return dst; } +// Compute the ULP of the input using a definition from: +// Jean-Michel Muller. On the definition of ulp(x). [Research Report] RR-5504, +// LIP RR-2005-09, INRIA, LIP. 2005, pp.16. inria-00070503 +static APFloat harrisonUlp(const APFloat &X) { + const fltSemantics &Sem = X.getSemantics(); + switch (X.getCategory()) { + case APFloat::fcNaN: + return APFloat::getQNaN(Sem); + case APFloat::fcInfinity: + return APFloat::getInf(Sem); + case APFloat::fcZero: + return APFloat::getSmallest(Sem); + case APFloat::fcNormal: + break; + } + if (X.isDenormal() || X.isSmallestNormalized()) + return APFloat::getSmallest(Sem); + int Exp = ilogb(X); + if (X.getExactLog2() != INT_MIN) + Exp -= 1; + return scalbn(APFloat::getOne(Sem), Exp - (Sem.precision - 1), + APFloat::rmNearestTiesToEven); +} + namespace detail { /* Constructors. */ void IEEEFloat::initialize(const fltSemantics *ourSemantics) { @@ -5306,12 +5330,110 @@ Expected<APFloat::opStatus> DoubleAPFloat::convertFromString(StringRef S, return Ret; } +// The double-double lattice of values corresponds to numbers which obey: +// - abs(lo) <= 1/2 * ulp(hi) +// - roundTiesToEven(hi + lo) == hi +// +// nextUp must choose the smallest output > input that follows these rules. +// nextDown must choose the largest output < input that follows these rules.
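A concrete instance of these lattice rules with IEEE doubles, taking hi = 1.0 so that ulp(hi) = 2^-52 (a standalone check, assuming the default round-to-nearest-even environment):

#include <cassert>
#include <cmath>

int main() {
  double Hi = 1.0;
  double Lo = std::ldexp(1.0, -53); // exactly ulp(Hi) / 2
  // Rule 2 holds: Hi + Lo ties between 1.0 and 1.0 + 2^-52, and RTNE picks
  // the even significand, which is Hi itself.
  assert(Hi + Lo == Hi);
  // The next representable value above Lo violates the rules: the sum now
  // rounds up, so (Hi, nextafter(Lo)) is not a canonical double-double.
  assert(Hi + std::nextafter(Lo, 1.0) != Hi);
  return 0;
}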
APFloat::opStatus DoubleAPFloat::next(bool nextDown) { assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); - APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt()); - auto Ret = Tmp.next(nextDown); - *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt()); - return Ret; + // nextDown(x) = -nextUp(-x) + if (nextDown) { + changeSign(); + APFloat::opStatus Result = next(/*nextDown=*/false); + changeSign(); + return Result; + } + switch (getCategory()) { + case fcInfinity: + // nextUp(+inf) = +inf + // nextUp(-inf) = -getLargest() + if (isNegative()) + makeLargest(true); + return opOK; + + case fcNaN: + // IEEE-754R 2008 6.2 Par 2: nextUp(sNaN) = qNaN. Set Invalid flag. + // IEEE-754R 2008 6.2: nextUp(qNaN) = qNaN. Must be identity so we do not + // change the payload. + if (getFirst().isSignaling()) { + // For consistency, propagate the sign of the sNaN to the qNaN. + makeNaN(false, isNegative(), nullptr); + return opInvalidOp; + } + return opOK; + + case fcZero: + // nextUp(pm 0) = +getSmallest() + makeSmallest(false); + return opOK; + + case fcNormal: + break; + } + + const APFloat &HiOld = getFirst(); + const APFloat &LoOld = getSecond(); + + APFloat NextLo = LoOld; + NextLo.next(/*nextDown=*/false); + + // We want to admit values where: + // 1. abs(Lo) <= ulp(Hi)/2 + // 2. Hi == RTNE(Hi + lo) + auto InLattice = [](const APFloat &Hi, const APFloat &Lo) { + return Hi + Lo == Hi; + }; + + // Check if (HiOld, nextUp(LoOld) is in the lattice. + if (InLattice(HiOld, NextLo)) { + // Yes, the result is (HiOld, nextUp(LoOld)). + Floats[1] = std::move(NextLo); + + // TODO: Because we currently rely on semPPCDoubleDoubleLegacy, our maximum + // value is defined to have exactly 106 bits of precision. This limitation + // results in semPPCDoubleDouble being unable to reach its maximum canonical + // value. + DoubleAPFloat Largest{*Semantics, uninitialized}; + Largest.makeLargest(/*Neg=*/false); + if (compare(Largest) == cmpGreaterThan) + makeInf(/*Neg=*/false); + + return opOK; + } + + // Now we need to handle the cases where (HiOld, nextUp(LoOld)) is not the + // correct result. We know the new hi component will be nextUp(HiOld) but our + // lattice rules make it a little ambiguous what the correct NextLo must be. + APFloat NextHi = HiOld; + NextHi.next(/*nextDown=*/false); + + // nextUp(getLargest()) == INFINITY + if (NextHi.isInfinity()) { + makeInf(/*Neg=*/false); + return opOK; + } + + // IEEE 754-2019 5.3.1: + // "If x is the negative number of least magnitude in x's format, nextUp(x) is + // -0." + if (NextHi.isZero()) { + makeZero(/*Neg=*/true); + return opOK; + } + + // abs(NextLo) must be <= ulp(NextHi)/2. We want NextLo to be as close to + // negative infinity as possible. + NextLo = neg(scalbn(harrisonUlp(NextHi), -1, rmTowardZero)); + if (!InLattice(NextHi, NextLo)) + // RTNE may mean that Lo must be < ulp(NextHi) / 2 so we bump NextLo. 
+ NextLo.next(/*nextDown=*/false); + + Floats[0] = std::move(NextHi); + Floats[1] = std::move(NextLo); + + return opOK; } APFloat::opStatus diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 509cbb0..e8d3161 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -813,8 +813,8 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { } } - if (!F.hasFnAttribute(Attribute::HybridPatchable) || F.isDeclaration() || - F.hasLocalLinkage() || + if (!F.hasFnAttribute(Attribute::HybridPatchable) || + F.isDeclarationForLinker() || F.hasLocalLinkage() || F.getName().ends_with(HybridPatchableTargetSuffix)) continue; @@ -857,7 +857,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { SetVector<GlobalValue *> DirectCalledFns; for (Function &F : Mod) - if (!F.isDeclaration() && + if (!F.isDeclarationForLinker() && F.getCallingConv() != CallingConv::ARM64EC_Thunk_Native && F.getCallingConv() != CallingConv::ARM64EC_Thunk_X64) processFunction(F, DirectCalledFns, FnsMap); @@ -869,7 +869,8 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { }; SmallVector<ThunkInfo> ThunkMapping; for (Function &F : Mod) { - if (!F.isDeclaration() && (!F.hasLocalLinkage() || F.hasAddressTaken()) && + if (!F.isDeclarationForLinker() && + (!F.hasLocalLinkage() || F.hasAddressTaken()) && F.getCallingConv() != CallingConv::ARM64EC_Thunk_Native && F.getCallingConv() != CallingConv::ARM64EC_Thunk_X64) { if (!F.hasComdat()) @@ -959,7 +960,7 @@ bool AArch64Arm64ECCallLowering::processFunction( // unprototyped functions in C) if (Function *F = CB->getCalledFunction()) { if (!LowerDirectToIndirect || F->hasLocalLinkage() || - F->isIntrinsic() || !F->isDeclaration()) + F->isIntrinsic() || !F->isDeclarationForLinker()) continue; DirectCalledFns.insert(F); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4f6e3dd..2b6ea86 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -162,10 +162,10 @@ static cl::opt<bool> UseFEATCPACodegen( cl::init(false)); /// Value type used for condition codes. -static const MVT MVT_CC = MVT::i32; +constexpr MVT CondCodeVT = MVT::i32; /// Value type used for NZCV flags. -static constexpr MVT FlagsVT = MVT::i32; +constexpr MVT FlagsVT = MVT::i32; static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, @@ -3472,6 +3472,12 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, } } +/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes. +static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC) { + // TODO: Should be TargetConstant (need to s/imm/timm in patterns). + return DAG.getConstant(CC, SDLoc(), CondCodeVT); +} + static bool isLegalArithImmed(uint64_t C) { // Matches AArch64DAGToDAGISel::SelectArithImmed(). 
bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); @@ -3678,7 +3684,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, if (Opcode == 0) Opcode = AArch64ISD::CCMP; - SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); + SDValue Condition = getCondCode(DAG, Predicate); AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); @@ -4075,7 +4081,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, Cmp = emitComparison(LHS, RHS, CC, DL, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } - AArch64cc = DAG.getConstant(AArch64CC, DL, MVT_CC); + AArch64cc = getCondCode(DAG, AArch64CC); return Cmp; } @@ -4195,7 +4201,7 @@ SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { AArch64CC::CondCode CC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC)); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); } @@ -4274,8 +4280,8 @@ static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, SDLoc DL(Glue); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); - unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS; - SDValue CC = DAG.getConstant(Cond, DL, MVT::i32); + AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS; + SDValue CC = getCondCode(DAG, Cond); return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); } @@ -4285,7 +4291,7 @@ static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) { SDLoc DL(Glue); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); - SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32); + SDValue CC = getCondCode(DAG, AArch64CC::VS); return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); } @@ -4334,7 +4340,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // We use an inverted condition, because the conditional select is inverted // too. This will allow it to be selected to a single instruction: // CSINC Wd, WZR, WZR, invert(cond). 
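The CSINC comment above is doing real work, and the same trick recurs throughout this lowering: CSINC Wd, Wn, Wm, cond yields Wn when cond holds and Wm + 1 otherwise. A scalar model of why csinc(WZR, WZR, invert(cond)) materializes a boolean (WZR is the always-zero register; the helper name is illustrative):

#include <cassert>
#include <cstdint>

// Semantic model of AArch64 CSINC: cond ? n : m + 1.
inline uint32_t csinc(uint32_t N, uint32_t M, bool Cond) {
  return Cond ? N : M + 1;
}

void boolFromFlags(bool Cond) {
  // With both sources zero and the condition inverted, the result is
  // 0 when Cond is false and 0 + 1 when Cond is true, i.e. (int)Cond.
  assert(csinc(0, 0, !Cond) == static_cast<uint32_t>(Cond));
}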
- SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC)); Overflow = DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow); @@ -7124,8 +7130,7 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, - DAG.getConstant(AArch64CC::PL, DL, MVT::i32), - Cmp.getValue(1)); + getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1)); } static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { @@ -7136,7 +7141,7 @@ static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { AArch64CC::CondCode CC; if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) { SDLoc DL(Op); - SDValue CCVal = DAG.getConstant(CC, DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, CC); return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal, Cmp); } @@ -10575,7 +10580,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { if (CC == ISD::SETNE) OFCC = getInvertedCondCode(OFCC); - SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, OFCC); return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal, Overflow); @@ -10648,7 +10653,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { AArch64CC::isValidCBCond(changeIntCCToAArch64CC(CC)) && ProduceNonFlagSettingCondBr) { SDValue Cond = - DAG.getTargetConstant(changeIntCCToAArch64CC(CC), DL, MVT::i32); + DAG.getTargetConstant(changeIntCCToAArch64CC(CC), DL, CondCodeVT); return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS, Dest); } @@ -10667,11 +10672,11 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32); + SDValue CC1Val = getCondCode(DAG, CC1); SDValue BR1 = DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp); if (CC2 != AArch64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32); + SDValue CC2Val = getCondCode(DAG, CC2); return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val, Cmp); } @@ -11160,7 +11165,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32); + SDValue CC1Val = getCondCode(DAG, CC1); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be @@ -11173,11 +11178,11 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // of the first as the RHS. We're effectively OR'ing the two CC's together. // FIXME: It would be nice if we could match the two CSELs to two CSINCs. - SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32); + SDValue CC1Val = getCondCode(DAG, CC1); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp); - SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32); + SDValue CC2Val = getCondCode(DAG, CC2); Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp); } return IsStrict ? 
DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res; @@ -11205,8 +11210,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op, ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get(); ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT); - SDValue CCVal = - DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv)); // Inputs are swapped because the condition is inverted. This will allow // matching with a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal, @@ -11360,18 +11364,6 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); - // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform - // into (OR (ASR lhs, N-1), 1), which requires less instructions for the - // supported types. - if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal && - CTVal->isOne() && CFVal->isAllOnes() && - LHS.getValueType() == TVal.getValueType()) { - EVT VT = LHS.getValueType(); - SDValue Shift = - DAG.getNode(ISD::SRA, DL, VT, LHS, - DAG.getConstant(VT.getSizeInBits() - 1, DL, VT)); - return DAG.getNode(ISD::OR, DL, VT, Shift, DAG.getConstant(1, DL, VT)); - } // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns. // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1)) @@ -11577,13 +11569,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( } // Emit first, and possibly only, CSEL. - SDValue CC1Val = DAG.getConstant(CC1, DL, MVT::i32); + SDValue CC1Val = getCondCode(DAG, CC1); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp); // If we need a second CSEL, emit it, using the output of the first as the // RHS. We're effectively OR'ing the two CC's together. if (CC2 != AArch64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, DL, MVT::i32); + SDValue CC2Val = getCondCode(DAG, CC2); return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp); } @@ -11685,7 +11677,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, OFCC); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); @@ -12525,10 +12517,10 @@ static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) { /// WZR, invert(<cond>)'. static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG) { - return DAG.getNode( - AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV); + return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + getCondCode(DAG, getInvertedCondCode(CC)), NZCV); } // Lower @cc flag output via getSETCC. 
@@ -18699,7 +18691,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, Created.push_back(Cmp.getNode()); Created.push_back(And.getNode()); } else { - SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); + SDValue CCVal = getCondCode(DAG, AArch64CC::MI); SDVTList VTs = DAG.getVTList(VT, FlagsVT); SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); @@ -19571,11 +19563,11 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::AND) { AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0); - Condition = DAG.getConstant(InvCC0, DL, MVT_CC); + Condition = getCondCode(DAG, InvCC0); NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1); } else { AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1); - Condition = DAG.getConstant(CC0, DL, MVT_CC); + Condition = getCondCode(DAG, CC0); NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1); } @@ -19596,8 +19588,7 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); } return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0), - CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32), - CCmp); + CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp); } static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, @@ -19802,7 +19793,7 @@ static SDValue performANDSETCCCombine(SDNode *N, SDLoc DL(N); return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, VT), - DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp); + getCondCode(DAG, InvertedCC), Cmp); } } return SDValue(); @@ -20793,7 +20784,7 @@ static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) { "Unexpected constant value"); SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0)); - SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32); + SDValue CCVal = getCondCode(DAG, AArch64CC); SDValue Cmp = LHS.getOperand(3); return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp); @@ -20979,7 +20970,7 @@ static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); // (CINC x cc cond) <=> (CSINC x x !cc cond) - SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32); + SDValue CC = getCondCode(DAG, AArch64CC::LO); return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond); } @@ -22052,7 +22043,7 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, // Convert CC to integer based on requested condition. // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. 
- SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32); + SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond)); SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test); return DAG.getZExtOrTrunc(Res, DL, VT); } @@ -25093,10 +25084,9 @@ static SDValue performBRCONDCombine(SDNode *N, auto CSelCC = getCSETCondCode(CSel); if (CSelCC) { SDLoc DL(N); - return DAG.getNode( - N->getOpcode(), DL, N->getVTList(), Chain, Dest, - DAG.getConstant(getInvertedCondCode(*CSelCC), DL, MVT::i32), - CSel.getOperand(3)); + return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest, + getCondCode(DAG, getInvertedCondCode(*CSelCC)), + CSel.getOperand(3)); } } @@ -25237,7 +25227,7 @@ static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op->getValueType(0); - SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32); + SDValue CCValue = getCondCode(DAG, CC); return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond); } @@ -25314,8 +25304,7 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { SDValue TValReassoc = Reassociate(TReassocOp, 0); SDValue FValReassoc = Reassociate(FReassocOp, 1); return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc, - DAG.getConstant(NewCC, SDLoc(N->getOperand(2)), MVT_CC), - NewCmp.getValue(1)); + getCondCode(DAG, NewCC), NewCmp.getValue(1)); }; auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2)); @@ -25456,8 +25445,7 @@ static SDValue performCSELCombine(SDNode *N, SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(), Cond.getOperand(1), Cond.getOperand(0)); return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0), - N->getOperand(1), - DAG.getConstant(NewCond, DL, MVT::i32), + N->getOperand(1), getCondCode(DAG, NewCond), Sub.getValue(1)); } } @@ -25557,10 +25545,9 @@ static SDValue performSETCCCombine(SDNode *N, auto NewCond = getInvertedCondCode(OldCond); // csel 0, 1, !cond, X - SDValue CSEL = - DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0), - LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32), - LHS.getOperand(3)); + SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), + LHS.getOperand(0), LHS.getOperand(1), + getCondCode(DAG, NewCond), LHS.getOperand(3)); return DAG.getZExtOrTrunc(CSEL, DL, VT); } @@ -25630,8 +25617,7 @@ static SDValue performFlagSettingCombine(SDNode *N, // If the flag result isn't used, convert back to a generic opcode. if (!N->hasAnyUseOfValue(1)) { SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops()); - return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)}, - DL); + return DCI.CombineTo(N, Res, SDValue(N, 1)); } // Combine identical generic nodes into this node, re-using the result. 
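Several of these combines (getSETCC, performBRCONDCombine, performSETCCCombine) lean on AArch64CC::getInvertedCondCode to flip the sense of a comparison. As an aside that is not part of this patch: the AArch64 condition encoding places each condition next to its inverse, so inversion is essentially a toggle of the low bit, roughly:

// EQ<->NE, HS<->LO, MI<->PL, GE<->LT, GT<->LE, ...; AL/NV have no useful inverse.
inline AArch64CC::CondCode getInvertedCondCode(AArch64CC::CondCode Code) {
  return static_cast<AArch64CC::CondCode>(Code ^ 0x1);
}

For example, AArch64CC::EQ (0x0) and AArch64CC::NE (0x1) are inverses under this scheme, which is why the CSINC-based setcc materialization above can build "1 if cond else 0" by selecting with the inverted condition.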
@@ -27013,10 +26999,10 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { SDValue A = DAG.getNode( AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); - SDValue B = DAG.getNode( - AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); + SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + getCondCode(DAG, AArch64CC::NE), A.getValue(1)); return DAG.getMergeValues( {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 251fd44..ac31236 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -448,8 +448,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, SDTCisVT<1, FlagsVT>, SDTCisVT<4, FlagsVT>]>; +// Value type used for condition codes. +// Should be kept in sync with its C++ counterpart. +defvar CondCodeVT = i32; + def SDT_AArch64Brcond : SDTypeProfile<0, 3, - [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, + [SDTCisVT<0, OtherVT>, + SDTCisVT<1, CondCodeVT>, SDTCisVT<2, FlagsVT>]>; def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, @@ -458,22 +463,22 @@ def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisInt<3>, + SDTCisVT<3, CondCodeVT>, SDTCisVT<4, FlagsVT>]>; def SDT_AArch64CCMP : SDTypeProfile<1, 5, [SDTCisVT<0, FlagsVT>, SDTCisInt<1>, SDTCisSameAs<1, 2>, SDTCisInt<3>, - SDTCisInt<4>, - SDTCisVT<5, i32>]>; + SDTCisVT<4, CondCodeVT>, + SDTCisVT<5, FlagsVT>]>; def SDT_AArch64FCCMP : SDTypeProfile<1, 5, [SDTCisVT<0, FlagsVT>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisInt<3>, - SDTCisInt<4>, - SDTCisVT<5, i32>]>; + SDTCisVT<4, CondCodeVT>, + SDTCisVT<5, FlagsVT>]>; def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>, SDTCisFP<1>, SDTCisSameAs<2, 1>]>; @@ -546,7 +551,8 @@ def SDT_AArch64TBL : SDTypeProfile<1, 2, [ ]>; def SDT_AArch64cb : SDTypeProfile<0, 4, - [SDTCisVT<0, i32>, SDTCisInt<1>, SDTCisInt<2>, + [SDTCisVT<0, CondCodeVT>, + SDTCisInt<1>, SDTCisInt<2>, SDTCisVT<3, OtherVT>]>; // non-extending masked load fragment. diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 18ca22f..e1adc0b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -270,6 +270,13 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMECallAttrs CallAttrs(*Caller, *Callee); + // Never inline a function explicitly marked as being streaming, + // into a non-streaming function. Assume it was marked as streaming + // for a reason. + if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() && + CallAttrs.callee().hasStreamingInterfaceOrBody()) + return false; + // When inlining, we should consider the body of the function, not the // interface. 
if (CallAttrs.callee().hasStreamingBody()) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index b9d3e1b..7a2b679 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -79,8 +79,7 @@ public: } void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value) const override; @@ -421,9 +420,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) { } void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (shouldForceRelocation(Fixup)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -460,8 +458,8 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); @@ -471,15 +469,16 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (FulleSizeInBytes == 0) { // Handle as little-endian for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } else { // Handle as big-endian - assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!"); + assert(Fixup.getOffset() + FulleSizeInBytes <= F.getSize() && + "Invalid fixup size!"); assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = FulleSizeInBytes - 1 - i; - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } @@ -492,9 +491,9 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // If the immediate is negative, generate MOVN else MOVZ. // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ. 
if (SignedValue < 0) - Data[Offset + 3] &= ~(1 << 6); + Data[3] &= ~(1 << 6); else - Data[Offset + 3] |= (1 << 6); + Data[3] |= (1 << 6); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8a0c4ac..18f3c47 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1160,6 +1160,12 @@ def FeatureTanhInsts : SubtargetFeature<"tanh-insts", "Has v_tanh_f32/f16 instructions" >; +def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts", + "HasTensorCvtLutInsts", + "true", + "Has v_perm_pk16* instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -2030,6 +2036,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureDPPSrc1SGPR, FeatureBitOp3Insts, FeatureTanhInsts, + FeatureTensorCvtLutInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2785,6 +2792,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, AssemblerPredicate<(all_of FeatureTanhInsts)>; +def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">, + AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 31c4f62..d059480 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -367,6 +367,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand); setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand); + setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand); + + setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand); + setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand); + setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand); + + setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand); + setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand); + setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand); + setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand); setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c8e45d4..d11e5a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3204,6 +3204,18 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 5); return; } + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: + // Doing a waterfall loop over these wouldn't make any sense. 
+ constrainOpWithReadfirstlane(B, MI, 3); + constrainOpWithReadfirstlane(B, MI, 4); + return; + case Intrinsic::amdgcn_permlane_idx_gen: { + constrainOpWithReadfirstlane(B, MI, 3); + return; + } case Intrinsic::amdgcn_sbfe: applyMappingBFE(B, OpdMapper, true); return; @@ -4591,6 +4603,42 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8: case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8: case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4: + case Intrinsic::amdgcn_cvt_scale_pk16_f16_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_bf16_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_f16_bf6: + case Intrinsic::amdgcn_cvt_scale_pk16_bf16_bf6: + case Intrinsic::amdgcn_cvt_scale_pk16_f32_fp6: + case Intrinsic::amdgcn_cvt_scale_pk16_f32_bf6: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: @@ -4765,6 +4813,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: + case Intrinsic::amdgcn_perm_pk16_b4_u4: + case Intrinsic::amdgcn_perm_pk16_b6_u4: + case Intrinsic::amdgcn_perm_pk16_b8_u4: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: @@ -4902,6 +4953,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[3] = 
getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_permlane_idx_gen: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[3] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_permlane16_var: case Intrinsic::amdgcn_permlanex16_var: { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index dfe0cbf..10b8606 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -321,6 +321,11 @@ def : SourceOfDivergence<int_amdgcn_permlane16>; def : SourceOfDivergence<int_amdgcn_permlanex16>; def : SourceOfDivergence<int_amdgcn_permlane16_var>; def : SourceOfDivergence<int_amdgcn_permlanex16_var>; +def : SourceOfDivergence<int_amdgcn_permlane_bcast>; +def : SourceOfDivergence<int_amdgcn_permlane_up>; +def : SourceOfDivergence<int_amdgcn_permlane_down>; +def : SourceOfDivergence<int_amdgcn_permlane_xor>; +def : SourceOfDivergence<int_amdgcn_permlane_idx_gen>; def : SourceOfDivergence<int_amdgcn_mov_dpp>; def : SourceOfDivergence<int_amdgcn_mov_dpp8>; def : SourceOfDivergence<int_amdgcn_update_dpp>; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 94886b0..96cb5ae 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -152,7 +152,12 @@ static bool isPermlane(const MachineInstr &MI) { Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 || Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 || Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 || - Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64; + Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 || + Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64; } static bool isLdsDma(const MachineInstr &MI) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 33b66a6..96d5668 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -592,10 +592,13 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? 
TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; @@ -666,10 +669,13 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, // MaxMemoryClause-specific: We prioritize clustered instructions as we would // get more benefit from clausing these memory instructions. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; @@ -936,11 +942,9 @@ void GCNScheduleDAGMILive::finalizeSchedule() { Pressure.resize(Regions.size()); RegionsWithHighRP.resize(Regions.size()); RegionsWithExcessRP.resize(Regions.size()); - RegionsWithMinOcc.resize(Regions.size()); RegionsWithIGLPInstrs.resize(Regions.size()); RegionsWithHighRP.reset(); RegionsWithExcessRP.reset(); - RegionsWithMinOcc.reset(); RegionsWithIGLPInstrs.reset(); runSchedStages(); @@ -1090,8 +1094,7 @@ bool PreRARematStage::initGCNSchedStage() { // fixed if there is another pass after this pass. assert(!S.hasNextStage()); - if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() || - DAG.Regions.size() == 1) + if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1) return false; // Before performing any IR modification record the parent region of each MI @@ -1133,11 +1136,6 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); S.SGPRLimitBias = S.VGPRLimitBias = 0; if (DAG.MinOccupancy > InitialOccupancy) { - for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) - DAG.RegionsWithMinOcc[IDX] = - DAG.Pressure[IDX].getOccupancy( - DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy; - LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " << DAG.MinOccupancy << '\n'); @@ -1209,11 +1207,15 @@ bool GCNSchedStage::initGCNRegion() { } bool UnclusteredHighRPStage::initGCNRegion() { - // Only reschedule regions with the minimum occupancy or regions that may have - // spilling (excess register pressure). - if ((!DAG.RegionsWithMinOcc[RegionIdx] || - DAG.MinOccupancy <= InitialOccupancy) && - !DAG.RegionsWithExcessRP[RegionIdx]) + // Only reschedule regions that have excess register pressure (i.e. spilling) + // or had minimum occupancy at the beginning of the stage (as long as + // rescheduling of previous regions did not make occupancy drop back down to + // the initial minimum). 
+ unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + if (!DAG.RegionsWithExcessRP[RegionIdx] && + (DAG.MinOccupancy <= InitialOccupancy || + DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != + InitialOccupancy)) return false; return GCNSchedStage::initGCNRegion(); @@ -1278,9 +1280,6 @@ void GCNSchedStage::checkScheduling() { if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { DAG.Pressure[RegionIdx] = PressureAfter; - DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == - DAG.MinOccupancy; // Early out if we have achieved the occupancy target. LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); @@ -1314,7 +1313,6 @@ void GCNSchedStage::checkScheduling() { if (NewOccupancy < DAG.MinOccupancy) { DAG.MinOccupancy = NewOccupancy; MFI.limitOccupancy(DAG.MinOccupancy); - DAG.RegionsWithMinOcc.reset(); LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " << DAG.MinOccupancy << ".\n"); } @@ -1336,14 +1334,10 @@ void GCNSchedStage::checkScheduling() { // Revert if this region's schedule would cause a drop in occupancy or // spilling. - if (shouldRevertScheduling(WavesAfter)) { + if (shouldRevertScheduling(WavesAfter)) revertScheduling(); - } else { + else DAG.Pressure[RegionIdx] = PressureAfter; - DAG.RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == - DAG.MinOccupancy; - } } unsigned @@ -1573,9 +1567,6 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { } void GCNSchedStage::revertScheduling() { - DAG.RegionsWithMinOcc[RegionIdx] = - PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) == - DAG.MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); DAG.RegionEnd = DAG.RegionBegin; int SkippedDebugInstr = 0; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 94cd795..32139a9 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -250,9 +250,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // limit. Register pressure in these regions usually will result in spilling. BitVector RegionsWithExcessRP; - // Regions that has the same occupancy as the latest MinOccupancy - BitVector RegionsWithMinOcc; - // Regions that have IGLP instructions (SCHED_GROUP_BARRIER or IGLP_OPT). 
BitVector RegionsWithIGLPInstrs; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 0a0a107..0237a60 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -340,6 +340,43 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackLaneMasks = true; } +void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy, + const SchedRegion &Region) const { + const Function &F = Region.RegionBegin->getMF()->getFunction(); + Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction"); + if (!PostRADirectionAttr.isValid()) + return; + + StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString(); + if (PostRADirectionStr == "topdown") { + Policy.OnlyTopDown = true; + Policy.OnlyBottomUp = false; + } else if (PostRADirectionStr == "bottomup") { + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = true; + } else if (PostRADirectionStr == "bidirectional") { + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } else { + DiagnosticInfoOptimizationFailure Diag( + F, F.getSubprogram(), "invalid value for postRA direction attribute"); + F.getContext().diagnose(Diag); + } + + LLVM_DEBUG({ + const char *DirStr = "default"; + if (Policy.OnlyTopDown && !Policy.OnlyBottomUp) + DirStr = "topdown"; + else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp) + DirStr = "bottomup"; + else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp) + DirStr = "bidirectional"; + + dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr + << '\n'; + }); +} + void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const { if (isWave32()) { // Fix implicit $vcc operands after MIParser has verified that they match diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index bdd900d..c84ba1a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -236,6 +236,7 @@ protected: bool Has64BitLiterals = false; bool HasBitOp3Insts = false; bool HasTanhInsts = false; + bool HasTensorCvtLutInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1041,6 +1042,9 @@ public: void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override; + void overridePostRASchedPolicy(MachineSchedPolicy &Policy, + const SchedRegion &Region) const override; + void mirFileLoaded(MachineFunction &MF) const override; unsigned getMaxNumUserSGPRs() const { @@ -1408,6 +1412,8 @@ public: bool hasTanhInsts() const { return HasTanhInsts; } + bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; } + bool hasAddPC64Inst() const { return GFX1250Insts; } bool hasMinimum3Maximum3PKF16() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 2a920f6..4e4660c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -33,8 +33,7 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend(llvm::endianness::little) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value) const override; @@ -129,9 +128,8 @@ static uint64_t 
adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (Target.getSpecifier()) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -148,13 +146,13 @@ void AMDGPUAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - uint32_t Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); + Data[i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); } std::optional<MCFixupKind> diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 11552b3..9b348d4 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -983,6 +983,7 @@ void SIFrameLowering::emitCSRSpillStores( const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch // registers. However, save all lanes of callee-saved VGPRs. Due to this, we @@ -1005,6 +1006,12 @@ void SIFrameLowering::emitCSRSpillStores( } }; + for (const Register Reg : make_first_range(WWMScratchRegs)) { + if (!MRI.isReserved(Reg)) { + MRI.addLiveIn(Reg); + MBB.addLiveIn(Reg); + } + } StoreWWMRegisters(WWMScratchRegs); auto EnableAllLanes = [&]() { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ad26757..4d67e4a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16825,56 +16825,51 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, return std::pair(0U, RC); } - if (Constraint.starts_with("{") && Constraint.ends_with("}")) { - StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); - if (RegName.consume_front("v")) { + auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint); + if (Kind != '\0') { + if (Kind == 'v') { RC = &AMDGPU::VGPR_32RegClass; - } else if (RegName.consume_front("s")) { + } else if (Kind == 's') { RC = &AMDGPU::SGPR_32RegClass; - } else if (RegName.consume_front("a")) { + } else if (Kind == 'a') { RC = &AMDGPU::AGPR_32RegClass; } if (RC) { - uint32_t Idx; - if (RegName.consume_front("[")) { - uint32_t End; - bool Failed = RegName.consumeInteger(10, Idx); - Failed |= !RegName.consume_front(":"); - Failed |= RegName.consumeInteger(10, End); - Failed |= !RegName.consume_back("]"); - if (!Failed) { - uint32_t Width = (End - Idx + 1) * 32; - // Prohibit constraints for register ranges with a width that does not - // match the required type. 
- if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits()) + if (NumRegs > 1) { + if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs()) + return std::pair(0U, nullptr); + + uint32_t Width = NumRegs * 32; + // Prohibit constraints for register ranges with a width that does not + // match the required type. + if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits()) + return std::pair(0U, nullptr); + + MCRegister Reg = RC->getRegister(Idx); + if (SIRegisterInfo::isVGPRClass(RC)) + RC = TRI->getVGPRClassForBitWidth(Width); + else if (SIRegisterInfo::isSGPRClass(RC)) + RC = TRI->getSGPRClassForBitWidth(Width); + else if (SIRegisterInfo::isAGPRClass(RC)) + RC = TRI->getAGPRClassForBitWidth(Width); + if (RC) { + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); + if (!Reg) { + // The register class does not contain the requested register, + // e.g., because it is an SGPR pair that would violate alignment + // requirements. return std::pair(0U, nullptr); - MCRegister Reg = RC->getRegister(Idx); - if (SIRegisterInfo::isVGPRClass(RC)) - RC = TRI->getVGPRClassForBitWidth(Width); - else if (SIRegisterInfo::isSGPRClass(RC)) - RC = TRI->getSGPRClassForBitWidth(Width); - else if (SIRegisterInfo::isAGPRClass(RC)) - RC = TRI->getAGPRClassForBitWidth(Width); - if (RC) { - Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); - if (!Reg) { - // The register class does not contain the requested register, - // e.g., because it is an SGPR pair that would violate alignment - // requirements. - return std::pair(0U, nullptr); - } - return std::pair(Reg, RC); } + return std::pair(Reg, RC); } - } else { - // Check for lossy scalar/vector conversions. - if (VT.isVector() && VT.getSizeInBits() != 32) - return std::pair(0U, nullptr); - bool Failed = RegName.getAsInteger(10, Idx); - if (!Failed && Idx < RC->getNumRegs()) - return std::pair(RC->getRegister(Idx), RC); } + + // Check for lossy scalar/vector conversions. 
+ if (VT.isVector() && VT.getSizeInBits() != 32) + return std::pair(0U, nullptr); + if (Idx < RC->getNumRegs()) + return std::pair(RC->getRegister(Idx), RC); } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 044a681..3f61bbd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6304,10 +6304,14 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, }; if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || - Opc == AMDGPU::V_PERMLANEX16_B32_e64) { + Opc == AMDGPU::V_PERMLANEX16_B32_e64 || + Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 || + Opc == AMDGPU::V_PERMLANE_UP_B32_e64 || + Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 || + Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 || + Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) { // src1 and src2 must be scalar MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); - MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); const DebugLoc &DL = MI.getDebugLoc(); if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -6315,11 +6319,14 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, .add(Src1); Src1.ChangeToRegister(Reg, false); } - if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { - Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) - .add(Src2); - Src2.ChangeToRegister(Reg, false); + if (VOP3Idx[2] != -1) { + MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); + if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src2); + Src2.ChangeToRegister(Reg, false); + } } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index a3e20ba..4698a58 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1774,6 +1774,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, + !eq(VT.Size, 96) : VOPDstOperand<VReg_96>, !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, !eq(VT.Size, 16) : op16, @@ -1924,6 +1925,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { !eq(VT, v2f16) : VCSrc_v2f16, !eq(VT, v2bf16) : VCSrc_v2bf16, !eq(VT, f32) : VCSrc_f32, + !eq(VT, v2i32) : VCSrc_v2b32, 1 : VCSrc_b32); } @@ -2935,6 +2937,9 @@ def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; +def VOP_V3I32_V16F16_F32 : VOPProfile<[v3i32, v16f16, f32, untyped]>; +def VOP_V3I32_V16BF16_F32 : VOPProfile<[v3i32, v16bf16, f32, untyped]>; +def VOP_V3I32_V16F32_F32 : VOPProfile<[v3i32, v16f32, f32, untyped]>; def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>; def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>; def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>; @@ -2948,6 +2953,8 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>; def VOP_F16_F32_I32 : 
VOPProfile<[f16, f32, i32, untyped]>; def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>; def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>; +def VOP_V16F16_V3I32_I32 : VOPProfile<[v16f16, v3i32, i32, untyped]>; +def VOP_V16BF16_V3I32_I32 : VOPProfile<[v16bf16, v3i32, i32, untyped]>; def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>; def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>; def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>; @@ -2955,11 +2962,26 @@ def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>; def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>; def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>; def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>; +def VOP_V2I32_V8BF16_F32 : VOPProfile<[v2i32, v8bf16, f32, untyped]>; +def VOP_V2I32_V8F16_F32 : VOPProfile<[v2i32, v8f16, f32, untyped]>; +def VOP_V2I32_V8F32_F32 : VOPProfile<[v2i32, v8f32, f32, untyped]>; +def VOP_I32_V8F32_F32 : VOPProfile<[i32, v8f32, f32, untyped]>; +def VOP_I32_V8F16_F32 : VOPProfile<[i32, v8f16, f32, untyped]>; +def VOP_I32_V8BF16_F32 : VOPProfile<[i32, v8bf16, f32, untyped]>; def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>; def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>; def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>; def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>; +def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>; +def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>; +def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>; +def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>; +def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>; +def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>; +def VOP_I32_V8F32_I32_F32 : VOPProfile<[i32, v8f32, i32, f32]>; +def VOP_I32_V8F16_I32_F32 : VOPProfile<[i32, v8f16, i32, f32]>; +def VOP_I32_V8BF16_I32_F32 : VOPProfile<[i32, v8bf16, i32, f32]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 36d1a3b..08d07c9 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1302,6 +1302,7 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">; def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">; // True 16 Operands def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5827f18..65fa088 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1548,6 +1548,42 @@ bool shouldEmitConstantsToTextSection(const Triple &TT) { return TT.getArch() == Triple::r600; } +static bool isValidRegPrefix(char C) { + return C == 'v' || C == 's' || C == 'a'; +} + +std::tuple<char, unsigned, unsigned> +parseAsmConstraintPhysReg(StringRef Constraint) { + StringRef RegName = Constraint; + if (!RegName.consume_front("{") || !RegName.consume_back("}")) + 
return {}; + + char Kind = RegName.front(); + if (!isValidRegPrefix(Kind)) + return {}; + + RegName = RegName.drop_front(); + if (RegName.consume_front("[")) { + unsigned Idx, End; + bool Failed = RegName.consumeInteger(10, Idx); + Failed |= !RegName.consume_front(":"); + Failed |= RegName.consumeInteger(10, End); + Failed |= !RegName.consume_back("]"); + if (!Failed) { + unsigned NumRegs = End - Idx + 1; + if (NumRegs > 1) + return {Kind, Idx, NumRegs}; + } + } else { + unsigned Idx; + bool Failed = RegName.getAsInteger(10, Idx); + if (!Failed) + return {Kind, Idx, 1}; + } + + return {}; +} + std::pair<unsigned, unsigned> getIntegerPairAttribute(const Function &F, StringRef Name, std::pair<unsigned, unsigned> Default, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 74d59f4..1252e35 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1012,6 +1012,12 @@ bool isReadOnlySegment(const GlobalValue *GV); /// target triple \p TT, false otherwise. bool shouldEmitConstantsToTextSection(const Triple &TT); +/// Returns a valid charcode or 0 in the first entry if this is a valid physical +/// register constraint. Followed by the start register number, and the register +/// width. Does not validate the number of registers exists in the class. +std::tuple<char, unsigned, unsigned> +parseAsmConstraintPhysReg(StringRef Constraint); + /// \returns Integer value requested using \p F's \p Name attribute. /// /// \returns \p Default if attribute is not present. diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 1ffe39d..f4b6af6 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1053,6 +1053,14 @@ def VOP3_PERMLANE_VAR_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, untyped let HasExtDPP = 0; } +class VOP3_PERMLANE_NOOPSEL_Profile<VOPProfile P> : VOP3_Profile<P> { + let Ins64 = !con((ins VRegSrc_32:$src0, SSrc_b32:$src1), + !if(P.HasSrc2, (ins SSrc_b32:$src2), (ins))); + let HasClamp = 0; + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} + def opsel_i1timm : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant( N->getZExtValue() ? 
SISrcMods::OP_SEL_0 : SISrcMods::NONE, @@ -1136,6 +1144,18 @@ class PermlaneVarPat<SDPatternOperator permlane, VGPR_32:$src1, VGPR_32:$vdst_in) >; +class PermlaneNoDppPat3Src<SDPatternOperator permlane, + Instruction inst> : GCNPat< + (permlane i32:$src0, i32:$src1, i32:$src2), + (inst VGPR_32:$src0, SCSrc_b32:$src1, SCSrc_b32:$src2) +>; + +class PermlaneNoDppPat2Src<SDPatternOperator permlane, + Instruction inst> : GCNPat< + (permlane i32:$src0, i32:$src1), + (inst VGPR_32:$src0, SCSrc_b32:$src1) +>; + class VOP3_BITOP3_Profile<VOPProfile pfl, VOP3Features f> : VOP3_Profile<pfl, f> { let HasClamp = 0; let HasOMod = 0; @@ -1522,6 +1542,20 @@ let SubtargetPredicate = isGFX12Plus in { } // End SubtargetPredicate = isGFX12Plus +let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in { + defm V_PERMLANE_BCAST_B32 : VOP3Inst<"v_permlane_bcast_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>; + defm V_PERMLANE_UP_B32 : VOP3Inst<"v_permlane_up_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>; + defm V_PERMLANE_DOWN_B32 : VOP3Inst<"v_permlane_down_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>; + defm V_PERMLANE_XOR_B32 : VOP3Inst<"v_permlane_xor_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>; + defm V_PERMLANE_IDX_GEN_B32 : VOP3Inst<"v_permlane_idx_gen_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32>>; + + def : PermlaneNoDppPat3Src<int_amdgcn_permlane_bcast, V_PERMLANE_BCAST_B32_e64>; + def : PermlaneNoDppPat3Src<int_amdgcn_permlane_up, V_PERMLANE_UP_B32_e64>; + def : PermlaneNoDppPat3Src<int_amdgcn_permlane_down, V_PERMLANE_DOWN_B32_e64>; + def : PermlaneNoDppPat3Src<int_amdgcn_permlane_xor, V_PERMLANE_XOR_B32_e64>; + def : PermlaneNoDppPat2Src<int_amdgcn_permlane_idx_gen, V_PERMLANE_IDX_GEN_B32_e64>; +} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 + let HasClamp = 0, HasModifiers = 1 in { def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>; def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>; @@ -1692,6 +1726,12 @@ multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator no } } +let HasExtVOP3DPP = 0, HasModifiers = 0 in { +def VOP3_V2I32_I32_I32_V2I32 : VOP3_Profile<VOPProfile<[v2i32, i32, i32, v2i32]>>; +def VOP3_V3I32_I32_I64_V2I32 : VOP3_Profile<VOPProfile<[v3i32, i32, i64, v2i32]>>; +def VOP3_V4I32_I64_I64_V2I32 : VOP3_Profile<VOPProfile<[v4i32, i64, i64, v2i32]>>; +} + let Src0RC64 = VSrc_NoInline_v2f16 in { def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; @@ -1737,6 +1777,12 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>; defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>; defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>; + defm V_CVT_SCALE_PK16_F16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_fp6", VOP_V16F16_V3I32_I32, int_amdgcn_cvt_scale_pk16_f16_fp6>; + defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_fp6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_fp6>; + defm V_CVT_SCALE_PK16_F16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f16_bf6", VOP_V16F16_V3I32_I32, 
int_amdgcn_cvt_scale_pk16_f16_bf6>; + defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_bf16_bf6", VOP_V16BF16_V3I32_I32, int_amdgcn_cvt_scale_pk16_bf16_bf6>; + defm V_CVT_SCALE_PK16_F32_FP6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_fp6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_fp6>; + defm V_CVT_SCALE_PK16_F32_BF6 : VOP3CvtScaleSelInst<"v_cvt_scale_pk16_f32_bf6", VOP_V16F32_V3I32_I32, int_amdgcn_cvt_scale_pk16_f32_bf6>; } // End Constraints = "@earlyclobber $vdst" defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>; @@ -1744,6 +1790,44 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>; } // End ReadsModeReg = 0 + let Constraints = "@earlyclobber $vdst" in { + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_bf16>; + defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_bf16>; + defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f16>; + defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f16>; + defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f32>; + defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f32>; + defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f32>; + defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>; + defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>; + } // End WaveSizePredicate = isWave32 + defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f32>; + defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f32>; + defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f16>; + defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f16>; + defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_bf16>; + defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, 
int_amdgcn_cvt_scalef32_pk16_bf6_bf16>; + + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16>; + defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16>; + defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f16>; + defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f16>; + defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f32>; + defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32>; + defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32>; + defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>; + defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>; + } // End WaveSizePredicate = isWave32 + defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>; + defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>; + defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>; + defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>; + defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>; + defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>; + } // End Constraints = "@earlyclobber $vdst" + let True16Predicate = UseRealTrue16Insts in { def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>; def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>; @@ -1754,6 +1838,12 @@ let SubtargetPredicate = isGFX1250Plus in { } } // End SubtargetPredicate = isGFX1250Plus +let SubtargetPredicate = HasTensorCvtLutInsts in { + defm V_PERM_PK16_B4_U4 : VOP3Inst<"v_perm_pk16_b4_u4", VOP3_V2I32_I32_I32_V2I32, int_amdgcn_perm_pk16_b4_u4>; + defm V_PERM_PK16_B6_U4 : VOP3Inst<"v_perm_pk16_b6_u4", VOP3_V3I32_I32_I64_V2I32, int_amdgcn_perm_pk16_b6_u4>; + defm 
V_PERM_PK16_B8_U4 : VOP3Inst<"v_perm_pk16_b8_u4", VOP3_V4I32_I64_I64_V2I32, int_amdgcn_perm_pk16_b8_u4>; +} // End SubtargetPredicate = HasTensorCvtLutInsts + class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat< (DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)), (inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in) @@ -1973,6 +2063,11 @@ defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>; defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>; defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>; defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>; +defm V_PERMLANE_BCAST_B32 : VOP3Only_Real_Base_gfx12<0x270>; +defm V_PERMLANE_UP_B32 : VOP3Only_Real_Base_gfx12<0x271>; +defm V_PERMLANE_DOWN_B32 : VOP3Only_Real_Base_gfx12<0x272>; +defm V_PERMLANE_XOR_B32 : VOP3Only_Real_Base_gfx12<0x273>; +defm V_PERMLANE_IDX_GEN_B32 : VOP3Only_Real_Base_gfx12<0x314>; //===----------------------------------------------------------------------===// // GFX11, GFX12 @@ -2147,6 +2242,9 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: +defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; +defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; +defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; @@ -2159,6 +2257,42 @@ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>; defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>; defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>; defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>; +defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x2b0>; +defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b3>; +defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b4>; +defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b5>; +defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2b8>; +defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>; +defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>; +defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>; +defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>; +defm V_CVT_SCALE_PK16_F16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c7>; +defm V_CVT_SCALE_PK16_BF16_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c8>; +defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>; +defm V_CVT_SCALE_PK16_F16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2ca>; +defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cb>; +defm V_CVT_SCALE_PK16_F32_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cc>; +defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2cd>; +defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2ce>; +defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>; +defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>; +defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>; +defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>; +defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>; +defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>; +defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : 
VOP3Only_Real_Base_gfx1250<0x2d5>; +defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>; +defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>; +defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b9>; +defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2bc>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2bf>; +defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c0>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c1>; +defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c2>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index bd4b75f..9366256 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -5521,18 +5521,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal); ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); if (Op.getValueType().isInteger()) { - // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform - // into (OR (ASR lhs, N-1), 1), which requires less instructions for the - // supported types. - if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal && - CTVal->isOne() && CFVal->isAllOnes() && - LHS.getValueType() == TrueVal.getValueType()) { - EVT VT = LHS.getValueType(); - SDValue Shift = - DAG.getNode(ISD::SRA, dl, VT, LHS, - DAG.getConstant(VT.getSizeInBits() - 1, dl, VT)); - return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT)); - } // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns. // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1)) diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 146fc67..c221d22 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1108,9 +1108,8 @@ std::optional<bool> ARMAsmBackend::evaluateFixup(const MCFragment &F, } void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -1124,14 +1123,15 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, return; // Doesn't change encoding. const unsigned NumBytes = getFixupKindNumBytes(Kind); - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // Used to point to big endian bytes. 
unsigned FullSizeBytes; if (Endian == llvm::endianness::big) { FullSizeBytes = getFixupKindContainerSizeBytes(Kind); - assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!"); + assert(Fixup.getOffset() + FullSizeBytes <= F.getSize() && + "Invalid fixup size!"); assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); } @@ -1141,7 +1141,7 @@ void ARMAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? i : (FullSizeBytes - 1 - i); - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 07d2cf7..2844232 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -40,8 +40,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; unsigned getRelaxedOpcode(unsigned Op, const MCSubtargetInfo &STI) const; diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 128cc0b..05a7d03 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -368,9 +368,8 @@ AVRAsmBackend::createObjectTargetWriter() const { } void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { // AVR sets the fixup value to bypass the assembly time overflow with a // relocation. if (IsResolved) { @@ -397,14 +396,14 @@ void AVRAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
for (unsigned i = 0; i < NumBytes; ++i) { uint8_t mask = (((Value >> (i * 8)) & 0xff)); - Data[Offset + i] |= mask; + Data[i] |= mask; } } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h index 68c839e..9633669 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h @@ -38,8 +38,7 @@ public: createObjectTargetWriter() const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index dda8753..53933f9 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -27,8 +27,7 @@ public: ~BPFAsmBackend() override = default; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -66,35 +65,32 @@ bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, } void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); if (Fixup.getKind() == FK_SecRel_8) { // The Value is 0 for global variables, and the in-section offset // for static variables. Write to the immediate field of the inst. assert(Value <= UINT32_MAX); - support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], - static_cast<uint32_t>(Value), + support::endian::write<uint32_t>(Data + 4, static_cast<uint32_t>(Value), Endian); } else if (Fixup.getKind() == FK_Data_4 && !Fixup.isPCRel()) { - support::endian::write<uint32_t>(&Data[Fixup.getOffset()], Value, Endian); + support::endian::write<uint32_t>(Data, Value, Endian); } else if (Fixup.getKind() == FK_Data_8) { - support::endian::write<uint64_t>(&Data[Fixup.getOffset()], Value, Endian); + support::endian::write<uint64_t>(Data, Value, Endian); } else if (Fixup.getKind() == FK_Data_4 && Fixup.isPCRel()) { Value = (uint32_t)((Value - 8) / 8); if (Endian == llvm::endianness::little) { - Data[Fixup.getOffset() + 1] = 0x10; - support::endian::write32le(&Data[Fixup.getOffset() + 4], Value); + Data[1] = 0x10; + support::endian::write32le(Data + 4, Value); } else { - Data[Fixup.getOffset() + 1] = 0x1; - support::endian::write32be(&Data[Fixup.getOffset() + 4], Value); + Data[1] = 0x1; + support::endian::write32be(Data + 4, Value); } } else if (Fixup.getKind() == BPF::FK_BPF_PCRel_4) { // The input Value represents the number of bytes. 
Value = (uint32_t)((Value - 8) / 8); - support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], Value, - Endian); + support::endian::write<uint32_t>(Data + 4, Value, Endian); } else { assert(Fixup.getKind() == FK_Data_2 && Fixup.isPCRel()); @@ -103,8 +99,7 @@ void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, report_fatal_error("Branch target out of insn range"); Value = (uint16_t)((Value - 8) / 8); - support::endian::write<uint16_t>(&Data[Fixup.getOffset() + 2], Value, - Endian); + support::endian::write<uint16_t>(Data + 2, Value, Endian); } } diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp index 694d9ea..6964998 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp @@ -197,9 +197,8 @@ std::optional<bool> CSKYAsmBackend::evaluateFixup(const MCFragment &F, } void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -217,10 +216,10 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. @@ -228,14 +227,14 @@ void CSKYAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, bool IsInstFixup = (Kind >= FirstTargetFixupKind); if (IsLittleEndian && IsInstFixup && (NumBytes == 4)) { - Data[Offset + 0] |= uint8_t((Value >> 16) & 0xff); - Data[Offset + 1] |= uint8_t((Value >> 24) & 0xff); - Data[Offset + 2] |= uint8_t(Value & 0xff); - Data[Offset + 3] |= uint8_t((Value >> 8) & 0xff); + Data[0] |= uint8_t((Value >> 16) & 0xff); + Data[1] |= uint8_t((Value >> 24) & 0xff); + Data[2] |= uint8_t(Value & 0xff); + Data[3] |= uint8_t((Value >> 8) & 0xff); } else { for (unsigned I = 0; I != NumBytes; I++) { unsigned Idx = IsLittleEndian ? 
I : (NumBytes - 1 - I); - Data[Offset + Idx] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (I * 8)) & 0xff); } } } diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h index 1c8516f..5d8826a 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h @@ -25,8 +25,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp index 5323be6..9a14c01 100644 --- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp @@ -78,8 +78,7 @@ public: ~DXILAsmBackend() override = default; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override {} + uint8_t *Data, uint64_t Value, bool IsResolved) override {} std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 7d3074b..1a0f1ab 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -402,8 +402,7 @@ public: } void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, - MutableArrayRef<char> Data, uint64_t FixupValue, - bool IsResolved) override; + uint8_t *Data, uint64_t FixupValue, bool IsResolved) override; bool isInstRelaxable(MCInst const &HMI) const { const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(*MCII, HMI); @@ -649,8 +648,7 @@ public: } // namespace void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, + const MCValue &Target, uint8_t *InstAddr, uint64_t FixupValue, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup)) IsResolved = false; @@ -667,10 +665,9 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // LLVM gives us an encoded value, we have to convert it back // to a real offset before we can use it. 
- uint32_t Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); - char *InstAddr = Data.data() + Offset; + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); Value = adjustFixupValue(Kind, FixupValue); if (!Value) @@ -757,8 +754,8 @@ void HexagonAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, uint32_t OldData = 0; for (unsigned i = 0; i < NumBytes; i++) OldData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8)); dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x"; - dbgs().write_hex(FixupValue) - << ": Offset=" << Offset << ": Size=" << Data.size() << ": OInst=0x"; + dbgs().write_hex(FixupValue) << ": Offset=" << Fixup.getOffset() + << ": Size=" << F.getSize() << ": OInst=0x"; dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc);); // For each byte of the fragment that the fixup touches, mask in the diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index 83d1697..3112dea 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -48,8 +48,7 @@ public: : MCAsmBackend(llvm::endianness::big), OSType(OST) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -72,9 +71,8 @@ bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, } void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (!IsResolved) Asm->getWriter().recordRelocation(F, Fixup, Target, Value); @@ -85,7 +83,6 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Where in the object and where the number of bytes that need // fixing up - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8; unsigned FullSize = 4; @@ -95,8 +92,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Load instruction and apply value for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = (FullSize - 1 - i); - CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Offset + Idx])) - << (i * 8); + CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Idx])) << (i * 8); } uint64_t Mask = @@ -106,7 +102,7 @@ void LanaiAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Write out the fixed up bytes back to the code/data bits. 
for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = (FullSize - 1 - i); - Data[Offset + Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff); + Data[Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index d9ea88c..fda9d97 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -131,19 +131,18 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, - MutableArrayRef<char> Data, uint64_t Value) { +static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data, + uint64_t Value) { unsigned I; - for (I = 0; I != Data.size() && Value; ++I, Value >>= 7) + for (I = 0; Value; ++I, Value >>= 7) Data[I] |= uint8_t(Value & 0x7f); if (Value) Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!"); } void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (IsResolved && shouldForceRelocation(Fixup, Target)) IsResolved = false; IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); @@ -166,14 +165,14 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned I = 0; I != NumBytes; ++I) { - Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[I] |= uint8_t((Value >> (I * 8)) & 0xff); } } @@ -274,15 +273,14 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, int64_t LineDelta = F.getDwarfLineDelta(); const MCExpr &AddrDelta = F.getDwarfAddrDelta(); - SmallVector<MCFixup, 1> Fixups; size_t OldSize = F.getVarSize(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) return false; - bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, *Asm); - assert(IsAbsolute && "CFA with invalid expression"); - (void)IsAbsolute; + [[maybe_unused]] bool IsAbsolute = + AddrDelta.evaluateKnownAbsolute(Value, *Asm); + assert(IsAbsolute); SmallVector<char> Data; raw_svector_ostream OS(Data); @@ -293,33 +291,23 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, encodeSLEB128(LineDelta, OS); } - unsigned Offset; - std::pair<MCFixupKind, MCFixupKind> FK; - // According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode // takes a single unsigned half (unencoded) operand. The maximum encodable // value is therefore 65535. Set a conservative upper bound for relaxation. + unsigned PCBytes; if (Value > 60000) { unsigned PtrSize = C.getAsmInfo()->getCodePointerSize(); - - OS << uint8_t(dwarf::DW_LNS_extended_op); - encodeULEB128(PtrSize + 1, OS); - - OS << uint8_t(dwarf::DW_LNE_set_address); - Offset = OS.tell(); assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size"); - FK = getRelocPairForSize(PtrSize == 4 ? 
32 : 64); + PCBytes = PtrSize; + OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PtrSize + 1) + << uint8_t(dwarf::DW_LNE_set_address); OS.write_zeros(PtrSize); } else { + PCBytes = 2; OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc); - Offset = OS.tell(); - FK = getRelocPairForSize(16); support::endian::write<uint16_t>(OS, 0, llvm::endianness::little); } - - const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta); - Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(FK))); - Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(FK))); + auto Offset = OS.tell() - PCBytes; if (LineDelta == INT64_MAX) { OS << uint8_t(dwarf::DW_LNS_extended_op); @@ -330,7 +318,8 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, } F.setVarContents(Data); - F.setVarFixups(Fixups); + F.setVarFixups({MCFixup::create(Offset, &AddrDelta, + MCFixup::getDataKindForSize(PCBytes))}); WasRelaxed = OldSize != Data.size(); return true; } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 3d929fc..1f13601 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -42,8 +42,7 @@ public: uint64_t &FixedValue, bool IsResolved); void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target); diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index 5e03903..fe83dc6 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -53,8 +53,7 @@ public: .Default(false)) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override; @@ -78,14 +77,13 @@ public: } // end anonymous namespace void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (!IsResolved) Asm->getWriter().recordRelocation(F, Fixup, Target, Value); unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); - assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!"); // Check that uppper bits are either all zeros or all ones. // Specifically ignore overflow/underflow as long as the leakage is // limited to the lower bits. 
This is to remain compatible with @@ -95,8 +93,7 @@ void M68kAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Write in Big Endian for (unsigned i = 0; i != Size; ++i) - Data[Fixup.getOffset() + i] = - uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8)); + Data[i] = uint8_t(static_cast<int64_t>(Value) >> ((Size - i - 1) * 8)); } /// cc—Carry clear GE—Greater than or equal diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp index 29e5bfa..d892b3a 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp @@ -36,8 +36,7 @@ public: ~MSP430AsmBackend() override = default; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { @@ -105,9 +104,8 @@ uint64_t MSP430AsmBackend::adjustFixupValue(const MCFixup &Fixup, } void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); Value = adjustFixupValue(Fixup, Value, getContext()); MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); @@ -117,15 +115,14 @@ void MSP430AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index c2169be..33aab71 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -283,9 +283,8 @@ static bool shouldForceRelocation(const MCFixup &Fixup) { /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (shouldForceRelocation(Fixup)) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); @@ -297,7 +296,6 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, return; // Doesn't change encoding. // Where do we start in the object - unsigned Offset = Fixup.getOffset(); // Number of bytes we need to fixup unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8; // Used to point to big endian bytes @@ -328,7 +326,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Idx = Endian == llvm::endianness::little ? (microMipsLEByteOrder ? 
calculateMMLEIndex(i) : i) : (FullSize - 1 - i); - CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8); + CurVal |= (uint64_t)((uint8_t)Data[Idx]) << (i * 8); } uint64_t Mask = ((uint64_t)(-1) >> @@ -340,7 +338,7 @@ void MipsAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, unsigned Idx = Endian == llvm::endianness::little ? (microMipsLEByteOrder ? calculateMMLEIndex(i) : i) : (FullSize - 1 - i); - Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff); + Data[Idx] = (uint8_t)((CurVal >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 816626d..40b5853 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -40,8 +40,7 @@ public: createObjectTargetWriter() const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index d9680c7..7a8395a 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() { void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_GPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); // fixup_Mips_GPREL32 desginates R_MIPS_GPREL32+R_MIPS_64 on MIPS64. S.addFixup(Value, Mips::fixup_Mips_GPREL32); S.appendContents(8, 0); @@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_DTPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); S.addFixup(Value, Mips::fixup_Mips_DTPREL64); S.appendContents(8, 0); } void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(4); S.addFixup(Value, Mips::fixup_Mips_TPREL32); S.appendContents(4, 0); } void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) { auto &S = getStreamer(); + S.ensureHeadroom(8); S.addFixup(Value, Mips::fixup_Mips_TPREL64); S.appendContents(8, 0); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 65d1be3..15f45a1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -382,6 +382,54 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, } } +// We return an EVT that can hold N VTs +// If the VT is a vector, the resulting EVT is a flat vector with the same +// element type as VT's element type. +static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) { + if (N == 1) + return VT; + + return VT.isVector() ? 
EVT::getVectorVT(C, VT.getScalarType(), + VT.getVectorNumElements() * N) + : EVT::getVectorVT(C, VT, N); +} + +static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, + const SDLoc &dl, SelectionDAG &DAG) { + if (V.getValueType() == VT) { + assert(I == 0 && "Index must be 0 for scalar value"); + return V; + } + + if (!VT.isVector()) + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V, + DAG.getVectorIdxConstant(I, dl)); + + return DAG.getNode( + ISD::EXTRACT_SUBVECTOR, dl, VT, V, + DAG.getVectorIdxConstant(I * VT.getVectorNumElements(), dl)); +} + +template <typename T> +static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, + SelectionDAG &DAG, T GetElement) { + if (N == 1) + return GetElement(0); + + SmallVector<SDValue, 8> Values; + for (const unsigned I : llvm::seq(N)) { + SDValue Val = GetElement(I); + if (Val.getValueType().isVector()) + DAG.ExtractVectorElements(Val, Values); + else + Values.push_back(Val); + } + + EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(), + Values.size()); + return DAG.getBuildVector(VT, dl, Values); +} + /// PromoteScalarIntegerPTX /// Used to make sure the arguments/returns are suitable for passing /// and promote them to a larger size if they're not. @@ -420,9 +468,10 @@ static EVT promoteScalarIntegerPTX(const EVT VT) { // parameter starting at index Idx using a single vectorized op of // size AccessSize. If so, it returns the number of param pieces // covered by the vector op. Otherwise, it returns 1. -static unsigned CanMergeParamLoadStoresStartingAt( +template <typename T> +static unsigned canMergeParamLoadStoresStartingAt( unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, - const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) { + const SmallVectorImpl<T> &Offsets, Align ParamAlignment) { // Can't vectorize if param alignment is not sufficient. if (ParamAlignment < AccessSize) @@ -472,10 +521,11 @@ static unsigned CanMergeParamLoadStoresStartingAt( // of the same size as ValueVTs indicating how each piece should be // loaded/stored (i.e. as a scalar, or as part of a vector // load/store). +template <typename T> static SmallVector<unsigned, 16> VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs, - const SmallVectorImpl<uint64_t> &Offsets, - Align ParamAlignment, bool IsVAArg = false) { + const SmallVectorImpl<T> &Offsets, Align ParamAlignment, + bool IsVAArg = false) { // Set vector size to match ValueVTs and mark all elements as // scalars by default. @@ -486,7 +536,7 @@ VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs, const auto GetNumElts = [&](unsigned I) -> unsigned { for (const unsigned AccessSize : {16, 8, 4, 2}) { - const unsigned NumElts = CanMergeParamLoadStoresStartingAt( + const unsigned NumElts = canMergeParamLoadStoresStartingAt( I, AccessSize, ValueVTs, Offsets, ParamAlignment); assert((NumElts == 1 || NumElts == 2 || NumElts == 4) && "Unexpected vectorization size"); @@ -1384,6 +1434,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Type *RetTy = CLI.RetTy; const CallBase *CB = CLI.CB; const DataLayout &DL = DAG.getDataLayout(); + LLVMContext &Ctx = *DAG.getContext(); const auto GetI32 = [&](const unsigned I) { return DAG.getConstant(I, dl, MVT::i32); @@ -1476,15 +1527,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const SDValue ParamSymbol = getCallParamSymbol(DAG, IsVAArg ? 
FirstVAArg : ArgI, MVT::i32); - SmallVector<EVT, 16> VTs; - SmallVector<uint64_t, 16> Offsets; - assert((!IsByVal || Arg.IndirectType) && "byval arg must have indirect type"); Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty); - ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset); - assert(VTs.size() == Offsets.size() && "Size mismatch"); - assert((IsByVal || VTs.size() == ArgOuts.size()) && "Size mismatch"); const Align ArgAlign = [&]() { if (IsByVal) { @@ -1492,17 +1537,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // so we don't need to worry whether it's naturally aligned or not. // See TargetLowering::LowerCallTo(). const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign(); - const Align ByValAlign = getFunctionByValParamAlign( - CB->getCalledFunction(), ETy, InitialAlign, DL); - if (IsVAArg) - VAOffset = alignTo(VAOffset, ByValAlign); - return ByValAlign; + return getFunctionByValParamAlign(CB->getCalledFunction(), ETy, + InitialAlign, DL); } return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL); }(); - const unsigned TypeSize = DL.getTypeAllocSize(ETy); - assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) && + const unsigned TySize = DL.getTypeAllocSize(ETy); + assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) && "type size mismatch"); const SDValue ArgDeclare = [&]() { @@ -1510,105 +1552,120 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return VADeclareParam; if (IsByVal || shouldPassAsArray(Arg.Ty)) - return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize); + return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize); assert(ArgOuts.size() == 1 && "We must pass only one value as non-array"); assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) && "Only int and float types are supported as non-array arguments"); - return MakeDeclareScalarParam(ParamSymbol, TypeSize); + return MakeDeclareScalarParam(ParamSymbol, TySize); }(); - // PTX Interoperability Guide 3.3(A): [Integer] Values shorter - // than 32-bits are sign extended or zero extended, depending on - // whether they are signed or unsigned types. This case applies - // only to scalar parameters and not to aggregate values. 
- const bool ExtendIntegerParam = - Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32; + if (IsByVal) { + assert(ArgOutVals.size() == 1 && "We must pass only one value as byval"); + SDValue SrcPtr = ArgOutVals[0]; + const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this); + const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign(); - const auto GetStoredValue = [&](const unsigned I, EVT EltVT, - const MaybeAlign PartAlign) { - if (IsByVal) { - SDValue Ptr = ArgOutVals[0]; - auto MPI = refinePtrAS(Ptr, DAG, DL, *this); - SDValue SrcAddr = - DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I])); - - return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign); + if (IsVAArg) + VAOffset = alignTo(VAOffset, ArgAlign); + + SmallVector<EVT, 4> ValueVTs, MemVTs; + SmallVector<TypeSize, 4> Offsets; + ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets); + + unsigned J = 0; + const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg); + for (const unsigned NumElts : VI) { + EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx); + Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]); + SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]); + SDValue SrcLoad = + DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign); + + TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset); + Align ParamAlign = commonAlignment(ArgAlign, ParamOffset); + SDValue ParamAddr = + DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset); + SDValue StoreParam = + DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), ParamAlign); + CallPrereqs.push_back(StoreParam); + + J += NumElts; } - SDValue StVal = ArgOutVals[I]; - assert(promoteScalarIntegerPTX(StVal.getValueType()) == - StVal.getValueType() && - "OutVal type should always be legal"); - - const EVT VTI = promoteScalarIntegerPTX(VTs[I]); - const EVT StoreVT = - ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); - - return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl); - }; - - const auto VectorInfo = - VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg); - - unsigned J = 0; - for (const unsigned NumElts : VectorInfo) { - const int CurOffset = Offsets[J]; - const EVT EltVT = promoteScalarIntegerPTX(VTs[J]); - - if (IsVAArg && !IsByVal) - // Align each part of the variadic argument to their type. - VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT)); - - assert((IsVAArg || VAOffset == 0) && - "VAOffset must be 0 for non-VA args"); + if (IsVAArg) + VAOffset += TySize; + } else { + SmallVector<EVT, 16> VTs; + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, DL, Arg.Ty, VTs, &Offsets, VAOffset); + assert(VTs.size() == Offsets.size() && "Size mismatch"); + assert(VTs.size() == ArgOuts.size() && "Size mismatch"); + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter + // than 32-bits are sign extended or zero extended, depending on + // whether they are signed or unsigned types. This case applies + // only to scalar parameters and not to aggregate values. + const bool ExtendIntegerParam = + Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32; + + const auto GetStoredValue = [&](const unsigned I) { + SDValue StVal = ArgOutVals[I]; + assert(promoteScalarIntegerPTX(StVal.getValueType()) == + StVal.getValueType() && + "OutVal type should always be legal"); + + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT StoreVT = + ExtendIntegerParam ? 
MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); + + return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl); + }; + + unsigned J = 0; + const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg); + for (const unsigned NumElts : VI) { + const EVT EltVT = promoteScalarIntegerPTX(VTs[J]); + + unsigned Offset; + if (IsVAArg) { + // TODO: We may need to support vector types that can be passed + // as scalars in variadic arguments. + assert(NumElts == 1 && + "Vectorization should be disabled for vaargs."); + + // Align each part of the variadic argument to their type. + VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT)); + Offset = VAOffset; + + const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; + VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx)); + } else { + assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args"); + Offset = Offsets[J]; + } - const unsigned Offset = - (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset)); - SDValue Ptr = - DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset)); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset)); - const MaybeAlign CurrentAlign = ExtendIntegerParam - ? MaybeAlign(std::nullopt) - : commonAlignment(ArgAlign, Offset); + const MaybeAlign CurrentAlign = ExtendIntegerParam + ? MaybeAlign(std::nullopt) + : commonAlignment(ArgAlign, Offset); - SDValue Val; - if (NumElts == 1) { - Val = GetStoredValue(J, EltVT, CurrentAlign); - } else { - SmallVector<SDValue, 8> StoreVals; - for (const unsigned K : llvm::seq(NumElts)) { - SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign); - if (ValJ.getValueType().isVector()) - DAG.ExtractVectorElements(ValJ, StoreVals); - else - StoreVals.push_back(ValJ); - } + SDValue Val = + getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) { + return GetStoredValue(J + K); + }); - EVT VT = EVT::getVectorVT( - *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size()); - Val = DAG.getBuildVector(VT, dl, StoreVals); - } + SDValue StoreParam = + DAG.getStore(ArgDeclare, dl, Val, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); + CallPrereqs.push_back(StoreParam); - SDValue StoreParam = - DAG.getStore(ArgDeclare, dl, Val, Ptr, - MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); - CallPrereqs.push_back(StoreParam); - - // TODO: We may need to support vector types that can be passed - // as scalars in variadic arguments. - if (IsVAArg && !IsByVal) { - assert(NumElts == 1 && - "Vectorization is expected to be disabled for variadics."); - const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; - VAOffset += - DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext())); + J += NumElts; } - - J += NumElts; } - if (IsVAArg && IsByVal) - VAOffset += TypeSize; } // Handle Result @@ -1676,17 +1733,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallPrereqs.push_back(PrototypeDeclare); } - if (ConvertToIndirectCall) { - // Copy the function ptr to a ptx register and use the register to call the - // function. - const MVT DestVT = Callee.getValueType().getSimpleVT(); - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - Register DestReg = MRI.createVirtualRegister(TLI.getRegClassFor(DestVT)); - auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee); - Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT); - } - const unsigned Proto = IsIndirectCall ? 
UniqueCallSite : 0; const unsigned NumArgs = std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size()); @@ -1703,10 +1749,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!Ins.empty()) { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); assert(VTs.size() == Ins.size() && "Bad value decomposition"); const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -1714,9 +1761,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const bool ExtendIntegerRetVal = RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; - const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); unsigned I = 0; - for (const unsigned NumElts : VectorInfo) { + const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign); + for (const unsigned NumElts : VI) { const MaybeAlign CurrentAlign = ExtendIntegerRetVal ? MaybeAlign(std::nullopt) : commonAlignment(RetAlign, Offsets[I]); @@ -1724,16 +1771,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const EVT VTI = promoteScalarIntegerPTX(VTs[I]); const EVT LoadVT = ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); - - const unsigned PackingAmt = - LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; - - const EVT VecVT = NumElts == 1 ? LoadVT - : EVT::getVectorVT(*DAG.getContext(), - LoadVT.getScalarType(), - NumElts * PackingAmt); - - const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx); SDValue Ptr = DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I])); @@ -1742,17 +1780,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); LoadChains.push_back(R.getValue(1)); - - if (NumElts == 1) - ProxyRegOps.push_back(R); - else - for (const unsigned J : llvm::seq(NumElts)) { - SDValue Elt = DAG.getNode( - LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR - : ISD::EXTRACT_VECTOR_ELT, - dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl)); - ProxyRegOps.push_back(Elt); - } + for (const unsigned J : llvm::seq(NumElts)) + ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG)); I += NumElts; } } @@ -3227,11 +3256,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); const DataLayout &DL = DAG.getDataLayout(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - const Function *F = &MF.getFunction(); + const Function &F = DAG.getMachineFunction().getFunction(); SDValue Root = DAG.getRoot(); SmallVector<SDValue, 16> OutChains; @@ -3247,7 +3275,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // See similar issue in LowerCall. 
auto AllIns = ArrayRef(Ins); - for (const auto &Arg : F->args()) { + for (const auto &Arg : F.args()) { const auto ArgIns = AllIns.take_while( [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); }); AllIns = AllIns.drop_front(ArgIns.size()); @@ -3287,7 +3315,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer"); SDValue P; - if (isKernelFunction(*F)) { + if (isKernelFunction(F)) { P = ArgSymbol; P.getNode()->setIROrder(Arg.getArgNo() + 1); } else { @@ -3305,43 +3333,27 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( assert(VTs.size() == Offsets.size() && "Size mismatch"); const Align ArgAlign = getFunctionArgumentAlignment( - F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL); + &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL); - const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); unsigned I = 0; - for (const unsigned NumElts : VectorInfo) { + const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); + for (const unsigned NumElts : VI) { // i1 is loaded/stored as i8 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I]; - // If the element is a packed type (ex. v2f16, v4i8, etc) holding - // multiple elements. - const unsigned PackingAmt = - LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; - - const EVT VecVT = - NumElts == 1 - ? LoadVT - : EVT::getVectorVT(F->getContext(), LoadVT.getScalarType(), - NumElts * PackingAmt); + const EVT VecVT = getVectorizedVT(LoadVT, NumElts, *DAG.getContext()); SDValue VecAddr = DAG.getObjectPtrOffset( dl, ArgSymbol, TypeSize::getFixed(Offsets[I])); - const MaybeAlign PartAlign = commonAlignment(ArgAlign, Offsets[I]); + const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]); SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, MachinePointerInfo(ADDRESS_SPACE_PARAM), PartAlign, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); - if (P.getNode()) - P.getNode()->setIROrder(Arg.getArgNo() + 1); + P.getNode()->setIROrder(Arg.getArgNo() + 1); for (const unsigned J : llvm::seq(NumElts)) { - SDValue Elt = - NumElts == 1 - ? P - : DAG.getNode(LoadVT.isVector() ? 
ISD::EXTRACT_SUBVECTOR - : ISD::EXTRACT_VECTOR_ELT, - dl, LoadVT, P, - DAG.getVectorIdxConstant(J * PackingAmt, dl)); + SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG); Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags, DAG, dl); @@ -3364,9 +3376,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { - const MachineFunction &MF = DAG.getMachineFunction(); - const Function &F = MF.getFunction(); - Type *RetTy = MF.getFunction().getReturnType(); + const Function &F = DAG.getMachineFunction().getFunction(); + Type *RetTy = F.getReturnType(); if (RetTy->isVoidTy()) { assert(OutVals.empty() && Outs.empty() && "Return value expected for void"); @@ -3374,10 +3385,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } const DataLayout &DL = DAG.getDataLayout(); - SmallVector<EVT, 16> VTs; - SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); - assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); + + const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32); + const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -3385,6 +3395,11 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const bool ExtendIntegerRetVal = RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + SmallVector<EVT, 16> VTs; + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); + assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); + const auto GetRetVal = [&](unsigned I) -> SDValue { SDValue RetVal = OutVals[I]; assert(promoteScalarIntegerPTX(RetVal.getValueType()) == @@ -3397,33 +3412,16 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl); }; - const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL); - const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); unsigned I = 0; - for (const unsigned NumElts : VectorInfo) { + const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign); + for (const unsigned NumElts : VI) { const MaybeAlign CurrentAlign = ExtendIntegerRetVal ? 
MaybeAlign(std::nullopt) : commonAlignment(RetAlign, Offsets[I]); - SDValue Val; - if (NumElts == 1) { - Val = GetRetVal(I); - } else { - SmallVector<SDValue, 4> StoreVals; - for (const unsigned J : llvm::seq(NumElts)) { - SDValue ValJ = GetRetVal(I + J); - if (ValJ.getValueType().isVector()) - DAG.ExtractVectorElements(ValJ, StoreVals); - else - StoreVals.push_back(ValJ); - } - - EVT VT = EVT::getVectorVT(F.getContext(), StoreVals[0].getValueType(), - StoreVals.size()); - Val = DAG.getBuildVector(VT, dl, StoreVals); - } + SDValue Val = getBuildVectorizedValue( + NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); }); - const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32); SDValue Ptr = DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I])); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index d8047d3..2ae7520 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1602,8 +1602,6 @@ foreach is_convergent = [0, 1] in { } defvar call_inst = !cast<NVPTXInst>("CALL" # convergent_suffix); - def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, globaladdr:$addr, imm:$proto), - (call_inst (to_tglobaladdr $addr), imm:$rets, imm:$params, imm:$proto)>; def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, i32:$addr, imm:$proto), (call_inst $addr, imm:$rets, imm:$params, imm:$proto)>; def : Pat<(call is_convergent, 1, imm:$rets, imm:$params, i64:$addr, imm:$proto), @@ -1612,10 +1610,6 @@ foreach is_convergent = [0, 1] in { defvar call_uni_inst = !cast<NVPTXInst>("CALL_UNI" # convergent_suffix); def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, globaladdr:$addr, 0), (call_uni_inst (to_tglobaladdr $addr), imm:$rets, imm:$params)>; - def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, i32:$addr, 0), - (call_uni_inst $addr, imm:$rets, imm:$params)>; - def : Pat<(call is_convergent, 0, imm:$rets, imm:$params, i64:$addr, 0), - (call_uni_inst $addr, imm:$rets, imm:$params)>; } def DECLARE_PARAM_array : diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 0e8828f..ec97e2e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -93,8 +93,8 @@ public: MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &Fixup, - const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsResolved) override; + const MCValue &Target, uint8_t *Data, uint64_t Value, + bool IsResolved) override; bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target) { // If there is a @ specifier, unless it is optimized out (e.g. constant @l), @@ -185,9 +185,8 @@ MCFixupKindInfo PPCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &TargetVal, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &TargetVal, uint8_t *Data, + uint64_t Value, bool IsResolved) { // In PPC64 ELFv1, .quad .TOC.@tocbase in the .opd section is expected to // reference the null symbol. auto Target = TargetVal; @@ -205,7 +204,6 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (!Value) return; // Doesn't change encoding. 
- unsigned Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); // For each byte of the fragment that the fixup touches, mask in the bits @@ -213,7 +211,7 @@ void PPCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // bitfields above. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1 - i); - Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (Idx * 8)) & 0xff); } } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 459525e..f179873 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -7296,9 +7296,17 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( if (!ArgVT.isVector() && !ValVT.isVector() && ArgVT.isInteger() && ValVT.isInteger() && ArgVT.getScalarSizeInBits() < ValVT.getScalarSizeInBits()) { - SDValue ArgValueTrunc = DAG.getNode( - ISD::TRUNCATE, dl, ArgVT.getSimpleVT() == MVT::i1 ? MVT::i8 : ArgVT, - ArgValue); + // It is possible to have either real integer values + // or integers that were not originally integers. + // In the latter case, these could have came from structs, + // and these integers would not have an extend on the parameter. + // Since these types of integers do not have an extend specified + // in the first place, the type of extend that we do should not matter. + EVT TruncatedArgVT = ArgVT.isSimple() && ArgVT.getSimpleVT() == MVT::i1 + ? MVT::i8 + : ArgVT; + SDValue ArgValueTrunc = + DAG.getNode(ISD::TRUNCATE, dl, TruncatedArgVT, ArgValue); SDValue ArgValueExt = ArgSignExt ? DAG.getSExtOrTrunc(ArgValueTrunc, dl, ValVT) : DAG.getZExtOrTrunc(ArgValueTrunc, dl, ValVT); diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp index 5eb1f01..b7e2263 100644 --- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp +++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp @@ -100,10 +100,14 @@ bool PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; @@ -189,10 +193,14 @@ bool PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, return TryCand.Reason != NoCand; // Keep clustered nodes together. - const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; - const ClusterInfo *TryCandCluster = TryCand.AtTop ? 
TopCluster : BotCluster; - if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU), - CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand, + unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; + unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; + bool CandIsClusterSucc = + isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); + bool TryCandIsClusterSucc = + isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); + + if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, Cluster)) return TryCand.Reason != NoCand; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 82e3b5c..eb7460e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -327,19 +327,19 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const { - MCContext &C = getContext(); - int64_t LineDelta = F.getDwarfLineDelta(); const MCExpr &AddrDelta = F.getDwarfAddrDelta(); - SmallVector<MCFixup, 1> Fixups; size_t OldSize = F.getVarSize(); int64_t Value; + // If the label difference can be resolved, use the default handling, which + // utilizes a shorter special opcode. + if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) + return false; [[maybe_unused]] bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, *Asm); assert(IsAbsolute && "CFA with invalid expression"); - Fixups.clear(); SmallVector<char> Data; raw_svector_ostream OS(Data); @@ -349,33 +349,21 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, encodeSLEB128(LineDelta, OS); } - unsigned Offset; - std::pair<MCFixupKind, MCFixupKind> Fixup; - // According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode // takes a single unsigned half (unencoded) operand. The maximum encodable // value is therefore 65535. Set a conservative upper bound for relaxation. 
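// Editorial aside, not part of the patch (byte values are hypothetical): for
// an address delta of 0x1234 and an 8-byte code pointer, the two encodings
// chosen below come out as
//   fixed_advance_pc path: 0x09 0x34 0x12
//     (DW_LNS_fixed_advance_pc followed by a raw little-endian uhalf)
//   set_address path:      0x00 0x09 0x02 <eight zero bytes>
//     (DW_LNS_extended_op, ULEB length PCBytes + 1, DW_LNE_set_address,
//      then the reserved address bytes)
// In both cases the operand bytes start at OS.tell() - PCBytes, which is
// where the single data fixup created further down is attached.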
+ unsigned PCBytes; if (Value > 60000) { - unsigned PtrSize = C.getAsmInfo()->getCodePointerSize(); - - OS << uint8_t(dwarf::DW_LNS_extended_op); - encodeULEB128(PtrSize + 1, OS); - - OS << uint8_t(dwarf::DW_LNE_set_address); - Offset = OS.tell(); - assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size"); - Fixup = RISCV::getRelocPairForSize(PtrSize); - OS.write_zeros(PtrSize); + PCBytes = getContext().getAsmInfo()->getCodePointerSize(); + OS << uint8_t(dwarf::DW_LNS_extended_op) << uint8_t(PCBytes + 1) + << uint8_t(dwarf::DW_LNE_set_address); + OS.write_zeros(PCBytes); } else { + PCBytes = 2; OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc); - Offset = OS.tell(); - Fixup = RISCV::getRelocPairForSize(2); support::endian::write<uint16_t>(OS, 0, llvm::endianness::little); } - - const MCBinaryExpr &MBE = cast<MCBinaryExpr>(AddrDelta); - Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(Fixup))); - Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(Fixup))); + auto Offset = OS.tell() - PCBytes; if (LineDelta == INT64_MAX) { OS << uint8_t(dwarf::DW_LNS_extended_op); @@ -386,7 +374,8 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, } F.setVarContents(Data); - F.setVarFixups(Fixups); + F.setVarFixups({MCFixup::create(Offset, &AddrDelta, + MCFixup::getDataKindForSize(PCBytes))}); WasRelaxed = OldSize != Data.size(); return true; } @@ -881,9 +870,8 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, } void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); MCFixupKind Kind = Fixup.getKind(); if (mc::isRelocation(Kind)) @@ -898,15 +886,14 @@ void RISCVAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, // Shift the value into position. Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8; - - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index d97d632..adec1ec 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -46,8 +46,7 @@ public: void maybeAddVendorReloc(const MCFragment &, const MCFixup &); void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index f816561c..98c8738 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -68,27 +68,6 @@ enum Fixups { fixup_riscv_invalid, NumTargetFixupKinds = fixup_riscv_invalid - FirstTargetFixupKind }; - -static inline std::pair<MCFixupKind, MCFixupKind> -getRelocPairForSize(unsigned Size) { - switch (Size) { - default: - llvm_unreachable("unsupported fixup size"); - case 1: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD8, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB8); - case 2: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD16, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB16); - case 4: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD32, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB32); - case 8: - return std::make_pair(FirstLiteralRelocationKind + ELF::R_RISCV_ADD64, - FirstLiteralRelocationKind + ELF::R_RISCV_SUB64); - } -} - } // end namespace llvm::RISCV #endif diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index f223fdbe..5998653 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2827,6 +2827,8 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, static bool isWorthFoldingAdd(SDValue Add) { for (auto *User : Add->users()) { if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE && + User->getOpcode() != RISCVISD::LD_RV32 && + User->getOpcode() != RISCVISD::SD_RV32 && User->getOpcode() != ISD::ATOMIC_LOAD && User->getOpcode() != ISD::ATOMIC_STORE) return false; @@ -2841,6 +2843,9 @@ static bool isWorthFoldingAdd(SDValue Add) { if (User->getOpcode() == ISD::ATOMIC_STORE && cast<AtomicSDNode>(User)->getVal() == Add) return false; + if (User->getOpcode() == RISCVISD::SD_RV32 && + (User->getOperand(0) == Add || User->getOperand(1) == Add)) + return false; if (isStrongerThanMonotonic(cast<MemSDNode>(User)->getSuccessOrdering())) return false; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c0ada51..adbfbeb 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1819,6 +1819,13 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::riscv_seg6_load_mask: case Intrinsic::riscv_seg7_load_mask: case Intrinsic::riscv_seg8_load_mask: + case Intrinsic::riscv_sseg2_load_mask: + case Intrinsic::riscv_sseg3_load_mask: + case Intrinsic::riscv_sseg4_load_mask: + 
case Intrinsic::riscv_sseg5_load_mask: + case Intrinsic::riscv_sseg6_load_mask: + case Intrinsic::riscv_sseg7_load_mask: + case Intrinsic::riscv_sseg8_load_mask: return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_seg2_store_mask: @@ -10938,6 +10945,97 @@ static inline SDValue getVCIXISDNodeVOID(SDValue &Op, SelectionDAG &DAG, return DAG.getNode(Type, SDLoc(Op), Op.getValueType(), Operands); } +static SDValue +lowerFixedVectorSegLoadIntrinsics(unsigned IntNo, SDValue Op, + const RISCVSubtarget &Subtarget, + SelectionDAG &DAG) { + bool IsStrided; + switch (IntNo) { + case Intrinsic::riscv_seg2_load_mask: + case Intrinsic::riscv_seg3_load_mask: + case Intrinsic::riscv_seg4_load_mask: + case Intrinsic::riscv_seg5_load_mask: + case Intrinsic::riscv_seg6_load_mask: + case Intrinsic::riscv_seg7_load_mask: + case Intrinsic::riscv_seg8_load_mask: + IsStrided = false; + break; + case Intrinsic::riscv_sseg2_load_mask: + case Intrinsic::riscv_sseg3_load_mask: + case Intrinsic::riscv_sseg4_load_mask: + case Intrinsic::riscv_sseg5_load_mask: + case Intrinsic::riscv_sseg6_load_mask: + case Intrinsic::riscv_sseg7_load_mask: + case Intrinsic::riscv_sseg8_load_mask: + IsStrided = true; + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + }; + + static const Intrinsic::ID VlsegInts[7] = { + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask}; + static const Intrinsic::ID VlssegInts[7] = { + Intrinsic::riscv_vlsseg2_mask, Intrinsic::riscv_vlsseg3_mask, + Intrinsic::riscv_vlsseg4_mask, Intrinsic::riscv_vlsseg5_mask, + Intrinsic::riscv_vlsseg6_mask, Intrinsic::riscv_vlsseg7_mask, + Intrinsic::riscv_vlsseg8_mask}; + + SDLoc DL(Op); + unsigned NF = Op->getNumValues() - 1; + assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); + MVT XLenVT = Subtarget.getXLenVT(); + MVT VT = Op->getSimpleValueType(0); + MVT ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget); + unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * + ContainerVT.getScalarSizeInBits(); + EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); + + // Operands: (chain, int_id, pointer, mask, vl) or + // (chain, int_id, pointer, offset, mask, vl) + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = Op.getOperand(Op.getNumOperands() - 2); + MVT MaskVT = Mask.getSimpleValueType(); + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + + SDValue IntID = DAG.getTargetConstant( + IsStrided ? VlssegInts[NF - 2] : VlsegInts[NF - 2], DL, XLenVT); + auto *Load = cast<MemIntrinsicSDNode>(Op); + + SDVTList VTs = DAG.getVTList({VecTupTy, MVT::Other}); + SmallVector<SDValue, 9> Ops = { + Load->getChain(), + IntID, + DAG.getUNDEF(VecTupTy), + Op.getOperand(2), + Mask, + VL, + DAG.getTargetConstant( + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT), + DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; + // Insert the stride operand. 
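// Editorial aside: after the insertion at index 4 the strided form carries
//   (chain, id, passthru, ptr, stride, mask, vl, policy, sew)
// mirroring the masked vlsseg intrinsic's operand order, while the
// unit-stride form keeps
//   (chain, id, passthru, ptr, mask, vl, policy, sew).
// The "offset" operand named in the operand-list comment above is this byte
// stride.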
+ if (IsStrided) + Ops.insert(std::next(Ops.begin(), 4), Op.getOperand(3)); + + SDValue Result = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, + Load->getMemoryVT(), Load->getMemOperand()); + SmallVector<SDValue, 9> Results; + for (unsigned int RetIdx = 0; RetIdx < NF; RetIdx++) { + SDValue SubVec = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT, + Result.getValue(0), + DAG.getTargetConstant(RetIdx, DL, MVT::i32)); + Results.push_back(convertFromScalableVector(VT, SubVec, DAG, Subtarget)); + } + Results.push_back(Result.getValue(1)); + return DAG.getMergeValues(Results, DL); +} + SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); @@ -10950,57 +11048,16 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::riscv_seg5_load_mask: case Intrinsic::riscv_seg6_load_mask: case Intrinsic::riscv_seg7_load_mask: - case Intrinsic::riscv_seg8_load_mask: { - SDLoc DL(Op); - static const Intrinsic::ID VlsegInts[7] = { - Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, - Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, - Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, - Intrinsic::riscv_vlseg8_mask}; - unsigned NF = Op->getNumValues() - 1; - assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); - MVT XLenVT = Subtarget.getXLenVT(); - MVT VT = Op->getSimpleValueType(0); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - unsigned Sz = NF * ContainerVT.getVectorMinNumElements() * - ContainerVT.getScalarSizeInBits(); - EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - - // Operands: (chain, int_id, pointer, mask, vl) - SDValue VL = Op.getOperand(Op.getNumOperands() - 1); - SDValue Mask = Op.getOperand(3); - MVT MaskVT = Mask.getSimpleValueType(); - MVT MaskContainerVT = - ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); - Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); - - SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT); - auto *Load = cast<MemIntrinsicSDNode>(Op); + case Intrinsic::riscv_seg8_load_mask: + case Intrinsic::riscv_sseg2_load_mask: + case Intrinsic::riscv_sseg3_load_mask: + case Intrinsic::riscv_sseg4_load_mask: + case Intrinsic::riscv_sseg5_load_mask: + case Intrinsic::riscv_sseg6_load_mask: + case Intrinsic::riscv_sseg7_load_mask: + case Intrinsic::riscv_sseg8_load_mask: + return lowerFixedVectorSegLoadIntrinsics(IntNo, Op, Subtarget, DAG); - SDVTList VTs = DAG.getVTList({VecTupTy, MVT::Other}); - SDValue Ops[] = { - Load->getChain(), - IntID, - DAG.getUNDEF(VecTupTy), - Op.getOperand(2), - Mask, - VL, - DAG.getTargetConstant( - RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT), - DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; - SDValue Result = - DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, - Load->getMemoryVT(), Load->getMemOperand()); - SmallVector<SDValue, 9> Results; - for (unsigned int RetIdx = 0; RetIdx < NF; RetIdx++) { - SDValue SubVec = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, ContainerVT, - Result.getValue(0), - DAG.getTargetConstant(RetIdx, DL, MVT::i32)); - Results.push_back(convertFromScalableVector(VT, SubVec, DAG, Subtarget)); - } - Results.push_back(Result.getValue(1)); - return DAG.getMergeValues(Results, DL); - } case Intrinsic::riscv_sf_vc_v_x_se: return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_X_SE); case Intrinsic::riscv_sf_vc_v_i_se: diff --git 
a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 31ea2de..cc2977c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -910,7 +910,7 @@ foreach vti = AllIntegerVectors in { foreach vti = I64IntegerVectors in { let Predicates = [HasVInstructionsI64] in { def : Pat<(add (vti.Vector vti.RegClass:$rs1), - (vti.Vector (SplatPat_imm64_neg i64:$rs2))), + (vti.Vector (SplatPat_imm64_neg (i64 GPR:$rs2)))), (!cast<Instruction>("PseudoVSUB_VX_"#vti.LMul.MX) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 695223b..acbccdd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2123,7 +2123,7 @@ foreach vti = AllIntegerVectors in { foreach vti = I64IntegerVectors in { let Predicates = [HasVInstructionsI64] in { def : Pat<(riscv_add_vl (vti.Vector vti.RegClass:$rs1), - (vti.Vector (SplatPat_imm64_neg i64:$rs2)), + (vti.Vector (SplatPat_imm64_neg (i64 GPR:$rs2))), vti.RegClass:$passthru, (vti.Mask VMV0:$vm), VLOpFrag), (!cast<Instruction>("PseudoVSUB_VX_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index c0f7ab1..4c31ce4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -590,12 +590,12 @@ let Predicates = [HasVendorXTHeadBb, IsRV64] in { def : PatGprImm<riscv_rorw, TH_SRRIW, uimm5>; def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2), (TH_SRRIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>; -def : Pat<(sra (bswap i64:$rs1), (i64 32)), - (TH_REVW i64:$rs1)>; -def : Pat<(binop_allwusers<srl> (bswap i64:$rs1), (i64 32)), - (TH_REVW i64:$rs1)>; -def : Pat<(riscv_clzw i64:$rs1), - (TH_FF0 (i64 (SLLI (i64 (XORI i64:$rs1, -1)), 32)))>; +def : Pat<(i64 (sra (bswap GPR:$rs1), (i64 32))), + (TH_REVW GPR:$rs1)>; +def : Pat<(binop_allwusers<srl> (bswap GPR:$rs1), (i64 32)), + (TH_REVW GPR:$rs1)>; +def : Pat<(riscv_clzw GPR:$rs1), + (TH_FF0 (i64 (SLLI (i64 (XORI GPR:$rs1, -1)), 32)))>; } // Predicates = [HasVendorXTHeadBb, IsRV64] let Predicates = [HasVendorXTHeadBs] in { @@ -697,11 +697,13 @@ def uimm2_4 : Operand<XLenVT>, ImmLeaf<XLenVT, [{ }], uimm2_4_XFORM>; let Predicates = [HasVendorXTHeadMemPair, IsRV64] in { -def : Pat<(th_lwud i64:$rs1, uimm2_3:$uimm2_3), (TH_LWUD i64:$rs1, uimm2_3:$uimm2_3, 3)>; -def : Pat<(th_ldd i64:$rs1, uimm2_4:$uimm2_4), (TH_LDD i64:$rs1, uimm2_4:$uimm2_4, 4)>; +def : Pat<(th_lwud GPR:$rs1, (i64 uimm2_3:$uimm2_3)), + (TH_LWUD GPR:$rs1, uimm2_3:$uimm2_3, 3)>; +def : Pat<(th_ldd GPR:$rs1, (i64 uimm2_4:$uimm2_4)), + (TH_LDD GPR:$rs1, uimm2_4:$uimm2_4, 4)>; -def : Pat<(th_sdd i64:$rd1, i64:$rd2, i64:$rs1, uimm2_4:$uimm2_4), - (TH_SDD i64:$rd1, i64:$rd2, i64:$rs1, uimm2_4:$uimm2_4, 4)>; +def : Pat<(th_sdd (i64 GPR:$rd1), GPR:$rd2, GPR:$rs1, uimm2_4:$uimm2_4), + (TH_SDD GPR:$rd1, GPR:$rd2, GPR:$rs1, uimm2_4:$uimm2_4, 4)>; } let Predicates = [HasVendorXTHeadMemPair] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index a250ac8..1efe616 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -1128,13 +1128,13 @@ let Predicates = [HasStdExtZvkned] in { let Predicates = [HasStdExtZvknha] in { 
defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32IntegerVectors>; defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors, isSEWAware=true>; } // Predicates = [HasStdExtZvknha] let Predicates = [HasStdExtZvknhb] in { defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32I64IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32I64IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32I64IntegerVectors>; defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors, isSEWAware=true>; } // Predicates = [HasStdExtZvknhb] diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp index ef84d43..5710cf2 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVAsmBackend.cpp @@ -21,8 +21,7 @@ public: SPIRVAsmBackend(llvm::endianness Endian) : MCAsmBackend(Endian) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override {} + uint8_t *Data, uint64_t Value, bool IsResolved) override {} std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 6ec7544..25cdf72 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -148,6 +148,7 @@ struct ConvertBuiltin { bool IsSaturated; bool IsRounded; bool IsBfloat16; + bool IsTF32; FPRoundingMode::FPRoundingMode RoundingMode; }; @@ -230,6 +231,7 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall, // - "__spirv_SubgroupImageMediaBlockReadINTEL" // - "__spirv_SubgroupImageMediaBlockWriteINTEL" // - "__spirv_Convert" + // - "__spirv_Round" // - "__spirv_UConvert" // - "__spirv_SConvert" // - "__spirv_FConvert" @@ -242,7 +244,7 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall, "SDotKHR|SUDotKHR|SDotAccSatKHR|UDotAccSatKHR|SUDotAccSatKHR|" "ReadClockKHR|SubgroupBlockReadINTEL|SubgroupImageBlockReadINTEL|" "SubgroupImageMediaBlockReadINTEL|SubgroupImageMediaBlockWriteINTEL|" - "Convert|" + "Convert|Round|" "UConvert|SConvert|FConvert|SatConvert)[^_]*)(_R[^_]*_?(\\w+)?.*)?"); std::smatch Match; if (std::regex_match(BuiltinName, Match, SpvWithR) && Match.size() > 1) { @@ -697,7 +699,8 @@ static bool buildAtomicStoreInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { if (Call->isSpirvOp()) - return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicStore, Call, Register(0)); + return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicStore, Call, + Register(0)); Register ScopeRegister = buildConstantIntReg32(SPIRV::Scope::Device, MIRBuilder, GR); @@ -2677,8 +2680,20 @@ static bool generateConvertInst(const StringRef DemangledCall, } } else if (GR->isScalarOrVectorOfType(Call->ReturnRegister, SPIRV::OpTypeFloat)) { - // Float -> Float - Opcode = SPIRV::OpFConvert; + if (Builtin->IsTF32) { + const auto *ST = static_cast<const SPIRVSubtarget *>( + &MIRBuilder.getMF().getSubtarget()); + if (!ST->canUseExtension( + SPIRV::Extension::SPV_INTEL_tensor_float32_conversion)) + NeedExtMsg = 
"SPV_INTEL_tensor_float32_conversion"; + IsRightComponentsNumber = + GR->getScalarOrVectorComponentCount(Call->Arguments[0]) == + GR->getScalarOrVectorComponentCount(Call->ReturnRegister); + Opcode = SPIRV::OpRoundFToTF32INTEL; + } else { + // Float -> Float + Opcode = SPIRV::OpFConvert; + } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index ea78dcd..d08560b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1461,6 +1461,8 @@ class ConvertBuiltin<string name, InstructionSet set> { bit IsRounded = !not(!eq(!find(name, "_rt"), -1)); bit IsBfloat16 = !or(!not(!eq(!find(name, "BF16"), -1)), !not(!eq(!find(name, "bfloat16"), -1))); + bit IsTF32 = !or(!not(!eq(!find(name, "TF32"), -1)), + !not(!eq(!find(name, "tensor_float32"), -1))); FPRoundingMode RoundingMode = !cond(!not(!eq(!find(name, "_rte"), -1)) : RTE, !not(!eq(!find(name, "_rtz"), -1)) : RTZ, !not(!eq(!find(name, "_rtp"), -1)) : RTP, @@ -1472,7 +1474,7 @@ class ConvertBuiltin<string name, InstructionSet set> { def ConvertBuiltins : GenericTable { let FilterClass = "ConvertBuiltin"; let Fields = ["Name", "Set", "IsDestinationSigned", "IsSaturated", - "IsRounded", "IsBfloat16", "RoundingMode"]; + "IsRounded", "IsBfloat16", "IsTF32", "RoundingMode"]; string TypeOf_Set = "InstructionSet"; string TypeOf_RoundingMode = "FPRoundingMode"; } @@ -1556,6 +1558,25 @@ foreach conv = ["FToBF16INTEL", "BF16ToFINTEL"] in { def : ConvertBuiltin<!strconcat("__spirv_Convert", conv), OpenCL_std>; } +// cl_intel_tensor_float32_conversions / SPV_INTEL_tensor_float32_conversion +// Multiclass used to define at the same time both a demangled builtin record +// and a corresponding convert builtin record. +multiclass DemangledTF32RoundBuiltin<string name1, string name2> { + // Create records for scalar and vector conversions. + foreach i = ["", "2", "3", "4", "8", "16"] in { + def : DemangledBuiltin<!strconcat("intel_round_", name1, i, name2, i), OpenCL_std, Convert, 1, 1>; + def : ConvertBuiltin<!strconcat("intel_round_", name1, i, name2, i), OpenCL_std>; + } +} + +defm : DemangledTF32RoundBuiltin<"tensor_float32", "_as_float">; +defm : DemangledTF32RoundBuiltin<"as_tensor_float32", "_float">; + +foreach conv = ["FToTF32INTEL"] in { + def : DemangledBuiltin<!strconcat("__spirv_Round", conv), OpenCL_std, Convert, 1, 1>; + def : ConvertBuiltin<!strconcat("__spirv_Round", conv), OpenCL_std>; +} + //===----------------------------------------------------------------------===// // Class defining a vector data load/store builtin record used for lowering // into OpExtInst instruction. 
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 2726203..d9265f4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -102,7 +102,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_2d_block_io}, {"SPV_INTEL_int4", SPIRV::Extension::Extension::SPV_INTEL_int4}, {"SPV_KHR_float_controls2", - SPIRV::Extension::Extension::SPV_KHR_float_controls2}}; + SPIRV::Extension::Extension::SPV_KHR_float_controls2}, + {"SPV_INTEL_tensor_float32_conversion", + SPIRV::Extension::Extension::SPV_INTEL_tensor_float32_conversion}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 3c631ce..947b574 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -194,6 +194,42 @@ class SPIRVEmitIntrinsics void useRoundingMode(ConstrainedFPIntrinsic *FPI, IRBuilder<> &B); + // Tries to walk the type accessed by the given GEP instruction. + // For each nested type access, one of the two callbacks is called: + // - OnLiteralIndexing when the index is a known constant value. + // Parameters: + // PointedType: the pointed type resulting from this indexing. + // Index: index of the element in the parent type. + // If the parent type is an array, this is the index in the array. + // If the parent type is a struct, this is the field index. + // - OnDynamicIndexing when the index is a non-constant value. + // This callback is only called when indexing into an array. + // Parameters: + // ElementType: the type of the elements stored in the parent array. + // Offset: the Value* containing the byte offset into the array. + // Returns true if an error occurred during the walk, false otherwise. + bool walkLogicalAccessChain( + GetElementPtrInst &GEP, + const std::function<void(Type *PointedType, uint64_t Index)> + &OnLiteralIndexing, + const std::function<void(Type *ElementType, Value *Offset)> + &OnDynamicIndexing); + + // Returns the type accessed using the given GEP instruction by relying + // on the GEP type. + // FIXME: GEP types are not supposed to be used to retrieve the pointed + // type. This must be fixed. + Type *getGEPType(GetElementPtrInst *GEP); + + // Returns the type accessed using the given GEP instruction by walking + // the source type using the GEP indices. + // FIXME: without help from the frontend, this method cannot reliably retrieve + // the stored type, nor can it robustly determine the depth of the type + // we are accessing. + Type *getGEPTypeLogical(GetElementPtrInst *GEP); + + Instruction *buildLogicalAccessChainFromGEP(GetElementPtrInst &GEP); + public: static char ID; SPIRVEmitIntrinsics(SPIRVTargetMachine *TM = nullptr) @@ -246,6 +282,17 @@ bool expectIgnoredInIRTranslation(const Instruction *I) { } } +// Returns the source pointer from `I` ignoring intermediate ptrcast.
+Value *getPointerRoot(Value *I) { + if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::spv_ptrcast) { + Value *V = II->getArgOperand(0); + return getPointerRoot(V); + } + } + return I; +} + } // namespace char SPIRVEmitIntrinsics::ID = 0; @@ -555,7 +602,112 @@ void SPIRVEmitIntrinsics::maybeAssignPtrType(Type *&Ty, Value *Op, Type *RefTy, Ty = RefTy; } -Type *getGEPType(GetElementPtrInst *Ref) { +bool SPIRVEmitIntrinsics::walkLogicalAccessChain( + GetElementPtrInst &GEP, + const std::function<void(Type *, uint64_t)> &OnLiteralIndexing, + const std::function<void(Type *, Value *)> &OnDynamicIndexing) { + // We only rewrite i8* GEPs; others should be left as-is. + // A valid i8* GEP must always have a single index. + assert(GEP.getSourceElementType() == + IntegerType::getInt8Ty(CurrF->getContext())); + assert(GEP.getNumIndices() == 1); + + auto &DL = CurrF->getDataLayout(); + Value *Src = getPointerRoot(GEP.getPointerOperand()); + Type *CurType = deduceElementType(Src, true); + + Value *Operand = *GEP.idx_begin(); + ConstantInt *CI = dyn_cast<ConstantInt>(Operand); + if (!CI) { + ArrayType *AT = dyn_cast<ArrayType>(CurType); + // Operand is not constant. Either we have an array and accept it, or we + // give up. + if (AT) + OnDynamicIndexing(AT->getElementType(), Operand); + return AT == nullptr; + } + + assert(CI); + uint64_t Offset = CI->getZExtValue(); + + do { + if (ArrayType *AT = dyn_cast<ArrayType>(CurType)) { + uint32_t EltTypeSize = DL.getTypeSizeInBits(AT->getElementType()) / 8; + assert(Offset < AT->getNumElements() * EltTypeSize); + uint64_t Index = Offset / EltTypeSize; + Offset = Offset - (Index * EltTypeSize); + CurType = AT->getElementType(); + OnLiteralIndexing(CurType, Index); + } else if (StructType *ST = dyn_cast<StructType>(CurType)) { + uint32_t StructSize = DL.getTypeSizeInBits(ST) / 8; + assert(Offset < StructSize); + (void)StructSize; + const auto &STL = DL.getStructLayout(ST); + unsigned Element = STL->getElementContainingOffset(Offset); + Offset -= STL->getElementOffset(Element); + CurType = ST->getElementType(Element); + OnLiteralIndexing(CurType, Element); + } else { + // Vector type indexing should not use GEP. + // So if we have an index left, something is wrong. Giving up.
+ return true; + } + } while (Offset > 0); + + return false; +} + +Instruction * +SPIRVEmitIntrinsics::buildLogicalAccessChainFromGEP(GetElementPtrInst &GEP) { + auto &DL = CurrF->getDataLayout(); + IRBuilder<> B(GEP.getParent()); + B.SetInsertPoint(&GEP); + + std::vector<Value *> Indices; + Indices.push_back(ConstantInt::get( + IntegerType::getInt32Ty(CurrF->getContext()), 0, /* Signed= */ false)); + walkLogicalAccessChain( + GEP, + [&Indices, &B](Type *EltType, uint64_t Index) { + Indices.push_back( + ConstantInt::get(B.getInt64Ty(), Index, /* Signed= */ false)); + }, + [&Indices, &B, &DL](Type *EltType, Value *Offset) { + uint32_t EltTypeSize = DL.getTypeSizeInBits(EltType) / 8; + Value *Index = B.CreateUDiv( + Offset, ConstantInt::get(Offset->getType(), EltTypeSize, + /* Signed= */ false)); + Indices.push_back(Index); + }); + + SmallVector<Type *, 2> Types = {GEP.getType(), GEP.getOperand(0)->getType()}; + SmallVector<Value *, 4> Args; + Args.push_back(B.getInt1(GEP.isInBounds())); + Args.push_back(GEP.getOperand(0)); + llvm::append_range(Args, Indices); + auto *NewI = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); + replaceAllUsesWithAndErase(B, &GEP, NewI); + return NewI; +} + +Type *SPIRVEmitIntrinsics::getGEPTypeLogical(GetElementPtrInst *GEP) { + + Type *CurType = GEP->getResultElementType(); + + bool Interrupted = walkLogicalAccessChain( + *GEP, [&CurType](Type *EltType, uint64_t Index) { CurType = EltType; }, + [&CurType](Type *EltType, Value *Index) { CurType = EltType; }); + + return Interrupted ? GEP->getResultElementType() : CurType; +} + +Type *SPIRVEmitIntrinsics::getGEPType(GetElementPtrInst *Ref) { + if (Ref->getSourceElementType() == + IntegerType::getInt8Ty(CurrF->getContext()) && + TM->getSubtargetImpl()->isLogicalSPIRV()) { + return getGEPTypeLogical(Ref); + } + Type *Ty = nullptr; // TODO: not sure if GetElementPtrInst::getTypeAtIndex() does anything // useful here @@ -1395,6 +1547,13 @@ Instruction *SPIRVEmitIntrinsics::visitSwitchInst(SwitchInst &I) { } Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) { + if (I.getSourceElementType() == IntegerType::getInt8Ty(CurrF->getContext()) && + TM->getSubtargetImpl()->isLogicalSPIRV()) { + Instruction *Result = buildLogicalAccessChainFromGEP(I); + if (Result) + return Result; + } + IRBuilder<> B(I.getParent()); B.SetInsertPoint(&I); SmallVector<Type *, 2> Types = {I.getType(), I.getOperand(0)->getType()}; @@ -1588,7 +1747,24 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I, } if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { Value *Pointer = GEPI->getPointerOperand(); - Type *OpTy = GEPI->getSourceElementType(); + Type *OpTy = nullptr; + + // Knowing the accessed type is mandatory for logical SPIR-V. Sadly, + // the GEP source element type should not be used for this purpose, and + // the alternative type-scavenging method is not working. + // Physical SPIR-V can work around this, but not logical, hence we still + // try to rely on the broken type scavenging for logical SPIR-V. + bool IsRewrittenGEP = + GEPI->getSourceElementType() == IntegerType::getInt8Ty(I->getContext()); + if (IsRewrittenGEP && TM->getSubtargetImpl()->isLogicalSPIRV()) { + Value *Src = getPointerRoot(Pointer); + OpTy = GR->findDeducedElementType(Src); + } + + // In all cases, fall back to the GEP type if type scavenging failed.
+ if (!OpTy) + OpTy = GEPI->getSourceElementType(); + replacePointerOperandWithPtrCast(I, Pointer, OpTy, 0, B); if (isNestedPointer(OpTy)) insertTodoType(Pointer); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 049ba02..f0b938d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -445,6 +445,9 @@ def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938 def OpConvertFToBF16INTEL : UnOp<"OpConvertFToBF16INTEL", 6116>; def OpConvertBF16ToFINTEL : UnOp<"OpConvertBF16ToFINTEL", 6117>; +// SPV_INTEL_tensor_float32_conversion +def OpRoundFToTF32INTEL : UnOp<"OpRoundFToTF32INTEL", 6426>; + // 3.42.12 Composite Instructions def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx), diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index ad976e5..0cd9d78 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1564,6 +1564,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::BFloat16ConversionINTEL); } break; + case SPIRV::OpRoundFToTF32INTEL: + if (ST.canUseExtension( + SPIRV::Extension::SPV_INTEL_tensor_float32_conversion)) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_tensor_float32_conversion); + Reqs.addCapability(SPIRV::Capability::TensorFloat32RoundingINTEL); + } + break; case SPIRV::OpVariableLengthArrayINTEL: case SPIRV::OpSaveMemoryINTEL: case SPIRV::OpRestoreMemoryINTEL: diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 548e9b7..614e83a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -320,6 +320,7 @@ defm SPV_INTEL_subgroup_matrix_multiply_accumulate : ExtensionOperand<121>; defm SPV_INTEL_2d_block_io : ExtensionOperand<122>; defm SPV_INTEL_int4 : ExtensionOperand<123>; defm SPV_KHR_float_controls2 : ExtensionOperand<124>; +defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -529,6 +530,7 @@ defm Subgroup2DBlockTransformINTEL : CapabilityOperand<6229, 0, 0, [SPV_INTEL_2d defm Subgroup2DBlockTransposeINTEL : CapabilityOperand<6230, 0, 0, [SPV_INTEL_2d_block_io], [Subgroup2DBlockIOINTEL]>; defm Int4TypeINTEL : CapabilityOperand<5112, 0, 0, [SPV_INTEL_int4], []>; defm Int4CooperativeMatrixINTEL : CapabilityOperand<5114, 0, 0, [SPV_INTEL_int4], [Int4TypeINTEL, CooperativeMatrixKHR]>; +defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tensor_float32_conversion], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index ba023af..bc60842 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -127,8 +127,7 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> 
Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override { @@ -253,21 +252,19 @@ MCFixupKindInfo SparcAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void SparcAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); if (!IsResolved) return; Value = adjustFixupValue(Fixup.getKind(), Value); unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - unsigned Offset = Fixup.getOffset(); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1) - i; - Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Idx] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index d5f8492..d692cbe 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -113,8 +113,7 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; }; @@ -152,20 +151,18 @@ MCFixupKindInfo SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { if (Target.getSpecifier()) IsResolved = false; maybeAddReloc(F, Fixup, Target, Value, IsResolved); MCFixupKind Kind = Fixup.getKind(); if (mc::isRelocation(Kind)) return; - unsigned Offset = Fixup.getOffset(); unsigned BitSize = getFixupKindInfo(Kind).TargetSize; unsigned Size = (BitSize + 7) / 8; - assert(Offset + Size <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!"); // Big-endian insertion of Size bytes. 
Value = extractBitsForFixup(Kind, Value, Fixup, getContext()); @@ -173,7 +170,7 @@ void SystemZMCAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value &= ((uint64_t)1 << BitSize) - 1; unsigned ShiftValue = (Size * 8) - 8; for (unsigned I = 0; I != Size; ++I) { - Data[Offset + I] |= uint8_t(Value >> ShiftValue); + Data[I] |= uint8_t(Value >> ShiftValue); ShiftValue -= 8; } } diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp index f987621..c1b9d9f 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp @@ -112,8 +112,7 @@ public: } void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, - MutableArrayRef<char>, uint64_t Value, - bool IsResolved) override; + uint8_t *, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override { @@ -152,7 +151,7 @@ public: } // end anonymous namespace void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, MutableArrayRef<char> Data, + const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { switch (Fixup.getKind()) { case VE::fixup_ve_tls_gd_hi32: @@ -173,14 +172,14 @@ void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Value <<= Info.TargetOffset; unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits // from the fixup value. The Value has been "split up" into the // appropriate bitfields above. for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = Endian == llvm::endianness::little ? i : (NumBytes - 1) - i; - Data[Offset + Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); + Data[Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 837fd8e..eecef31 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -39,7 +39,7 @@ public: MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, bool) override; + uint8_t *Data, uint64_t Value, bool) override; std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; @@ -80,8 +80,7 @@ bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, void WebAssemblyAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, + const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { if (!IsResolved) Asm->getWriter().recordRelocation(F, Fixup, Target, Value); @@ -96,13 +95,13 @@ void WebAssemblyAsmBackend::applyFixup(const MCFragment &F, // Shift the value into position. 
Value <<= Info.TargetOffset; - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + NumBytes <= F.getSize() && + "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. for (unsigned I = 0; I != NumBytes; ++I) - Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff); + Data[I] |= uint8_t((Value >> (I * 8)) & 0xff); } std::unique_ptr<MCObjectTargetWriter> diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 7f9d474..1f02e56 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -174,8 +174,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override; @@ -676,9 +675,8 @@ std::optional<bool> X86AsmBackend::evaluateFixup(const MCFragment &, } void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { // Force relocation when there is a specifier. This might be too conservative // - GAS doesn't emit a relocation for call local@plt; local:. if (Target.getSpecifier()) @@ -690,7 +688,7 @@ void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, return; unsigned Size = getFixupKindSize(Kind); - assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= F.getSize() && "Invalid fixup offset!"); int64_t SignedValue = static_cast<int64_t>(Value); if (IsResolved && Fixup.isPCRel()) { @@ -710,7 +708,7 @@ void X86AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, } for (unsigned i = 0; i != Size; ++i) - Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); + Data[i] = uint8_t(Value >> (i * 8)); } bool X86AsmBackend::mayNeedRelaxation(unsigned Opcode, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bbbb1d9..ce4c061 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23486,7 +23486,6 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, } // Try to shrink i64 compares if the input has enough zero bits. - // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)? if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) && Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) && @@ -23496,6 +23495,16 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); } + // Try to shrink all i64 compares if the inputs are representable as signed + // i32. + if (CmpVT == MVT::i64 && + Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. 
+ DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) { + CmpVT = MVT::i32; + Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0); + Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); + } + // 0-x == y --> x+y == 0 // 0-x != y --> x+y != 0 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) && diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp index 9167794..08936ad 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp @@ -37,8 +37,7 @@ public: std::optional<bool> evaluateFixup(const MCFragment &, MCFixup &, MCValue &, uint64_t &) override; void applyFixup(const MCFragment &, const MCFixup &, const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) override; + uint8_t *Data, uint64_t Value, bool IsResolved) override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; @@ -153,9 +152,8 @@ std::optional<bool> XtensaAsmBackend::evaluateFixup(const MCFragment &F, } void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) { + const MCValue &Target, uint8_t *Data, + uint64_t Value, bool IsResolved) { maybeAddReloc(F, Fixup, Target, Value, IsResolved); MCContext &Ctx = getContext(); MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); @@ -168,11 +166,10 @@ void XtensaAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (!Value) return; // Doesn't change encoding. - unsigned Offset = Fixup.getOffset(); unsigned FullSize = getSize(Fixup.getKind()); for (unsigned i = 0; i != FullSize; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[i] |= uint8_t((Value >> (i * 8)) & 0xff); } } diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 126be71..e5d2e1c 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -444,6 +444,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["bitop3-insts"] = true; Features["prng-inst"] = true; Features["tanh-insts"] = true; + Features["tensor-cvt-lut-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; Features["bf16-cvt-insts"] = true; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 3c24d2e..01da012 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -13404,7 +13404,7 @@ struct AAAllocationInfoImpl : public AAAllocationInfo { return indicatePessimisticFixpoint(); if (BinSize == 0) { - auto NewAllocationSize = std::optional<TypeSize>(TypeSize(0, false)); + auto NewAllocationSize = std::make_optional<TypeSize>(0, false); if (!changeAllocationSize(NewAllocationSize)) return ChangeStatus::UNCHANGED; return ChangeStatus::CHANGED; @@ -13422,8 +13422,7 @@ struct AAAllocationInfoImpl : public AAAllocationInfo { if (SizeOfBin >= *AllocationSize) return indicatePessimisticFixpoint(); - auto NewAllocationSize = - std::optional<TypeSize>(TypeSize(SizeOfBin * 8, false)); + auto NewAllocationSize = std::make_optional<TypeSize>(SizeOfBin * 8, false); if (!changeAllocationSize(NewAllocationSize)) return ChangeStatus::UNCHANGED; diff --git 
a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index c009c1e..b8c99f1 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -99,6 +99,9 @@ STATISTIC(SkippedCallsCloning, "Number of calls skipped during cloning due to unexpected operand"); STATISTIC(MismatchedCloneAssignments, "Number of callsites assigned to call multiple non-matching clones"); +STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes"); +STATISTIC(TotalMergeIters, "Number of merge iterations for nodes"); +STATISTIC(MaxMergeIters, "Max merge iterations for nodes"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -109,6 +112,11 @@ static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false), cl::Hidden, cl::desc("Export graph to dot files.")); +// TODO: Remove this option once new handling is validated more widely. +static cl::opt<bool> DoMergeIteration( + "memprof-merge-iteration", cl::init(true), cl::Hidden, + cl::desc("Iteratively apply merging on a node to catch new callers")); + // How much of the graph to export to dot. enum DotScope { All, // The full CCG graph. @@ -3995,7 +4003,7 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { - auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction(); + auto *CurF = getCalleeFunc(CallerCall.call()); auto NewCalleeCloneNo = CalleeFunc.cloneNo(); if (isMemProfClone(*CurF)) { // If we already assigned this callsite to call a specific non-default @@ -4191,16 +4199,36 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones( if (!Inserted.second) return; - // Make a copy since the recursive call may move a caller edge to a new - // callee, messing up the iterator. - auto CallerEdges = Node->CallerEdges; - for (auto CallerEdge : CallerEdges) { - // Skip any caller edge moved onto a different callee during recursion. - if (CallerEdge->Callee != Node) - continue; - mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode); + // Iteratively perform merging on this node to handle new caller nodes created + // during the recursive traversal. We could do something more elegant such as + // maintaining a worklist, but this is a simple approach that doesn't cause a + // measurable compile time effect, as most nodes don't have many caller + // edges to check. + bool FoundUnvisited = true; + unsigned Iters = 0; + while (FoundUnvisited) { + Iters++; + FoundUnvisited = false; + // Make a copy since the recursive call may move a caller edge to a new + // callee, messing up the iterator. + auto CallerEdges = Node->CallerEdges; + for (auto CallerEdge : CallerEdges) { + // Skip any caller edge moved onto a different callee during recursion. + if (CallerEdge->Callee != Node) + continue; + // If we found an unvisited caller, note that we should check the caller + // edges again as mergeClones may add or change caller nodes. + if (DoMergeIteration && !Visited.contains(CallerEdge->Caller)) + FoundUnvisited = true; + mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode); + } } + TotalMergeInvokes++; + TotalMergeIters += Iters; + if (Iters > MaxMergeIters) + MaxMergeIters = Iters; + // Merge for this node after we handle its callers.
mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 00b877b..fe0f308 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -462,6 +462,13 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { return ScalarPHI; } + // If SrcVec is a subvector starting at index 0, extract from the + // wider source vector + Value *V; + if (match(SrcVec, + m_Intrinsic<Intrinsic::vector_extract>(m_Value(V), m_Zero()))) + return ExtractElementInst::Create(V, Index); + // TODO come up with a n-ary matcher that subsumes both unary and // binary matchers. UnaryOperator *UO; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 56358b1..5ee3bb1 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2011,12 +2011,17 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, NewPN->addIncoming(NewPhiValues[i], PN->getIncomingBlock(i)); if (IdenticalUsers) { - for (User *U : make_early_inc_range(PN->users())) { + // Collect and deduplicate users up-front to avoid iterator invalidation. + SmallSetVector<Instruction *, 4> ToReplace; + for (User *U : PN->users()) { Instruction *User = cast<Instruction>(U); if (User == &I) continue; - replaceInstUsesWith(*User, NewPN); - eraseInstFromFunction(*User); + ToReplace.insert(User); + } + for (Instruction *I : ToReplace) { + replaceInstUsesWith(*I, NewPN); + eraseInstFromFunction(*I); } OneUse = true; } @@ -2654,9 +2659,18 @@ static Instruction *canonicalizeGEPOfConstGEPI8(GetElementPtrInst &GEP, APInt NewOffset = TypeSize * *C2 + *C1; if (NewOffset.isZero() || (Src->hasOneUse() && GEP.getOperand(1)->hasOneUse())) { + GEPNoWrapFlags Flags = GEPNoWrapFlags::none(); + if (GEP.hasNoUnsignedWrap() && + cast<GEPOperator>(Src)->hasNoUnsignedWrap() && + match(GEP.getOperand(1), m_NUWAddLike(m_Value(), m_Value()))) { + Flags |= GEPNoWrapFlags::noUnsignedWrap(); + if (GEP.isInBounds() && cast<GEPOperator>(Src)->isInBounds()) + Flags |= GEPNoWrapFlags::inBounds(); + } + Value *GEPConst = - IC.Builder.CreatePtrAdd(Base, IC.Builder.getInt(NewOffset)); - return GetElementPtrInst::Create(BaseType, GEPConst, VarIndex); + IC.Builder.CreatePtrAdd(Base, IC.Builder.getInt(NewOffset), "", Flags); + return GetElementPtrInst::Create(BaseType, GEPConst, VarIndex, Flags); } return nullptr; diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 68094c3..c3f80f9 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2508,6 +2508,12 @@ static bool hoistGEP(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, if (!GEP) return false; + // Do not try to hoist a constant GEP out of the loop via reassociation. + // Constant GEPs can often be folded into addressing modes, and reassociating + // them may inhibit CSE of a common base. 
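// Editorial aside (hypothetical IR shapes): for
//   %src = getelementptr i8, ptr %base, i64 %iv   ; varies per iteration
//   %gep = getelementptr i8, ptr %src, i64 16     ; all-constant indices
// reassociating so that a "%base + 16" could be hoisted buys little: the +16
// usually folds into a reg+imm addressing mode anyway, and other users of
// %src would lose %base as their common, CSE-able base.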
+ if (GEP->hasAllConstantIndices()) + return false; + auto *Src = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand()); if (!Src || !Src->hasOneUse() || !L.contains(Src)) return false; diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index f3e992c..04039b8 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -1009,7 +1009,8 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, // in simplified form, and also needs LCSSA. Running // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. - LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr); + LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr, + &AR.AC); for (Loop *InnerLoop : LN.getLoops()) { auto *OuterLoop = InnerLoop->getParentLoop(); if (!OuterLoop) diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 4f2bfb0..448dc2b 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -551,7 +551,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, const Function *F = L.getHeader()->getParent(); OptimizationRemarkEmitter ORE(F); - LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr); + LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC); if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 320b792..6ffe841 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -79,8 +79,7 @@ // ld.global.f32 %f4, [%rl6+132]; // much better // // Another improvement enabled by the LowerGEP flag is to lower a GEP with -// multiple indices to either multiple GEPs with a single index or arithmetic -// operations (depending on whether the target uses alias analysis in codegen). +// multiple indices to multiple GEPs with a single index. // Such transformation can have following benefits: // (1) It can always extract constants in the indices of structure type. // (2) After such Lowering, there are more optimization opportunities such as @@ -88,59 +87,33 @@ // // E.g. The following GEPs have multiple indices: // BB1: -// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3 +// %p = getelementptr [10 x %struct], ptr %ptr, i64 %i, i64 %j1, i32 3 // load %p // ... // BB2: -// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2 +// %p2 = getelementptr [10 x %struct], ptr %ptr, i64 %i, i64 %j1, i32 2 // load %p2 // ... // // We can not do CSE to the common part related to index "i64 %i". Lowering // GEPs can achieve such goals. -// If the target does not use alias analysis in codegen, this pass will -// lower a GEP with multiple indices into arithmetic operations: -// BB1: -// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity -// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %3 = add i64 %1, %2 ; CSE opportunity -// %4 = mul i64 %j1, length_of_struct -// %5 = add i64 %3, %4 -// %6 = add i64 %3, struct_field_3 ; Constant offset -// %p = inttoptr i64 %6 to i32* -// load %p -// ... 
-//  BB2:
-//    %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
-//    %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-//    %9 = add i64 %7, %8 ; CSE opportunity
-//    %10 = mul i64 %j2, length_of_struct
-//    %11 = add i64 %9, %10
-//    %12 = add i64 %11, struct_field_2 ; Constant offset
-//    %p = inttoptr i64 %12 to i32*
-//    load %p2
-//    ...
 //
-// If the target uses alias analysis in codegen, this pass will lower a GEP
-// with multiple indices into multiple GEPs with a single index:
+// This pass will lower a GEP with multiple indices into multiple GEPs with a
+// single index:
 //  BB1:
-//    %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
-//    %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-//    %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
+//    %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+//    %3 = getelementptr i8, ptr %ptr, i64 %2 ; CSE opportunity
 //    %4 = mul i64 %j1, length_of_struct
-//    %5 = getelementptr i8* %3, i64 %4
-//    %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
-//    %p = bitcast i8* %6 to i32*
+//    %5 = getelementptr i8, ptr %3, i64 %4
+//    %p = getelementptr i8, ptr %5, struct_field_3 ; Constant offset
 //    load %p
 //    ...
 //  BB2:
-//    %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
-//    %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-//    %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
+//    %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+//    %9 = getelementptr i8, ptr %ptr, i64 %8 ; CSE opportunity
 //    %10 = mul i64 %j2, length_of_struct
-//    %11 = getelementptr i8* %9, i64 %10
-//    %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
-//    %p2 = bitcast i8* %12 to i32*
+//    %11 = getelementptr i8, ptr %9, i64 %10
+//    %p2 = getelementptr i8, ptr %11, struct_field_2 ; Constant offset
 //    load %p2
 //    ...
 //
@@ -408,16 +381,6 @@ private:
   void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
                               int64_t AccumulativeByteOffset);
 
-  /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
-  /// Function splitGEP already split the original GEP into a variadic part and
-  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
-  /// variadic part into a set of arithmetic operations and applies
-  /// AccumulativeByteOffset to it.
-  /// \p Variadic The variadic part of the original GEP.
-  /// \p AccumulativeByteOffset The constant offset.
-  void lowerToArithmetics(GetElementPtrInst *Variadic,
-                          int64_t AccumulativeByteOffset);
-
   /// Finds the constant offset within each index and accumulates them. If
   /// LowerGEP is true, it finds in indices of both sequential and structure
   /// types, otherwise it only finds in sequential indices. The output
@@ -951,55 +914,6 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
   Variadic->eraseFromParent();
 }
 
-void
-SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
-                                               int64_t AccumulativeByteOffset) {
-  IRBuilder<> Builder(Variadic);
-  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
-  assert(IntPtrTy == DL->getIndexType(Variadic->getType()) &&
-         "Pointer type must match index type for arithmetic-based lowering of "
-         "split GEPs");
-
-  Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
-  gep_type_iterator GTI = gep_type_begin(*Variadic);
-  // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We
-  // don't create arithmetics for structure indices, as they are accumulated
-  // in the constant offset index.
-  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
-    if (GTI.isSequential()) {
-      Value *Idx = Variadic->getOperand(I);
-      // Skip zero indices.
-      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
-        if (CI->isZero())
-          continue;
-
-      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
-                                GTI.getSequentialElementStride(*DL));
-      // Scale the index by element size.
-      if (ElementSize != 1) {
-        if (ElementSize.isPowerOf2()) {
-          Idx = Builder.CreateShl(
-              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
-        } else {
-          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
-        }
-      }
-      // Create an ADD for each index.
-      ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
-    }
-  }
-
-  // Create an ADD for the constant offset index.
-  if (AccumulativeByteOffset != 0) {
-    ResultPtr = Builder.CreateAdd(
-        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
-  }
-
-  ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
-  Variadic->replaceAllUsesWith(ResultPtr);
-  Variadic->eraseFromParent();
-}
-
 bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                             TargetTransformInfo &TTI) {
   auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
@@ -1091,8 +1005,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // Notice that we don't remove struct field indices here. If LowerGEP is
   // disabled, a structure index is not accumulated and we still use the old
   // one. If LowerGEP is enabled, a structure index is accumulated in the
-  // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
-  // handle the constant offset and won't need a new structure index.
+  // constant offset. LowerToSingleIndexGEPs will later handle the constant
+  // offset and won't need a new structure index.
   gep_type_iterator GTI = gep_type_begin(*GEP);
   for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
     if (GTI.isSequential()) {
@@ -1167,22 +1081,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 
   GEP->setNoWrapFlags(NewGEPFlags);
 
-  // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+  // Lowers a GEP to GEPs with a single index.
   if (LowerGEP) {
-    // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
-    // arithmetic operations if the target uses alias analysis in codegen.
-    // Additionally, pointers that aren't integral (and so can't be safely
-    // converted to integers) or those whose offset size is different from their
-    // pointer size (which means that doing integer arithmetic on them could
-    // affect that data) can't be lowered in this way.
-    unsigned AddrSpace = GEP->getPointerAddressSpace();
-    bool PointerHasExtraData = DL->getPointerSizeInBits(AddrSpace) !=
-                               DL->getIndexSizeInBits(AddrSpace);
-    if (TTI.useAA() || DL->isNonIntegralAddressSpace(AddrSpace) ||
-        PointerHasExtraData)
-      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
-    else
-      lowerToArithmetics(GEP, AccumulativeByteOffset);
+    lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
     return true;
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 969d225..c47fd942 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1665,13 +1665,12 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
 
   // Keep a record of all the exiting blocks.
   SmallVector<const SCEVPredicate *, 4> Predicates;
-  std::optional<std::pair<BasicBlock *, BasicBlock *>> SingleUncountableEdge;
+  BasicBlock *SingleUncountableExitingBlock = nullptr;
 
   for (BasicBlock *BB : ExitingBlocks) {
     const SCEV *EC = PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates);
     if (isa<SCEVCouldNotCompute>(EC)) {
-      SmallVector<BasicBlock *, 2> Succs(successors(BB));
-      if (Succs.size() != 2) {
+      if (size(successors(BB)) != 2) {
         reportVectorizationFailure(
             "Early exiting block does not have exactly two successors",
             "Incorrect number of successors from early exiting block",
@@ -1679,15 +1678,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
         return false;
       }
 
-      BasicBlock *ExitBlock;
-      if (!TheLoop->contains(Succs[0]))
-        ExitBlock = Succs[0];
-      else {
-        assert(!TheLoop->contains(Succs[1]));
-        ExitBlock = Succs[1];
-      }
-
-      if (SingleUncountableEdge) {
+      if (SingleUncountableExitingBlock) {
         reportVectorizationFailure(
             "Loop has too many uncountable exits",
             "Cannot vectorize early exit loop with more than one early exit",
@@ -1695,7 +1686,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
         return false;
       }
 
-      SingleUncountableEdge = {BB, ExitBlock};
+      SingleUncountableExitingBlock = BB;
     } else
       CountableExitingBlocks.push_back(BB);
   }
@@ -1705,7 +1696,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   // PSE.getSymbolicMaxBackedgeTakenCount() below.
   Predicates.clear();
 
-  if (!SingleUncountableEdge) {
+  if (!SingleUncountableExitingBlock) {
     LLVM_DEBUG(dbgs() << "LV: Cound not find any uncountable exits");
     return false;
   }
@@ -1713,7 +1704,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   // The only supported early exit loops so far are ones where the early
   // exiting block is a unique predecessor of the latch block.
   BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
-  if (LatchPredBB != SingleUncountableEdge->first) {
+  if (LatchPredBB != SingleUncountableExitingBlock) {
     reportVectorizationFailure("Early exit is not the latch predecessor",
                                "Cannot vectorize early exit loop",
                                "EarlyExitNotLatchPredecessor", ORE, TheLoop);
@@ -1766,7 +1757,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   }
 
   // The vectoriser cannot handle loads that occur after the early exit block.
-  assert(LatchBB->getUniquePredecessor() == SingleUncountableEdge->first &&
+  assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
          "Expected latch predecessor to be the early exiting block");
 
   // TODO: Handle loops that may fault.
@@ -1789,7 +1780,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
                        "backedge taken count: " << *SymbolicMaxBTC << '\n');
 
-  UncountableEdge = SingleUncountableEdge;
+  UncountableExitingBB = SingleUncountableExitingBlock;
   return true;
 }
 
@@ -1861,7 +1852,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
       return false;
   } else {
     if (!isVectorizableEarlyExitLoop()) {
-      UncountableEdge = std::nullopt;
+      assert(!hasUncountableEarlyExit() &&
+             "Must be false without vectorizable early-exit loop");
       if (DoExtraAnalysis)
         Result = false;
       else
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 850c4a1..b4ea70e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1624,7 +1624,7 @@ private:
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
-  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
+  MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
 
   /// Holds the instructions known to be uniform after vectorization.
   /// The data is collected per VF.
@@ -9788,6 +9788,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
               match(
                   P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
                   m_SpecificInt(0)) &&
+              any_of(P.incoming_values(),
+                     [&EPI](Value *Inc) {
+                       return Inc == EPI.VectorTripCount;
+                     }) &&
              all_of(P.incoming_values(), [&EPI](Value *Inc) {
                return Inc == EPI.VectorTripCount ||
                       match(Inc, m_SpecificInt(0));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index b77aa9d..c79485c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -231,6 +231,13 @@ vp_post_order_shallow(VPBlockBase *G) {
 }
 
 /// Returns an iterator range to traverse the graph starting at \p G in
+/// post order while traversing through region blocks.
+inline iterator_range<po_iterator<VPBlockDeepTraversalWrapper<VPBlockBase *>>>
+vp_post_order_deep(VPBlockBase *G) {
+  return post_order(VPBlockDeepTraversalWrapper<VPBlockBase *>(G));
+}
+
+/// Returns an iterator range to traverse the graph starting at \p G in
 /// depth-first order while traversing through region blocks.
 inline iterator_range<df_iterator<VPBlockDeepTraversalWrapper<VPBlockBase *>>>
 vp_depth_first_deep(VPBlockBase *G) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d249a34..98d11f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2836,12 +2836,12 @@ static void scalarizeInstruction(const Instruction *Instr,
   Instruction *Cloned = Instr->clone();
   if (!IsVoidRetTy) {
     Cloned->setName(Instr->getName() + ".cloned");
-#if !defined(NDEBUG)
-    // Verify that VPlan type inference results agree with the type of the
-    // generated values.
-    assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
-           "inferred type and type from generated instructions do not match");
-#endif
+    Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
+    // The operands of the replicate recipe may have been narrowed, resulting in
+    // a narrower result type. Update the type of the cloned instruction to the
+    // correct type.
+    if (ResultTy != Cloned->getType())
+      Cloned->mutateType(ResultTy);
   }
 
   RepRecipe->applyFlags(*Cloned);
@@ -3548,6 +3548,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     // Vectorize the interleaved store group.
     Value *MaskForGaps = createBitMaskForGaps(
         State.Builder, State.VF.getKnownMinValue(), *Group);
+    assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
+           "Mismatch between NeedsMaskForGaps and MaskForGaps");
     assert((!MaskForGaps || !State.VF.isScalable()) &&
            "masking gaps for scalable vectors is not yet supported.");
     ArrayRef<VPValue *> StoredValues = getStoredValues();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 18d331b..373b2f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -545,10 +545,8 @@ static bool isDeadRecipe(VPRecipeBase &R) {
 }
 
 void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
-  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
-      Plan.getEntry());
-
-  for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_post_order_deep(Plan.getEntry()))) {
     // The recipes in the block are processed in reverse order, to catch chains
     // of dead recipes.
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
@@ -1413,7 +1411,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
 static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
                                       ElementCount BestVF, unsigned BestUF,
                                       ScalarEvolution &SE) {
-  if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
+  if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue())))
     return any_of(Cond->getDefiningRecipe()->operands(),
                   [&Plan, BestVF, BestUF, &SE](VPValue *C) {
                     return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
@@ -1431,15 +1429,15 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   // count is not conveniently available as SCEV so far, so we compare directly
   // against the original trip count. This is stricter than necessary, as we
   // will only return true if the trip count == vector trip count.
-  // TODO: Use SCEV for vector trip count once available, to cover cases where
-  // vector trip count == UF * VF, but original trip count != UF * VF.
-  const SCEV *TripCount =
-      vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
-  assert(!isa<SCEVCouldNotCompute>(TripCount) &&
+  const SCEV *VectorTripCount =
+      vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
+  if (isa<SCEVCouldNotCompute>(VectorTripCount))
+    VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
+  assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
          "Trip count SCEV must be computable");
   ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
-  const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
-  return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
+  const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
+  return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
 }
 
 /// Try to simplify the branch condition of \p Plan. This may restrict the
@@ -2563,7 +2561,8 @@ void VPlanTransforms::createInterleaveGroups(
   }
 
   bool NeedsMaskForGaps =
-      IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
+      (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
+      (!StoredValues.empty() && !IG->isFull());
 
   Instruction *IRInsertPos = IG->getInsertPos();
   auto *InsertPos =
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 81bd21b..14f20c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -73,8 +73,11 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
 }
 
 const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
-  if (V->isLiveIn())
-    return SE.getSCEV(V->getLiveInIRValue());
+  if (V->isLiveIn()) {
+    if (Value *LiveIn = V->getLiveInIRValue())
+      return SE.getSCEV(LiveIn);
+    return SE.getCouldNotCompute();
+  }
 
   // TODO: Support constructing SCEVs for more recipes as needed.
   return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())