Diffstat (limited to 'llvm/lib')
109 files changed, 5609 insertions, 1013 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 853bd66..a572eef 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1582,6 +1582,23 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } +/// Returns the absolute value of \p A. In the context of dependence analysis, +/// we need an absolute value in a mathematical sense. If \p A is the signed +/// minimum value, we cannot represent it unless extending the original type. +/// Thus if we cannot prove that \p A is not the signed minimum value, returns +/// nullptr. +static const SCEV *absSCEVNoSignedOverflow(const SCEV *A, ScalarEvolution &SE) { + IntegerType *Ty = cast<IntegerType>(A->getType()); + if (!Ty) + return nullptr; + + const SCEV *SMin = + SE.getConstant(APInt::getSignedMinValue(Ty->getBitWidth())); + if (!SE.isKnownPredicate(CmpInst::ICMP_NE, A, SMin)) + return nullptr; + return SE.getAbsExpr(A, /*IsNSW=*/true); +} + /// Returns true iff \p Test is enabled. static bool isDependenceTestEnabled(DependenceTestType Test) { if (EnableDependenceTest == DependenceTestType::All) @@ -1669,21 +1686,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n"); // check that |Delta| < iteration count - if (const SCEV *UpperBound = - collectUpperBound(CurSrcLoop, Delta->getType())) { + bool IsDeltaLarge = [&] { + const SCEV *UpperBound = collectUpperBound(CurSrcLoop, Delta->getType()); + if (!UpperBound) + return false; + LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound); LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n"); - const SCEV *AbsDelta = - SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); - const SCEV *AbsCoeff = - SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff); + const SCEV *AbsDelta = absSCEVNoSignedOverflow(Delta, *SE); + const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE); + if (!AbsDelta || !AbsCoeff) + return false; const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); - if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) { - // Distance greater than trip count - no dependence - ++StrongSIVindependence; - ++StrongSIVsuccesses; - return true; - } + return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product); + }(); + if (IsDeltaLarge) { + // Distance greater than trip count - no dependence + ++StrongSIVindependence; + ++StrongSIVsuccesses; + return true; } // Can we compute distance? @@ -2259,6 +2280,9 @@ bool DependenceInfo::weakZeroSrcSIVtest( const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff); if (!ConstCoeff) return false; + + // Since ConstCoeff is constant, !isKnownNegative means it's non-negative. + // TODO: Bail out if it's a signed minimum value. const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; @@ -2369,6 +2393,9 @@ bool DependenceInfo::weakZeroDstSIVtest( const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff); if (!ConstCoeff) return false; + + // Since ConstCoeff is constant, !isKnownNegative means it's non-negative. + // TODO: Bail out if it's a signed minimum value. const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff) ? 
SE->getNegativeSCEV(ConstCoeff) : ConstCoeff; diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 92a5b6f..b09f4ed 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -241,9 +241,13 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack, ColdBytes += TotalSize; // If we have the max cold context size from summary information and have // requested identification of contexts above a percentage of the max, see - // if this context qualifies. - if (MaxColdSize > 0 && MinPercentMaxColdSize < 100 && - TotalSize * 100 >= MaxColdSize * MinPercentMaxColdSize) + // if this context qualifies. We should assume this is large if we rebuilt + // the trie from existing metadata (i.e. to update after inlining), in + // which case we don't have a MaxSize from the profile - we assume any + // context size info in existence on the metadata should be propagated. + if (BuiltFromExistingMetadata || + (MaxColdSize > 0 && MinPercentMaxColdSize < 100 && + TotalSize * 100 >= MaxColdSize * MinPercentMaxColdSize)) LargeColdContext = true; } // Only add the context size info as metadata if we need it in the thin diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b425b95..1f10478 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -391,19 +391,6 @@ void CombinerHelper::applyCombineConcatVectors( MI.eraseFromParent(); } -bool CombinerHelper::matchCombineShuffleToBuildVector(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && - "Invalid instruction"); - auto &Shuffle = cast<GShuffleVector>(MI); - - Register SrcVec1 = Shuffle.getSrc1Reg(); - Register SrcVec2 = Shuffle.getSrc2Reg(); - - LLT SrcVec1Type = MRI.getType(SrcVec1); - LLT SrcVec2Type = MRI.getType(SrcVec2); - return SrcVec1Type.isVector() && SrcVec2Type.isVector(); -} - void CombinerHelper::applyCombineShuffleToBuildVector(MachineInstr &MI) const { auto &Shuffle = cast<GShuffleVector>(MI); @@ -535,11 +522,9 @@ bool CombinerHelper::matchCombineShuffleVector( LLT DstType = MRI.getType(MI.getOperand(0).getReg()); Register Src1 = MI.getOperand(1).getReg(); LLT SrcType = MRI.getType(Src1); - // As bizarre as it may look, shuffle vector can actually produce - // scalar! This is because at the IR level a <1 x ty> shuffle - // vector is perfectly valid. - unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1; - unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1; + + unsigned DstNumElts = DstType.getNumElements(); + unsigned SrcNumElts = SrcType.getNumElements(); // If the resulting vector is smaller than the size of the source // vectors being concatenated, we won't be able to replace the @@ -556,7 +541,7 @@ bool CombinerHelper::matchCombineShuffleVector( // // TODO: If the size between the source and destination don't match // we could still emit an extract vector element in that case. 
- if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1) + if (DstNumElts < 2 * SrcNumElts) return false; // Check that the shuffle mask can be broken evenly between the @@ -619,39 +604,6 @@ void CombinerHelper::applyCombineShuffleVector( MI.eraseFromParent(); } -bool CombinerHelper::matchShuffleToExtract(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && - "Invalid instruction kind"); - - ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); - return Mask.size() == 1; -} - -void CombinerHelper::applyShuffleToExtract(MachineInstr &MI) const { - Register DstReg = MI.getOperand(0).getReg(); - Builder.setInsertPt(*MI.getParent(), MI); - - int I = MI.getOperand(3).getShuffleMask()[0]; - Register Src1 = MI.getOperand(1).getReg(); - LLT Src1Ty = MRI.getType(Src1); - int Src1NumElts = Src1Ty.isVector() ? Src1Ty.getNumElements() : 1; - Register SrcReg; - if (I >= Src1NumElts) { - SrcReg = MI.getOperand(2).getReg(); - I -= Src1NumElts; - } else if (I >= 0) - SrcReg = Src1; - - if (I < 0) - Builder.buildUndef(DstReg); - else if (!MRI.getType(SrcReg).isVector()) - Builder.buildCopy(DstReg, SrcReg); - else - Builder.buildExtractVectorElementConstant(DstReg, SrcReg, I); - - MI.eraseFromParent(); -} - namespace { /// Select a preference between two uses. CurrentUse is the current preference @@ -8369,7 +8321,7 @@ bool CombinerHelper::matchShuffleDisjointMask(MachineInstr &MI, return false; ArrayRef<int> Mask = Shuffle.getMask(); - const unsigned NumSrcElems = Src1Ty.isVector() ? Src1Ty.getNumElements() : 1; + const unsigned NumSrcElems = Src1Ty.getNumElements(); bool TouchesSrc1 = false; bool TouchesSrc2 = false; diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 04d9309..d6f23b6 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -602,6 +602,8 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, Depth + 1); computeKnownBitsImpl(MI.getOperand(3).getReg(), WidthKnown, DemandedElts, Depth + 1); + OffsetKnown = OffsetKnown.sext(BitWidth); + WidthKnown = WidthKnown.sext(BitWidth); Known = extractBits(BitWidth, SrcOpKnown, OffsetKnown, WidthKnown); // Sign extend the extracted value using shift left and arithmetic shift // right. diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index b49040b..1fc90d0 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3359,6 +3359,54 @@ bool IRTranslator::translateShuffleVector(const User &U, Mask = SVI->getShuffleMask(); else Mask = cast<ConstantExpr>(U).getShuffleMask(); + + // As GISel does not represent <1 x > vectors as a separate type from scalars, + // we transform shuffle_vector with a scalar output to an + // ExtractVectorElement. If the input type is also scalar it becomes a Copy. 
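[Editor's note] A minimal worked example of the translation described in the comment above; the IR and generic MIR below are illustrative, not taken from this patch. A shuffle whose result is a single element is translated straight to an extract (or a copy), since GISel has no <1 x ty> type:

    ; <1 x i32> result; mask element 5 selects lane 1 of the second operand
    %r = shufflevector <4 x i32> %a, <4 x i32> %b, <1 x i32> <i32 5>
    ; becomes (roughly)
    %r:_(s32) = G_EXTRACT_VECTOR_ELT %b:_(<4 x s32>), %idx:_(s64)  ; idx = 5 - 4 = 1
    ; a fully scalar case (<1 x ty> sources) becomes a plain copy of the selected
    ; operand, and an undef/out-of-range mask element becomes G_IMPLICIT_DEF.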
+ unsigned DstElts = cast<FixedVectorType>(U.getType())->getNumElements(); + unsigned SrcElts = + cast<FixedVectorType>(U.getOperand(0)->getType())->getNumElements(); + if (DstElts == 1) { + unsigned M = Mask[0]; + if (SrcElts == 1) { + if (M == 0 || M == 1) + return translateCopy(U, *U.getOperand(M), MIRBuilder); + MIRBuilder.buildUndef(getOrCreateVReg(U)); + } else { + Register Dst = getOrCreateVReg(U); + if (M < SrcElts) { + MIRBuilder.buildExtractVectorElementConstant( + Dst, getOrCreateVReg(*U.getOperand(0)), M); + } else if (M < SrcElts * 2) { + MIRBuilder.buildExtractVectorElementConstant( + Dst, getOrCreateVReg(*U.getOperand(1)), M - SrcElts); + } else { + MIRBuilder.buildUndef(Dst); + } + } + return true; + } + + // A single element src is transformed to a build_vector. + if (SrcElts == 1) { + SmallVector<Register> Ops; + Register Undef; + for (int M : Mask) { + LLT SrcTy = getLLTForType(*U.getOperand(0)->getType(), *DL); + if (M == 0 || M == 1) { + Ops.push_back(getOrCreateVReg(*U.getOperand(M))); + } else { + if (!Undef.isValid()) { + Undef = MRI->createGenericVirtualRegister(SrcTy); + MIRBuilder.buildUndef(Undef); + } + Ops.push_back(Undef); + } + } + MIRBuilder.buildBuildVector(getOrCreateVReg(U), Ops); + return true; + } + ArrayRef<int> MaskAlloc = MF->allocateShuffleMask(Mask); MIRBuilder .buildInstr(TargetOpcode::G_SHUFFLE_VECTOR, {getOrCreateVReg(U)}, diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 38ec83f..178529f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4748,6 +4748,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { case G_FMINIMUMNUM: case G_FMAXIMUMNUM: return lowerFMinNumMaxNum(MI); + case G_FMINIMUM: + case G_FMAXIMUM: + return lowerFMinimumMaximum(MI); case G_MERGE_VALUES: return lowerMergeValues(MI); case G_UNMERGE_VALUES: @@ -5819,6 +5822,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle( } else if (InputUsed[0] == -1U) { // No input vectors were used! The result is undefined. Output = MIRBuilder.buildUndef(NarrowTy).getReg(0); + } else if (NewElts == 1) { + Output = MIRBuilder.buildCopy(NarrowTy, Inputs[InputUsed[0]]).getReg(0); } else { Register Op0 = Inputs[InputUsed[0]]; // If only one input was used, use an undefined vector for the other. @@ -8775,6 +8780,77 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) { return Legalized; } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + auto [Dst, Src0, Src1] = MI.getFirst3Regs(); + LLT Ty = MRI.getType(Dst); + LLT CmpTy = Ty.changeElementSize(1); + + bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM); + unsigned OpcIeee = + IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE; + unsigned OpcNonIeee = + IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM; + bool MinMaxMustRespectOrderedZero = false; + Register Res; + + // IEEE variants don't need canonicalization + if (LI.isLegalOrCustom({OpcIeee, Ty})) { + Res = MIRBuilder.buildInstr(OpcIeee, {Ty}, {Src0, Src1}).getReg(0); + MinMaxMustRespectOrderedZero = true; + } else if (LI.isLegalOrCustom({OpcNonIeee, Ty})) { + Res = MIRBuilder.buildInstr(OpcNonIeee, {Ty}, {Src0, Src1}).getReg(0); + } else { + auto Compare = MIRBuilder.buildFCmp( + IsMax ? 
CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, CmpTy, Src0, Src1); + Res = MIRBuilder.buildSelect(Ty, Compare, Src0, Src1).getReg(0); + } + + // Propagate any NaN of both operands + if (!MI.getFlag(MachineInstr::FmNoNans) && + (!isKnownNeverNaN(Src0, MRI) || isKnownNeverNaN(Src1, MRI))) { + auto IsOrdered = MIRBuilder.buildFCmp(CmpInst::FCMP_ORD, CmpTy, Src0, Src1); + + LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType(); + APFloat NaNValue = APFloat::getNaN(getFltSemanticForLLT(ElementTy)); + Register NaN = MIRBuilder.buildFConstant(ElementTy, NaNValue).getReg(0); + if (Ty.isVector()) + NaN = MIRBuilder.buildSplatBuildVector(Ty, NaN).getReg(0); + + Res = MIRBuilder.buildSelect(Ty, IsOrdered, Res, NaN).getReg(0); + } + + // fminimum/fmaximum requires -0.0 less than +0.0 + if (!MinMaxMustRespectOrderedZero && !MI.getFlag(MachineInstr::FmNsz)) { + GISelValueTracking VT(MIRBuilder.getMF()); + KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero); + KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero); + + if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) { + const unsigned Flags = MI.getFlags(); + Register Zero = MIRBuilder.buildFConstant(Ty, 0.0).getReg(0); + auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_OEQ, CmpTy, Res, Zero); + + unsigned TestClass = IsMax ? fcPosZero : fcNegZero; + + auto LHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src0, TestClass); + auto LHSSelect = + MIRBuilder.buildSelect(Ty, LHSTestZero, Src0, Res, Flags); + + auto RHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src1, TestClass); + auto RHSSelect = + MIRBuilder.buildSelect(Ty, RHSTestZero, Src1, LHSSelect, Flags); + + Res = MIRBuilder.buildSelect(Ty, IsZero, RHSSelect, Res, Flags).getReg(0); + } + } + + MIRBuilder.buildCopy(Dst, Res); + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) { // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c Register DstReg = MI.getOperand(0).getReg(); @@ -9016,22 +9092,18 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { continue; } - if (Src0Ty.isScalar()) { - BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg); - } else { - int NumElts = Src0Ty.getNumElements(); - Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; - int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts; - auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); - auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); - BuildVec.push_back(Extract.getReg(0)); - } + assert(!Src0Ty.isScalar() && "Unexpected scalar G_SHUFFLE_VECTOR"); + + int NumElts = Src0Ty.getNumElements(); + Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; + int ExtractIdx = Idx < NumElts ? 
Idx : Idx - NumElts; + auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); + auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); + BuildVec.push_back(Extract.getReg(0)); } - if (DstTy.isVector()) - MIRBuilder.buildBuildVector(DstReg, BuildVec); - else - MIRBuilder.buildCopy(DstReg, BuildVec[0]); + assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR"); + MIRBuilder.buildBuildVector(DstReg, BuildVec); MI.eraseFromParent(); return Legalized; } diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 27df7e3..4b4df98 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -800,10 +800,11 @@ MachineInstrBuilder MachineIRBuilder::buildShuffleVector(const DstOp &Res, LLT DstTy = Res.getLLTTy(*getMRI()); LLT Src1Ty = Src1.getLLTTy(*getMRI()); LLT Src2Ty = Src2.getLLTTy(*getMRI()); - const LLT DstElemTy = DstTy.isVector() ? DstTy.getElementType() : DstTy; - const LLT ElemTy1 = Src1Ty.isVector() ? Src1Ty.getElementType() : Src1Ty; - const LLT ElemTy2 = Src2Ty.isVector() ? Src2Ty.getElementType() : Src2Ty; + const LLT DstElemTy = DstTy.getScalarType(); + const LLT ElemTy1 = Src1Ty.getScalarType(); + const LLT ElemTy2 = Src2Ty.getScalarType(); assert(DstElemTy == ElemTy1 && DstElemTy == ElemTy2); + assert(Mask.size() > 1 && "Scalar G_SHUFFLE_VECTOR are not supported"); (void)DstElemTy; (void)ElemTy1; (void)ElemTy2; diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 6a464d9..4795d81 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -2788,6 +2788,9 @@ bool MIParser::parseShuffleMaskOperand(MachineOperand &Dest) { if (expectAndConsume(MIToken::rparen)) return error("shufflemask should be terminated by ')'."); + if (ShufMask.size() < 2) + return error("shufflemask should have > 1 element"); + ArrayRef<int> MaskAlloc = MF.allocateShuffleMask(ShufMask); Dest = MachineOperand::CreateShuffleMask(MaskAlloc); return false; diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 1154855..c0710c4 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1924,13 +1924,23 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { if (Src0Ty != Src1Ty) report("Source operands must be the same type", MI); - if (Src0Ty.getScalarType() != DstTy.getScalarType()) + if (Src0Ty.getScalarType() != DstTy.getScalarType()) { report("G_SHUFFLE_VECTOR cannot change element type", MI); + break; + } + if (!Src0Ty.isVector()) { + report("G_SHUFFLE_VECTOR must have vector src", MI); + break; + } + if (!DstTy.isVector()) { + report("G_SHUFFLE_VECTOR must have vector dst", MI); + break; + } // Don't check that all operands are vector because scalars are used in // place of 1 element vectors. - int SrcNumElts = Src0Ty.isVector() ? Src0Ty.getNumElements() : 1; - int DstNumElts = DstTy.isVector() ? 
DstTy.getNumElements() : 1; + int SrcNumElts = Src0Ty.getNumElements(); + int DstNumElts = DstTy.getNumElements(); ArrayRef<int> MaskIdxes = MaskOp.getShuffleMask(); diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 72b364c..697b779 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -211,7 +211,7 @@ private: unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); } }; - using LiveRegMap = SparseSet<LiveReg, unsigned, identity_cxx20, uint16_t>; + using LiveRegMap = SparseSet<LiveReg, unsigned, identity, uint16_t>; /// This map contains entries for each virtual register that is currently /// available in a physical register. LiveRegMap LiveVirtRegs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d2ea652..8676060 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19993,8 +19993,12 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad, // nor a successor of N. Otherwise, if Op is folded that would // create a cycle. unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); - for (SDNode *Op : Ptr->users()) { + for (SDUse &U : Ptr->uses()) { + if (U.getResNo() != Ptr.getResNo()) + continue; + // Check for #1. + SDNode *Op = U.getUser(); if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI)) continue; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index bfa566a..dee0909 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1162,6 +1162,43 @@ SDValue SelectionDAGBuilder::getMemoryRoot() { return updateRoot(PendingLoads); } +SDValue SelectionDAGBuilder::getFPOperationRoot(fp::ExceptionBehavior EB) { + // If the new exception behavior differs from that of the pending + // ones, chain up them and update the root. + switch (EB) { + case fp::ExceptionBehavior::ebMayTrap: + case fp::ExceptionBehavior::ebIgnore: + // Floating-point exceptions produced by such operations are not intended + // to be observed, so the sequence of these operations does not need to be + // preserved. + // + // They however must not be mixed with the instructions that have strict + // exception behavior. Placing an operation with 'ebIgnore' behavior between + // 'ebStrict' operations could distort the observed exception behavior. + if (!PendingConstrainedFPStrict.empty()) { + assert(PendingConstrainedFP.empty()); + updateRoot(PendingConstrainedFPStrict); + } + break; + case fp::ExceptionBehavior::ebStrict: + // Floating-point exception produced by these operations may be observed, so + // they must be correctly chained. If trapping on FP exceptions is + // disabled, the exceptions can be observed only by functions that read + // exception flags, like 'llvm.get_fpenv' or 'fetestexcept'. It means that + // the order of operations is not significant between barriers. + // + // If trapping is enabled, each operation becomes an implicit observation + // point, so the operations must be sequenced according their original + // source order. + if (!PendingConstrainedFP.empty()) { + assert(PendingConstrainedFPStrict.empty()); + updateRoot(PendingConstrainedFP); + } + // TODO: Add support for trapping-enabled scenarios. 
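+  // [Editor's illustrative note, not part of the patch] For example, given
+  //   %a = call double @llvm.experimental.constrained.fmul(double %x, double %y,
+  //            metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  //   %b = call double @llvm.experimental.constrained.fadd(double %x, double %y,
+  //            metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  //   %c = call double @llvm.experimental.constrained.fadd(double %a, double %b,
+  //            metadata !"round.dynamic", metadata !"fpexcept.strict")
+  // the out-chains of %a and %b accumulate in PendingConstrainedFP and are only
+  // joined into the root (via updateRoot/TokenFactor) when the ebStrict
+  // operation for %c requests the root here, so %c is ordered after both while
+  // %a and %b remain unordered relative to each other.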
+ } + return DAG.getRoot(); +} + SDValue SelectionDAGBuilder::getRoot() { // Chain up all pending constrained intrinsics together with all // pending loads, by simply appending them to PendingLoads and @@ -8298,6 +8335,30 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } } +void SelectionDAGBuilder::pushFPOpOutChain(SDValue Result, + fp::ExceptionBehavior EB) { + assert(Result.getNode()->getNumValues() == 2); + SDValue OutChain = Result.getValue(1); + assert(OutChain.getValueType() == MVT::Other); + + // Instead of updating the root immediately, push the produced chain to the + // appropriate list, deferring the update until the root is requested. In this + // case, the nodes from the lists are chained using TokenFactor, indicating + // that the operations are independent. + // + // In particular, the root is updated before any call that might access the + // floating-point environment, except for constrained intrinsics. + switch (EB) { + case fp::ExceptionBehavior::ebMayTrap: + case fp::ExceptionBehavior::ebIgnore: + PendingConstrainedFP.push_back(OutChain); + break; + case fp::ExceptionBehavior::ebStrict: + PendingConstrainedFPStrict.push_back(OutChain); + break; + } +} + void SelectionDAGBuilder::visitConstrainedFPIntrinsic( const ConstrainedFPIntrinsic &FPI) { SDLoc sdl = getCurSDLoc(); @@ -8305,42 +8366,16 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( // We do not need to serialize constrained FP intrinsics against // each other or against (nonvolatile) loads, so they can be // chained like loads. - SDValue Chain = DAG.getRoot(); + fp::ExceptionBehavior EB = *FPI.getExceptionBehavior(); + SDValue Chain = getFPOperationRoot(EB); SmallVector<SDValue, 4> Opers; Opers.push_back(Chain); for (unsigned I = 0, E = FPI.getNonMetadataArgCount(); I != E; ++I) Opers.push_back(getValue(FPI.getArgOperand(I))); - auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) { - assert(Result.getNode()->getNumValues() == 2); - - // Push node to the appropriate list so that future instructions can be - // chained up correctly. - SDValue OutChain = Result.getValue(1); - switch (EB) { - case fp::ExceptionBehavior::ebIgnore: - // The only reason why ebIgnore nodes still need to be chained is that - // they might depend on the current rounding mode, and therefore must - // not be moved across instruction that may change that mode. - [[fallthrough]]; - case fp::ExceptionBehavior::ebMayTrap: - // These must not be moved across calls or instructions that may change - // floating-point exception masks. - PendingConstrainedFP.push_back(OutChain); - break; - case fp::ExceptionBehavior::ebStrict: - // These must not be moved across calls or instructions that may change - // floating-point exception masks or read floating-point exception flags. - // In addition, they cannot be optimized out even if unused. 
- PendingConstrainedFPStrict.push_back(OutChain); - break; - } - }; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = TLI.getValueType(DAG.getDataLayout(), FPI.getType()); SDVTList VTs = DAG.getVTList(VT, MVT::Other); - fp::ExceptionBehavior EB = *FPI.getExceptionBehavior(); SDNodeFlags Flags; if (EB == fp::ExceptionBehavior::ebIgnore) @@ -8364,7 +8399,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { Opers.pop_back(); SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers, Flags); - pushOutChain(Mul, EB); + pushFPOpOutChain(Mul, EB); Opcode = ISD::STRICT_FADD; Opers.clear(); Opers.push_back(Mul.getValue(1)); @@ -8395,7 +8430,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( } SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers, Flags); - pushOutChain(Result, EB); + pushFPOpOutChain(Result, EB); SDValue FPResult = Result.getValue(0); setValue(&FPI, FPResult); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c7577fa..47e19f7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -195,6 +195,11 @@ private: /// Update root to include all chains from the Pending list. SDValue updateRoot(SmallVectorImpl<SDValue> &Pending); + /// Given a node representing a floating-point operation and its specified + /// exception behavior, this either updates the root or stores the node in + /// a list to be added to chains latter. + void pushFPOpOutChain(SDValue Result, fp::ExceptionBehavior EB); + /// A unique monotonically increasing number used to order the SDNodes we /// create. unsigned SDNodeOrder; @@ -300,6 +305,13 @@ public: /// memory node that may need to be ordered after any prior load instructions. SDValue getMemoryRoot(); + /// Return the current virtual root of the Selection DAG, flushing + /// PendingConstrainedFP or PendingConstrainedFPStrict items if the new + /// exception behavior (specified by \p EB) differs from that of the pending + /// instructions. This must be done before emitting constrained FP operation + /// call. + SDValue getFPOperationRoot(fp::ExceptionBehavior EB); + /// Similar to getMemoryRoot, but also flushes PendingConstrainedFP(Strict) /// items. This must be done before emitting any call other any other node /// that may need to be ordered after FP instructions due to other side diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 060b1dd..59798b3 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2097,6 +2097,11 @@ Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const { } Function *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const { + // MSVC CRT has a function to validate security cookie. + RTLIB::LibcallImpl SecurityCheckCookieLibcall = + getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); + if (SecurityCheckCookieLibcall != RTLIB::Unsupported) + return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall)); return nullptr; } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 488b078..1096e57 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -4082,10 +4082,10 @@ void AssemblyWriter::printTypeIdentities() { /// printFunction - Print all aspects of a function. 
void AssemblyWriter::printFunction(const Function *F) { - if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); - if (F->isMaterializable()) Out << "; Materializable\n"; + else if (AnnotationWriter) + AnnotationWriter->emitFunctionAnnot(F, Out); const AttributeList &Attrs = F->getAttributes(); if (Attrs.hasFnAttrs()) { diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 30b5e48..e19336e 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -403,9 +403,14 @@ void Module::setModuleFlag(ModFlagBehavior Behavior, StringRef Key, Metadata *Val) { NamedMDNode *ModFlags = getOrInsertModuleFlagsMetadata(); // Replace the flag if it already exists. - for (MDNode *Flag : ModFlags->operands()) { + for (unsigned i = 0; i < ModFlags->getNumOperands(); ++i) { + MDNode *Flag = ModFlags->getOperand(i); if (cast<MDString>(Flag->getOperand(1))->getString() == Key) { - Flag->replaceOperandWith(2, Val); + Type *Int32Ty = Type::getInt32Ty(Context); + Metadata *Ops[3] = { + ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Behavior)), + MDString::get(Context, Key), Val}; + ModFlags->setOperand(i, MDNode::get(Context, Ops)); return; } } diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 9d0fa11..4bc2a18 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -471,16 +471,14 @@ static void thinLTOInternalizeAndPromoteGUID( ValueInfo VI, function_ref<bool(StringRef, ValueInfo)> isExported, function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> isPrevailing) { - auto ExternallyVisibleCopies = - llvm::count_if(VI.getSummaryList(), - [](const std::unique_ptr<GlobalValueSummary> &Summary) { - return !GlobalValue::isLocalLinkage(Summary->linkage()); - }); - // Before performing index-based internalization and promotion for this GUID, // the local flag should be consistent with the summary list linkage types. VI.verifyLocal(); + const bool SingleExternallyVisibleCopy = + VI.getSummaryList().size() == 1 && + !GlobalValue::isLocalLinkage(VI.getSummaryList().front()->linkage()); + for (auto &S : VI.getSummaryList()) { // First see if we need to promote an internal value because it is not // exported. @@ -543,7 +541,9 @@ static void thinLTOInternalizeAndPromoteGUID( GlobalValue::isExternalWeakLinkage(S->linkage())) continue; - if (isPrevailing(VI.getGUID(), S.get()) && ExternallyVisibleCopies == 1) + // We may have a single summary copy that is externally visible but not + // prevailing if the prevailing copy is in a native object. 
+ if (SingleExternallyVisibleCopy && isPrevailing(VI.getGUID(), S.get())) S->setLinkage(GlobalValue::InternalLinkage); } } @@ -1086,15 +1086,15 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); if (R.Prevailing) - ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); + ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()); } } if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(), [&](GlobalValue::GUID GUID) { - return ThinLTO.PrevailingModuleForGUID[GUID] == - BM.getModuleIdentifier(); + return ThinLTO.isPrevailingModuleForGUID( + GUID, BM.getModuleIdentifier()); })) return Err; LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); @@ -1108,8 +1108,8 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); if (R.Prevailing) { - assert(ThinLTO.PrevailingModuleForGUID[GUID] == - BM.getModuleIdentifier()); + assert( + ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier())); // For linker redefined symbols (via --wrap or --defsym) we want to // switch the linkage to `weak` to prevent IPOs from happening. @@ -1988,7 +1988,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, LocalWPDTargetsMap); auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { - return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); + return ThinLTO.isPrevailingModuleForGUID(GUID, S->modulePath()); }; if (EnableMemProfContextDisambiguation) { MemProfContextDisambiguation ContextDisambiguation; diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index 1e1d0a6..70c4577 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -73,9 +73,10 @@ add_llvm_component_library(LLVMMC ${LLVM_MAIN_INCLUDE_DIR}/llvm/MC LINK_COMPONENTS + BinaryFormat + DebugInfoDWARFLowLevel Support TargetParser - BinaryFormat DEPENDS intrinsics_gen diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp index d6fa54c..e0a90df 100644 --- a/llvm/lib/MC/MCSFrame.cpp +++ b/llvm/lib/MC/MCSFrame.cpp @@ -8,6 +8,8 @@ #include "llvm/MC/MCSFrame.h" #include "llvm/BinaryFormat/SFrame.h" +#include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h" +#include "llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectFileInfo.h" @@ -211,8 +213,152 @@ class SFrameEmitterImpl { return true; } + // Technically, the escape data could be anything, but it is commonly a dwarf + // CFI program. Even then, it could contain an arbitrarily complicated Dwarf + // expression. Following gnu-gas, look for certain common cases that could + // invalidate an FDE, emit a warning for those sequences, and don't generate + // an FDE in those cases. Allow any that are known safe. It is likely that + // more thorough test cases could refine this code, but it handles the most + // important ones compatibly with gas. + // Returns true if the CFI escape sequence is safe for sframes. + bool isCFIEscapeSafe(SFrameFDE &FDE, const SFrameFRE &FRE, + const MCCFIInstruction &CFI) { + const MCAsmInfo *AI = Streamer.getContext().getAsmInfo(); + DWARFDataExtractorSimple data(CFI.getValues(), AI->isLittleEndian(), + AI->getCodePointerSize()); + + // Normally, both alignment factors are extracted from the enclosing Dwarf + // FDE or CIE. 
We don't have one here. Alignments are used for scaling + // factors for ops like CFA_def_cfa_offset_sf. But this particular function + // is only interested in registers. + dwarf::CFIProgram P(/*CodeAlignmentFactor=*/1, + /*DataAlignmentFactor=*/1, + Streamer.getContext().getTargetTriple().getArch()); + uint64_t Offset = 0; + if (P.parse(data, &Offset, CFI.getValues().size())) { + // Not a parsable dwarf expression. Assume the worst. + Streamer.getContext().reportWarning( + CFI.getLoc(), + "skipping SFrame FDE; .cfi_escape with unknown effects"); + return false; + } + + // This loop deals with dwarf::CFIProgram::Instructions. Everywhere else + // this file deals with MCCFIInstructions. + for (const dwarf::CFIProgram::Instruction &I : P) { + switch (I.Opcode) { + case dwarf::DW_CFA_nop: + break; + case dwarf::DW_CFA_val_offset: { + // First argument is a register. Anything that touches CFA, FP, or RA is + // a problem, but allow others through. As an even more special case, + // allow SP + 0. + auto Reg = I.getOperandAsUnsigned(P, 0); + // The parser should have failed in this case. + assert(Reg && "DW_CFA_val_offset with no register."); + bool SPOk = true; + if (*Reg == SPReg) { + auto Opnd = I.getOperandAsSigned(P, 1); + if (!Opnd || *Opnd != 0) + SPOk = false; + } + if (!SPOk || *Reg == RAReg || *Reg == FPReg) { + StringRef RN = *Reg == SPReg + ? "SP reg " + : (*Reg == FPReg ? "FP reg " : "RA reg "); + Streamer.getContext().reportWarning( + CFI.getLoc(), + Twine( + "skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with ") + + RN + Twine(*Reg)); + return false; + } + } break; + case dwarf::DW_CFA_expression: { + // First argument is a register. Anything that touches CFA, FP, or RA is + // a problem, but allow others through. + auto Reg = I.getOperandAsUnsigned(P, 0); + if (!Reg) { + Streamer.getContext().reportWarning( + CFI.getLoc(), + "skipping SFrame FDE; .cfi_escape with unknown effects"); + return false; + } + if (*Reg == SPReg || *Reg == RAReg || *Reg == FPReg) { + StringRef RN = *Reg == SPReg + ? "SP reg " + : (*Reg == FPReg ? "FP reg " : "RA reg "); + Streamer.getContext().reportWarning( + CFI.getLoc(), + Twine( + "skipping SFrame FDE; .cfi_escape DW_CFA_expression with ") + + RN + Twine(*Reg)); + return false; + } + } break; + case dwarf::DW_CFA_GNU_args_size: { + auto Size = I.getOperandAsSigned(P, 0); + // Zero size doesn't affect the cfa. + if (Size && *Size == 0) + break; + if (FRE.Info.getBaseRegister() != BaseReg::FP) { + Streamer.getContext().reportWarning( + CFI.getLoc(), + Twine("skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size " + "with non frame-pointer CFA")); + return false; + } + } break; + // Cases that gas doesn't specially handle. TODO: Some of these could be + // analyzed and handled instead of just punting. But these are uncommon, + // or should be written as normal cfi directives. Some will need fixes to + // the scaling factor. 
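+  // [Editor's illustrative note, not part of the patch] A concrete example of
+  // the DW_CFA_val_offset handling above, assuming x86-64 DWARF numbering
+  // (7 = SP, 6 = FP, 16 = return-address column):
+  //   .cfi_escape 0x14, 0x07, 0x00  // DW_CFA_val_offset SP, 0 -> allowed (SP + 0)
+  //   .cfi_escape 0x14, 0x06, 0x02  // DW_CFA_val_offset FP, 2 -> FDE skipped, warning
+  //                                 //   "... .cfi_escape DW_CFA_val_offset with FP reg 6"
+  // Registers other than SP/FP/RA pass through without invalidating the FDE.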
+ case dwarf::DW_CFA_advance_loc: + case dwarf::DW_CFA_offset: + case dwarf::DW_CFA_restore: + case dwarf::DW_CFA_set_loc: + case dwarf::DW_CFA_advance_loc1: + case dwarf::DW_CFA_advance_loc2: + case dwarf::DW_CFA_advance_loc4: + case dwarf::DW_CFA_offset_extended: + case dwarf::DW_CFA_restore_extended: + case dwarf::DW_CFA_undefined: + case dwarf::DW_CFA_same_value: + case dwarf::DW_CFA_register: + case dwarf::DW_CFA_remember_state: + case dwarf::DW_CFA_restore_state: + case dwarf::DW_CFA_def_cfa: + case dwarf::DW_CFA_def_cfa_register: + case dwarf::DW_CFA_def_cfa_offset: + case dwarf::DW_CFA_def_cfa_expression: + case dwarf::DW_CFA_offset_extended_sf: + case dwarf::DW_CFA_def_cfa_sf: + case dwarf::DW_CFA_def_cfa_offset_sf: + case dwarf::DW_CFA_val_offset_sf: + case dwarf::DW_CFA_val_expression: + case dwarf::DW_CFA_MIPS_advance_loc8: + case dwarf::DW_CFA_AARCH64_negate_ra_state_with_pc: + case dwarf::DW_CFA_AARCH64_negate_ra_state: + case dwarf::DW_CFA_LLVM_def_aspace_cfa: + case dwarf::DW_CFA_LLVM_def_aspace_cfa_sf: + Streamer.getContext().reportWarning( + CFI.getLoc(), "skipping SFrame FDE; .cfi_escape " + "CFA expression with unknown side effects"); + return false; + default: + // Dwarf expression was only partially valid, and user could have + // written anything. + Streamer.getContext().reportWarning( + CFI.getLoc(), + "skipping SFrame FDE; .cfi_escape with unknown effects"); + return false; + } + } + return true; + } + // Add the effects of CFI to the current FDE, creating a new FRE when - // necessary. + // necessary. Return true if the CFI is representable in the sframe format. bool handleCFI(SFrameFDE &FDE, SFrameFRE &FRE, const MCCFIInstruction &CFI) { switch (CFI.getOperation()) { case MCCFIInstruction::OpDefCfaRegister: @@ -265,10 +411,11 @@ class SFrameEmitterImpl { FRE = FDE.SaveState.pop_back_val(); return true; case MCCFIInstruction::OpEscape: - // TODO: Implement. Will use FDE. - return true; + // This is a string of bytes that contains an arbitrary dwarf-expression + // that may or may not affect unwind info. + return isCFIEscapeSafe(FDE, FRE, CFI); default: - // Instructions that don't affect the CFA, RA, and SP can be safely + // Instructions that don't affect the CFA, RA, and FP can be safely // ignored. return true; } diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 67483ba..9d45096 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -240,7 +240,8 @@ private: getGroupEntry(StringRef GroupName, StringRef GroupDescription) { std::pair<TimerGroup *, Name2TimerMap> &GroupEntry = Map[GroupName]; if (!GroupEntry.first) - GroupEntry.first = new TimerGroup(GroupName, GroupDescription); + GroupEntry.first = + new TimerGroup(GroupName, GroupDescription, /*PrintOnExit=*/true); return GroupEntry; } @@ -270,9 +271,10 @@ TimerGroup &NamedRegionTimer::getNamedTimerGroup(StringRef GroupName, static TimerGroup *TimerGroupList = nullptr; TimerGroup::TimerGroup(StringRef Name, StringRef Description, - sys::SmartMutex<true> &lock) + sys::SmartMutex<true> &lock, bool PrintOnExit) : Name(Name.begin(), Name.end()), - Description(Description.begin(), Description.end()) { + Description(Description.begin(), Description.end()), + PrintOnExit(PrintOnExit) { // Add the group to TimerGroupList. 
sys::SmartScopedLock<true> L(lock); if (TimerGroupList) @@ -282,12 +284,12 @@ TimerGroup::TimerGroup(StringRef Name, StringRef Description, TimerGroupList = this; } -TimerGroup::TimerGroup(StringRef Name, StringRef Description) - : TimerGroup(Name, Description, timerLock()) {} +TimerGroup::TimerGroup(StringRef Name, StringRef Description, bool PrintOnExit) + : TimerGroup(Name, Description, timerLock(), PrintOnExit) {} TimerGroup::TimerGroup(StringRef Name, StringRef Description, - const StringMap<TimeRecord> &Records) - : TimerGroup(Name, Description) { + const StringMap<TimeRecord> &Records, bool PrintOnExit) + : TimerGroup(Name, Description, PrintOnExit) { TimersToPrint.reserve(Records.size()); for (const auto &P : Records) TimersToPrint.emplace_back(P.getValue(), std::string(P.getKey()), @@ -301,7 +303,7 @@ TimerGroup::~TimerGroup() { while (FirstTimer) removeTimer(*FirstTimer); - if (!TimersToPrint.empty()) { + if (!TimersToPrint.empty() && PrintOnExit) { std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile(); PrintQueuedTimers(*OutStream); } @@ -530,7 +532,7 @@ public: sys::SmartMutex<true> TimerLock; TimerGroup DefaultTimerGroup{"misc", "Miscellaneous Ungrouped Timers", - TimerLock}; + TimerLock, /*PrintOnExit=*/true}; SignpostEmitter Signposts; // Order of these members and initialization below is important. For example diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 86f9548..a4529a5 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -73,9 +73,16 @@ def SVEUnsupported : AArch64Unsupported { SVE2Unsupported.F); } -let F = [HasSME2p2, HasSVE2p2_or_SME2p2, HasNonStreamingSVE_or_SME2p2, - HasNonStreamingSVE2p2_or_SME2p2] in -def SME2p2Unsupported : AArch64Unsupported; +def SME2p3Unsupported : AArch64Unsupported { + let F = [HasSVE2p3_or_SME2p3, HasSVE_B16MM]; +} + +def SME2p2Unsupported : AArch64Unsupported { + let F = !listconcat([HasSME2p2, HasSVE2p2_or_SME2p2, + HasNonStreamingSVE_or_SME2p2, + HasNonStreamingSVE2p2_or_SME2p2], + SME2p3Unsupported.F); +} def SME2p1Unsupported : AArch64Unsupported { let F = !listconcat([HasSME2p1, HasSVE2p1_or_SME2p1, diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index ecaeff7..b3ec65c 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -71,7 +71,6 @@ def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, icmp_redundant_trunc, fold_global_offset, - shuffle_to_extract, ext_addv_to_udot_addv, ext_uaddv_to_uaddlv, push_sub_through_zext, diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 46f5f0c..0e94b78 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -585,6 +585,47 @@ def FeatureSME_TMOP: ExtensionWithMArch<"sme-tmop", "SME_TMOP", "FEAT_SME_TMOP", def FeatureSSVE_FEXPA : ExtensionWithMArch<"ssve-fexpa", "SSVE_FEXPA", "FEAT_SSVE_FEXPA", "Enable SVE FEXPA instruction in Streaming SVE mode", [FeatureSME2]>; +//===----------------------------------------------------------------------===// +// Armv9.7 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureCMH : ExtensionWithMArch<"cmh", "CMH", "FEAT_CMH", + "Enable Armv9.7-A Contention Management Hints">; + +def FeatureLSCP : ExtensionWithMArch<"lscp", "LSCP", "FEAT_LSCP", + "Enable Armv9.7-A 
Load-acquire and store-release pair extension">; + +def FeatureTLBID: ExtensionWithMArch<"tlbid", "TLBID", "FEAT_TLBID", + "Enable Armv9.7-A TLBI Domains extension">; + +def FeatureMPAMv2: ExtensionWithMArch<"mpamv2", "MPAMv2", "FEAT_MPAMv2", + "Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions">; + +def FeatureMTETC: ExtensionWithMArch<"mtetc", "MTETC", "FEAT_MTETC", + "Enable Virtual Memory Tagging Extension">; + +def FeatureGCIE: ExtensionWithMArch<"gcie", "GCIE", "FEAT_GCIE", + "Enable GICv5 (Generic Interrupt Controller) CPU Interface Extension">; + +def FeatureSVE2p3 : ExtensionWithMArch<"sve2p3", "SVE2p3", "FEAT_SVE2p3", + "Enable Armv9.7-A Scalable Vector Extension 2.3 instructions", [FeatureSVE2p2]>; + +def FeatureSME2p3 : ExtensionWithMArch<"sme2p3", "SME2p3", "FEAT_SME2p3", + "Enable Armv9.7-A Scalable Matrix Extension 2.3 instructions", [FeatureSME2p2]>; + +def FeatureSVE_B16MM : ExtensionWithMArch<"sve-b16mm", "SVE_B16MM", "FEAT_SVE_B16MM", + "Enable Armv9.7-A SVE non-widening BFloat16 matrix multiply-accumulate", [FeatureSVE]>; + +def FeatureF16MM : ExtensionWithMArch<"f16mm", "F16MM", "FEAT_F16MM", + "Enable Armv9.7-A non-widening half-precision matrix multiply-accumulate", [FeatureFullFP16]>; + +def FeatureF16F32DOT : ExtensionWithMArch<"f16f32dot", "F16F32DOT", "FEAT_F16F32DOT", + "Enable Armv9.7-A Advanced SIMD half-precision dot product accumulate to single-precision", [FeatureNEON, FeatureFullFP16]>; + +def FeatureF16F32MM : ExtensionWithMArch<"f16f32mm", "F16F32MM", "FEAT_F16F32MM", + "Enable Armv9.7-A Advanced SIMD half-precision matrix multiply-accumulate to single-precision", [FeatureNEON, FeatureFullFP16]>; + +//===----------------------------------------------------------------------===// // Other Features //===----------------------------------------------------------------------===// @@ -939,9 +980,12 @@ def HasV9_5aOps : Architecture64<9, 5, "a", "v9.5a", [HasV9_4aOps, FeatureCPA], !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA, FeatureLUT, FeatureFAMINMAX])>; def HasV9_6aOps : Architecture64<9, 6, "a", "v9.6a", - [HasV9_5aOps, FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2, FeatureLSUI, FeatureOCCMO], - !listconcat(HasV9_5aOps.DefaultExts, [FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2, + [HasV9_5aOps, FeatureCMPBR, FeatureLSUI, FeatureOCCMO], + !listconcat(HasV9_5aOps.DefaultExts, [FeatureCMPBR, FeatureLSUI, FeatureOCCMO])>; +def HasV9_7aOps : Architecture64<9, 7, "a", "v9.7a", + [HasV9_6aOps, FeatureSVE2p3, FeatureFPRCVT], + !listconcat(HasV9_6aOps.DefaultExts, [FeatureSVE2p3, FeatureFPRCVT])>; def HasV8_0rOps : Architecture64<8, 0, "r", "v8r", [ //v8.1 FeatureCRC, FeaturePAN, FeatureLSE, FeatureCONTEXTIDREL2, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a81de5c..d16b116 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9002,12 +9002,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, } static SMECallAttrs -getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, +getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI, const TargetLowering::CallLoweringInfo &CLI) { if (CLI.CB) - return SMECallAttrs(*CLI.CB, &TLI); + return SMECallAttrs(*CLI.CB, &RTLCI); if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) - return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI)); + return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), 
RTLCI)); return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal)); } @@ -9029,7 +9029,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. - SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI); + SMECallAttrs CallAttrs = + getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState() || CallAttrs.caller().hasStreamingBody()) @@ -9454,7 +9455,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } // Determine whether we need any streaming mode changes. - SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); + SMECallAttrs CallAttrs = + getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI); std::optional<unsigned> ZAMarkerNode; bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); @@ -19476,6 +19478,61 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { Op1 ? Op1 : Mul->getOperand(1)); } +// Multiplying an RDSVL value by a constant can sometimes be done cheaper by +// folding a power-of-two factor of the constant into the RDSVL immediate and +// compensating with an extra shift. +// +// We rewrite: +// (mul (srl (rdsvl 1), w), x) +// to one of: +// (shl (rdsvl y), z) if z > 0 +// (srl (rdsvl y), abs(z)) if z < 0 +// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31]. +static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) { + SDLoc DL(Mul); + EVT VT = Mul->getValueType(0); + SDValue MulOp0 = Mul->getOperand(0); + int ConstMultiplier = + cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue(); + if ((MulOp0->getOpcode() != ISD::SRL) || + (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL)) + return SDValue(); + + unsigned AbsConstValue = abs(ConstMultiplier); + unsigned OperandShift = + cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue(); + + // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y + // integral) + int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift; + + // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need: + // 2^(w + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - w (LowerBound). + unsigned B = ConstMultiplier < 0 ? 32 : 31; + unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B) + int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift; + + // No valid solution found. + if (LowerBound > UpperBound) + return SDValue(); + + // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra + // shift if possible. + int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound); + + // y = x / 2^(w + z) + int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) * + (ConstMultiplier < 0 ? -1 : 1); + auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getSignedConstant(RdsvlMul, DL, MVT::i32)); + + if (Shift == 0) + return Rdsvl; + return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl, + DAG.getConstant(abs(Shift), DL, MVT::i32), + SDNodeFlags::Exact); +} + // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz // Same for other types with equivalent constants. 
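+// [Editor's worked example, not part of the patch] For performMulRdsvlCombine
+// above: with w = 3 and x = 48, i.e. (mul (srl (rdsvl 1), 3), 48),
+//   UpperBound = ctz(48) - 3 = 4 - 3 = 1
+//   LowerBound = ceil_log2(ceil(48 / 31)) - 3 = 1 - 3 = -2
+// so z = 0 is chosen and y = 48 >> 3 = 6, giving (rdsvl 6) with no extra shift
+// (48 = 6 * 2^3, and 6 lies in [-32, 31]). With x = 4 instead, the bounds force
+// z = -1 and y = 1, producing (srl (rdsvl 1), 1).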
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { @@ -19604,6 +19661,9 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, if (!isa<ConstantSDNode>(N1)) return SDValue(); + if (SDValue Ext = performMulRdsvlCombine(N, DAG)) + return Ext; + ConstantSDNode *C = cast<ConstantSDNode>(N1); const APInt &ConstValue = C->getAPIntValue(); @@ -26665,11 +26725,34 @@ static SDValue performDUPCombine(SDNode *N, } if (N->getOpcode() == AArch64ISD::DUP) { + SDValue Op = N->getOperand(0); + + // Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer. + // For example: + // v4i32 = DUP (i32 (zextloadi8 addr)) + // => + // v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0 + // v4i32 = DUPLANE32 (v4i32), 0 + if (auto *LD = dyn_cast<LoadSDNode>(Op)) { + ISD::LoadExtType ExtType = LD->getExtensionType(); + EVT MemVT = LD->getMemoryVT(); + EVT ElemVT = VT.getVectorElementType(); + if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) && + (MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) && + ElemVT != MemVT && LD->hasOneUse()) { + EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT, + 128 / ElemVT.getSizeInBits()); + SDValue ScalarToVec = + DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op); + return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec, + DCI.DAG.getConstant(0, DL, MVT::i64)); + } + } + // If the instruction is known to produce a scalar in SIMD registers, we can // duplicate it across the vector lanes using DUPLANE instead of moving it // to a GPR first. For example, this allows us to handle: // v4i32 = DUP (i32 (FCMGT (f32, f32))) - SDValue Op = N->getOperand(0); // FIXME: Ideally, we should be able to handle all instructions that // produce a scalar value in FPRs. if (Op.getOpcode() == AArch64ISD::FCMEQ || @@ -29430,15 +29513,6 @@ void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { TargetLowering::insertSSPDeclarations(M); } -Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { - // MSVC CRT has a function to validate security cookie. - RTLIB::LibcallImpl SecurityCheckCookieLibcall = - getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); - if (SecurityCheckCookieLibcall != RTLIB::Unsupported) - return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall)); - return TargetLowering::getSSPStackGuardCheck(M); -} - Value * AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { // Android provides a fixed TLS slot for the SafeStack pointer. See the @@ -29447,11 +29521,6 @@ AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { if (Subtarget->isTargetAndroid()) return UseTlsOffset(IRB, 0x48); - // Fuchsia is similar. - // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. 
- if (Subtarget->isTargetFuchsia()) - return UseTlsOffset(IRB, -0x8); - return TargetLowering::getSafeStackPointerLocation(IRB); } @@ -29769,7 +29838,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { // Checks to allow the use of SME instructions if (auto *Base = dyn_cast<CallBase>(&Inst)) { - auto CallAttrs = SMECallAttrs(*Base, this); + auto CallAttrs = SMECallAttrs(*Base, &getRuntimeLibcallsInfo()); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 9495c9f..2cb8ed2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -366,7 +366,6 @@ public: Value *getIRStackGuard(IRBuilderBase &IRB) const override; void insertSSPDeclarations(Module &M) const override; - Function *getSSPStackGuardCheck(const Module &M) const override; /// If the target has a standard location for the unsafe stack pointer, /// returns the address of that location. Otherwise, returns nullptr. diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 09ce713..58a53af 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1894,6 +1894,21 @@ def btihint_op : Operand<i32> { }]; } +def CMHPriorityHintOperand : AsmOperandClass { + let Name = "CMHPriorityHint"; + let ParserMethod = "tryParseCMHPriorityHint"; +} + +def CMHPriorityHint_op : Operand<i32> { + let ParserMatchClass = CMHPriorityHintOperand; + let PrintMethod = "printCMHPriorityHintOp"; + let MCOperandPredicate = [{ + if (!MCOp.isImm()) + return false; + return AArch64CMHPriorityHint::lookupCMHPriorityHintByEncoding(MCOp.getImm()) != nullptr; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; @@ -4636,6 +4651,48 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterOperand regtype, GPR64sp:$Rn, 0)>; } +class BaseLoadStoreAcquirePairOffset<bits<4> opc, bit L, dag oops, dag iops, + string asm> + : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, #0]", "", []> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{31-23} = 0b110110010; + let Inst{22} = L; + let Inst{21} = 0b0; + let Inst{20-16} = Rt2; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass LoadAcquirePairOffset<bits<4> opc, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStoreAcquirePairOffset<opc, 0b1, + (outs GPR64:$Rt, GPR64:$Rt2), + (ins GPR64sp:$Rn), asm>, + Sched<[WriteAtomic, WriteLDHi]>; + + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", + (!cast<Instruction>(NAME # "i") GPR64:$Rt, GPR64:$Rt2, + GPR64sp:$Rn)>; +} + + +multiclass StoreAcquirePairOffset<bits<4> opc, string asm> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in + def i : BaseLoadStoreAcquirePairOffset<opc, 0b0, (outs), + (ins GPR64:$Rt, GPR64:$Rt2, + GPR64sp:$Rn), + asm>, + Sched<[WriteSTP]>; + + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", + (!cast<Instruction>(NAME # "i") GPR64:$Rt, GPR64:$Rt2, + GPR64sp:$Rn)>; +} + // (pre-indexed) class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops, string asm> @@ -5241,7 +5298,7 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, } multiclass 
FPToIntegerSIMDScalar<bits<2> rmode, bits<3> opcode, string asm, - SDPatternOperator OpN = null_frag> { + SDPatternOperator OpN> { // double-precision to 32-bit SIMD/FPR def SDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, FPR32, asm, [(set FPR32:$Rd, (i32 (OpN (f64 FPR64:$Rn))))]> { @@ -6481,8 +6538,7 @@ multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm, } multiclass SIMDThreeSameVectorMLA<bit Q, string asm, SDPatternOperator op> { - - def v8f16 : BaseSIMDThreeSameVectorDot<Q, 0b0, 0b11, 0b1111, asm, ".8h", ".16b", + def v16i8_v8f16 : BaseSIMDThreeSameVectorDot<Q, 0b0, 0b11, 0b1111, asm, ".8h", ".16b", V128, v8f16, v16i8, op>; } @@ -6491,6 +6547,23 @@ multiclass SIMDThreeSameVectorMLAL<bit Q, bits<2> sz, string asm, SDPatternOpera V128, v4f32, v16i8, op>; } +multiclass SIMDThreeSameVectorFMLA<string asm> { + def v8f16_v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b11, 0b1101, asm, ".8h", ".8h", + V128, v8f16, v8f16, null_frag>; +} + +multiclass SIMDThreeSameVectorFMLAWiden<string asm> { + def v8f16_v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1101, asm, ".4s", ".8h", + V128, v4f32, v8f16, null_frag>; +} + +multiclass SIMDThreeSameVectorFDot<string asm, SDPatternOperator OpNode = null_frag> { + def v4f16_v2f32 : BaseSIMDThreeSameVectorDot<0, 0, 0b10, 0b1111, asm, ".2s", ".4h", V64, + v2f32, v4f16, OpNode>; + def v8f16_v4f32 : BaseSIMDThreeSameVectorDot<1, 0, 0b10, 0b1111, asm, ".4s", ".8h", V128, + v4f32, v8f16, OpNode>; +} + // FP8 assembly/disassembly classes //---------------------------------------------------------------------------- @@ -9112,6 +9185,13 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm, V128, V128_lo, v4f32, v8f16, VectorIndexH, OpNode>; } +multiclass SIMDThreeSameVectorFDOTIndex<string asm> { + def v4f16_v2f32 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b01, 0b1001, asm, ".2s", ".4h", ".2h", + V64, v2f32, v4f16, VectorIndexS, null_frag>; + def v8f16_v4f32 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b01, 0b1001, asm, ".4s", ".8h",".2h", + V128, v4f32, v8f16, VectorIndexS, null_frag>; +} + //---------------------------------------------------------------------------- // FP8 Advanced SIMD vector x indexed element multiclass SIMD_FP8_Dot2_Index<string asm, SDPatternOperator op> { @@ -13227,3 +13307,34 @@ multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{ let Predicates = [HasNEON, HasF8F32MM]; } } + +//---------------------------------------------------------------------------- +// Contention Management Hints - FEAT_CMH +//---------------------------------------------------------------------------- + +class SHUHInst<string asm> : I< + (outs), + (ins CMHPriorityHint_op:$priority), + asm, "\t$priority", "", []>, Sched<[]> { + bits<1> priority; + let Inst{31-12} = 0b11010101000000110010; + let Inst{11-8} = 0b0110; + let Inst{7-6} = 0b01; + let Inst{5} = priority; + let Inst{4-0} = 0b11111; +} + +multiclass SHUH<string asm> { + def NAME : SHUHInst<asm>; + def : InstAlias<asm, (!cast<Instruction>(NAME) 0), 1>; +} + +class STCPHInst<string asm> : I< + (outs), + (ins), + asm, "", "", []>, Sched<[]> { + let Inst{31-12} = 0b11010101000000110010; + let Inst{11-8} = 0b0110; + let Inst{7-5} = 0b100; + let Inst{4-0} = 0b11111; +} diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 92f260f..b9e299e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -50,63 +50,44 @@ def HasV9_4a : 
Predicate<"Subtarget->hasV9_4aOps()">, AssemblerPredicateWithAll<(all_of HasV9_4aOps), "armv9.4a">; def HasV8_0r : Predicate<"Subtarget->hasV8_0rOps()">, AssemblerPredicateWithAll<(all_of HasV8_0rOps), "armv8-r">; - def HasEL2VMSA : Predicate<"Subtarget->hasEL2VMSA()">, - AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">; - + AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">; def HasEL3 : Predicate<"Subtarget->hasEL3()">, - AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">; - + AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">; def HasVH : Predicate<"Subtarget->hasVH()">, - AssemblerPredicateWithAll<(all_of FeatureVH), "vh">; - + AssemblerPredicateWithAll<(all_of FeatureVH), "vh">; def HasLOR : Predicate<"Subtarget->hasLOR()">, - AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">; - + AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">; def HasPAuth : Predicate<"Subtarget->hasPAuth()">, - AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">; - + AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">; def HasPAuthLR : Predicate<"Subtarget->hasPAuthLR()">, - AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">; - + AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">; def HasJS : Predicate<"Subtarget->hasJS()">, - AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">; - + AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">; def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">, - AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">; - -def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">, - AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">; - + AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">; +def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">, + AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">; def HasNV : Predicate<"Subtarget->hasNV()">, - AssemblerPredicateWithAll<(all_of FeatureNV), "nv">; - + AssemblerPredicateWithAll<(all_of FeatureNV), "nv">; def HasMPAM : Predicate<"Subtarget->hasMPAM()">, - AssemblerPredicateWithAll<(all_of FeatureMPAM), "mpam">; - + AssemblerPredicateWithAll<(all_of FeatureMPAM), "mpam">; def HasDIT : Predicate<"Subtarget->hasDIT()">, - AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">; - -def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">, - AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">; - + AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">; +def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">, + AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">; def HasAM : Predicate<"Subtarget->hasAM()">, - AssemblerPredicateWithAll<(all_of FeatureAM), "am">; - + AssemblerPredicateWithAll<(all_of FeatureAM), "am">; def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, - AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">; - -def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, - AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">; - + AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">; +def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, + AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">; def HasFlagM : Predicate<"Subtarget->hasFlagM()">, - AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">; - -def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPC_IMMO()">, - AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">; - + AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">; +def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPC_IMMO()">, + 
AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">; + AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">; def HasNEON : Predicate<"Subtarget->isNeonAvailable()">, AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">; def HasSM4 : Predicate<"Subtarget->hasSM4()">, @@ -149,13 +130,13 @@ def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">; def HasSVE2p1 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">, AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">; -def HasSVEAES : Predicate<"Subtarget->hasSVEAES()">, +def HasSVEAES : Predicate<"Subtarget->hasSVEAES()">, AssemblerPredicateWithAll<(all_of FeatureSVEAES), "sve-aes">; -def HasSVESM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVESM4()">, +def HasSVESM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVESM4()">, AssemblerPredicateWithAll<(all_of FeatureSVESM4), "sve-sm4">; -def HasSVESHA3 : Predicate<"Subtarget->hasSVESHA3()">, +def HasSVESHA3 : Predicate<"Subtarget->hasSVESHA3()">, AssemblerPredicateWithAll<(all_of FeatureSVESHA3), "sve-sha3">; -def HasSVEBitPerm : Predicate<"Subtarget->hasSVEBitPerm()">, +def HasSVEBitPerm : Predicate<"Subtarget->hasSVEBitPerm()">, AssemblerPredicateWithAll<(all_of FeatureSVEBitPerm), "sve-bitperm">; def HasSMEandIsNonStreamingSafe : Predicate<"Subtarget->hasSME()">, @@ -196,7 +177,7 @@ def HasSSVE_FP8DOT2 : Predicate<"Subtarget->hasSSVE_FP8DOT2() || " "(Subtarget->hasSVE2() && Subtarget->hasFP8DOT2())">, AssemblerPredicateWithAll<(any_of FeatureSSVE_FP8DOT2, (all_of FeatureSVE2, FeatureFP8DOT2)), - "ssve-fp8dot2 or (sve2 and fp8dot2)">; + "ssve-fp8dot2 or (sve2 and fp8dot2)">; def HasFP8DOT4 : Predicate<"Subtarget->hasFP8DOT4()">, AssemblerPredicateWithAll<(all_of FeatureFP8DOT4), "fp8dot4">; def HasSSVE_FP8DOT4 : Predicate<"Subtarget->hasSSVE_FP8DOT4() || " @@ -204,43 +185,60 @@ def HasSSVE_FP8DOT4 : Predicate<"Subtarget->hasSSVE_FP8DOT4() || " AssemblerPredicateWithAll<(any_of FeatureSSVE_FP8DOT4, (all_of FeatureSVE2, FeatureFP8DOT4)), "ssve-fp8dot4 or (sve2 and fp8dot4)">; -def HasLUT : Predicate<"Subtarget->hasLUT()">, +def HasLUT : Predicate<"Subtarget->hasLUT()">, AssemblerPredicateWithAll<(all_of FeatureLUT), "lut">; -def HasSME_LUTv2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">, +def HasSME_LUTv2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">, AssemblerPredicateWithAll<(all_of FeatureSME_LUTv2), "sme-lutv2">; -def HasSMEF8F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">, +def HasSMEF8F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">, AssemblerPredicateWithAll<(all_of FeatureSMEF8F16), "sme-f8f16">; -def HasSMEF8F32 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">, +def HasSMEF8F32 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">, AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">; -def HasSME_MOP4 : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_MOP4())">, +def HasSME_MOP4 : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_MOP4())">, AssemblerPredicateWithAll<(all_of FeatureSME_MOP4), "sme-mop4">; -def HasSME_TMOP : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_TMOP())">, +def HasSME_TMOP : Predicate<"(Subtarget->isStreaming() && 
Subtarget->hasSME_TMOP())">, AssemblerPredicateWithAll<(all_of FeatureSME_TMOP), "sme-tmop">; - -def HasCMPBR : Predicate<"Subtarget->hasCMPBR()">, +def HasCMPBR : Predicate<"Subtarget->hasCMPBR()">, AssemblerPredicateWithAll<(all_of FeatureCMPBR), "cmpbr">; -def HasF8F32MM : Predicate<"Subtarget->hasF8F32MM()">, +def HasF8F32MM : Predicate<"Subtarget->hasF8F32MM()">, AssemblerPredicateWithAll<(all_of FeatureF8F32MM), "f8f32mm">; -def HasF8F16MM : Predicate<"Subtarget->hasF8F16MM()">, +def HasF8F16MM : Predicate<"Subtarget->hasF8F16MM()">, AssemblerPredicateWithAll<(all_of FeatureF8F16MM), "f8f16mm">; -def HasFPRCVT : Predicate<"Subtarget->hasFPRCVT()">, +def HasFPRCVT : Predicate<"Subtarget->hasFPRCVT()">, AssemblerPredicateWithAll<(all_of FeatureFPRCVT), "fprcvt">; -def HasLSFE : Predicate<"Subtarget->hasLSFE()">, +def HasLSFE : Predicate<"Subtarget->hasLSFE()">, AssemblerPredicateWithAll<(all_of FeatureLSFE), "lsfe">; -def HasSME2p2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">, +def HasSME2p2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">, AssemblerPredicateWithAll<(all_of FeatureSME2p2), "sme2p2">; -def HasSVEAES2 : Predicate<"Subtarget->hasSVEAES2()">, +def HasSVEAES2 : Predicate<"Subtarget->hasSVEAES2()">, AssemblerPredicateWithAll<(all_of FeatureSVEAES2), "sve-aes2">; -def HasSVEBFSCALE : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">, +def HasSVEBFSCALE : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">, AssemblerPredicateWithAll<(all_of FeatureSVEBFSCALE), "sve-bfscale">; -def HasSVE_F16F32MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">, +def HasSVE_F16F32MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">, AssemblerPredicateWithAll<(all_of FeatureSVE_F16F32MM), "sve-f16f32mm">; def HasPCDPHINT : Predicate<"Subtarget->hasPCDPHINT()">, - AssemblerPredicateWithAll<(all_of FeaturePCDPHINT), "pcdphint">; + AssemblerPredicateWithAll<(all_of FeaturePCDPHINT), "pcdphint">; def HasLSUI : Predicate<"Subtarget->hasLSUI()">, - AssemblerPredicateWithAll<(all_of FeatureLSUI), "lsui">; + AssemblerPredicateWithAll<(all_of FeatureLSUI), "lsui">; def HasOCCMO : Predicate<"Subtarget->hasOCCMO()">, - AssemblerPredicateWithAll<(all_of FeatureOCCMO), "occmo">; + AssemblerPredicateWithAll<(all_of FeatureOCCMO), "occmo">; +def HasCMH : Predicate<"Subtarget->hasCMH()">, + AssemblerPredicateWithAll<(all_of FeatureCMH), "cmh">; +def HasLSCP : Predicate<"Subtarget->hasLSCP()">, + AssemblerPredicateWithAll<(all_of FeatureLSCP), "lscp">; +def HasSVE2p2 : Predicate<"Subtarget->hasSVE2p2()">, + AssemblerPredicateWithAll<(all_of FeatureSVE2p2), "sve2p2">; +def HasSVE_B16MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_B16MM()">, + AssemblerPredicateWithAll<(all_of FeatureSVE_B16MM), "sve-b16mm">; +def HasF16MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasF16MM()">, + AssemblerPredicateWithAll<(all_of FeatureF16MM), "f16mm">; +def HasSVE2p3 : Predicate<"Subtarget->hasSVE2p3()">, + AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">; +def HasSME2p3 : Predicate<"Subtarget->hasSME2p3()">, + AssemblerPredicateWithAll<(all_of FeatureSME2p3), "sme2p3">; +def HasF16F32DOT : Predicate<"Subtarget->hasF16F32DOT()">, + AssemblerPredicateWithAll<(all_of FeatureF16F32DOT), "f16f32dot">; +def HasF16F32MM : Predicate<"Subtarget->hasF16F32MM()">, + AssemblerPredicateWithAll<(all_of FeatureF16F32MM), "f16f32mm">; // A subset 
of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. @@ -310,6 +308,10 @@ def HasSVE2p2_or_SME2p2 : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">, AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2), "sme2p2 or sve2p2">; +def HasSVE2p3_or_SME2p3 + : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p3() || Subtarget->hasSME2p3())">, + AssemblerPredicateWithAll<(any_of FeatureSME2p3, FeatureSVE2p3), + "sme2p3 or sve2p3">; def HasNonStreamingSVE2p2_or_SME2p2 : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p2()) ||" "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, @@ -328,100 +330,110 @@ def HasNEONandIsStreamingSafe AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">; // A subset of NEON instructions are legal in Streaming SVE mode only with +sme2p2. def HasNEONandIsSME2p2StreamingSafe - : Predicate<"Subtarget->isNeonAvailable() || (Subtarget->hasNEON() && Subtarget->hasSME2p2())">, - AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">; + : Predicate<"Subtarget->isNeonAvailable() || (Subtarget->hasNEON() && Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, - AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">; + AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">; def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">, - AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">; + AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">; def HasSB : Predicate<"Subtarget->hasSB()">, - AssemblerPredicateWithAll<(all_of FeatureSB), "sb">; -def HasPredRes : Predicate<"Subtarget->hasPredRes()">, - AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">; + AssemblerPredicateWithAll<(all_of FeatureSB), "sb">; +def HasPredRes : Predicate<"Subtarget->hasPredRes()">, + AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">; def HasCCDP : Predicate<"Subtarget->hasCCDP()">, - AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">; + AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">; def HasBTI : Predicate<"Subtarget->hasBTI()">, - AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), "bti">; + AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), "bti">; def HasMTE : Predicate<"Subtarget->hasMTE()">, - AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">; + AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">; def HasTME : Predicate<"Subtarget->hasTME()">, - AssemblerPredicateWithAll<(all_of FeatureTME), "tme">; + AssemblerPredicateWithAll<(all_of FeatureTME), "tme">; def HasETE : Predicate<"Subtarget->hasETE()">, - AssemblerPredicateWithAll<(all_of FeatureETE), "ete">; + AssemblerPredicateWithAll<(all_of FeatureETE), "ete">; def HasTRBE : Predicate<"Subtarget->hasTRBE()">, - AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">; + AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">; def HasBF16 : Predicate<"Subtarget->hasBF16()">, - AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">; + AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">; def HasNoBF16 : Predicate<"!Subtarget->hasBF16()">; def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, - 
AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">; def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, - AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">; def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, - AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">; def HasXS : Predicate<"Subtarget->hasXS()">, - AssemblerPredicateWithAll<(all_of FeatureXS), "xs">; + AssemblerPredicateWithAll<(all_of FeatureXS), "xs">; def HasWFxT : Predicate<"Subtarget->hasWFxT()">, - AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">; + AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">; def HasLS64 : Predicate<"Subtarget->hasLS64()">, - AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">; + AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">; def HasBRBE : Predicate<"Subtarget->hasBRBE()">, - AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">; + AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">; def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">, - AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">; + AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">; def HasHBC : Predicate<"Subtarget->hasHBC()">, - AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">; + AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">; def HasMOPS : Predicate<"Subtarget->hasMOPS()">, - AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">; + AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">; def HasCLRBHB : Predicate<"Subtarget->hasCLRBHB()">, - AssemblerPredicateWithAll<(all_of FeatureCLRBHB), "clrbhb">; + AssemblerPredicateWithAll<(all_of FeatureCLRBHB), "clrbhb">; def HasSPECRES2 : Predicate<"Subtarget->hasSPECRES2()">, - AssemblerPredicateWithAll<(all_of FeatureSPECRES2), "specres2">; + AssemblerPredicateWithAll<(all_of FeatureSPECRES2), "specres2">; def HasITE : Predicate<"Subtarget->hasITE()">, - AssemblerPredicateWithAll<(all_of FeatureITE), "ite">; + AssemblerPredicateWithAll<(all_of FeatureITE), "ite">; def HasTHE : Predicate<"Subtarget->hasTHE()">, - AssemblerPredicateWithAll<(all_of FeatureTHE), "the">; + AssemblerPredicateWithAll<(all_of FeatureTHE), "the">; def HasRCPC3 : Predicate<"Subtarget->hasRCPC3()">, - AssemblerPredicateWithAll<(all_of FeatureRCPC3), "rcpc3">; + AssemblerPredicateWithAll<(all_of FeatureRCPC3), "rcpc3">; def HasLSE128 : Predicate<"Subtarget->hasLSE128()">, - AssemblerPredicateWithAll<(all_of FeatureLSE128), "lse128">; + AssemblerPredicateWithAll<(all_of FeatureLSE128), "lse128">; def HasD128 : Predicate<"Subtarget->hasD128()">, - AssemblerPredicateWithAll<(all_of FeatureD128), "d128">; + AssemblerPredicateWithAll<(all_of FeatureD128), "d128">; def HasCHK : Predicate<"Subtarget->hasCHK()">, - AssemblerPredicateWithAll<(all_of FeatureCHK), "chk">; + AssemblerPredicateWithAll<(all_of FeatureCHK), "chk">; def HasGCS : Predicate<"Subtarget->hasGCS()">, - AssemblerPredicateWithAll<(all_of FeatureGCS), "gcs">; + AssemblerPredicateWithAll<(all_of FeatureGCS), "gcs">; def HasCPA : Predicate<"Subtarget->hasCPA()">, - AssemblerPredicateWithAll<(all_of FeatureCPA), "cpa">; + AssemblerPredicateWithAll<(all_of FeatureCPA), "cpa">; +def HasTLBID : Predicate<"Subtarget->hasTLBID()">, + AssemblerPredicateWithAll<(all_of FeatureTLBID), "tlbid">; +def HasMPAMv2 : Predicate<"Subtarget->hasMPAMv2()">, + 
AssemblerPredicateWithAll<(all_of FeatureMPAMv2), "mpamv2">; +def HasMTETC : Predicate<"Subtarget->hasMTETC()">, + AssemblerPredicateWithAll<(all_of FeatureMTETC), "mtetc">; +def HasGCIE : Predicate<"Subtarget->hasGCIE()">, + AssemblerPredicateWithAll<(all_of FeatureGCIE), "gcie">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; def UseExperimentalZeroingPseudos - : Predicate<"Subtarget->useExperimentalZeroingPseudos()">; + : Predicate<"Subtarget->useExperimentalZeroingPseudos()">; def UseAlternateSExtLoadCVTF32 - : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; + : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; def UseNegativeImmediates - : Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)), - "NegativeImmediates">; + : Predicate<"false">, + AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)), + "NegativeImmediates">; -def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">; +def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">; def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; -def HasFastIncVL : Predicate<"!Subtarget->hasDisableFastIncVL()">; +def HasFastIncVL : Predicate<"!Subtarget->hasDisableFastIncVL()">; -def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; +def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; -def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">; +def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">; def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; -def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">; +def AllowMisalignedMemAccesses + : Predicate<"!Subtarget->requiresStrictAlign()">; def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">; @@ -3692,6 +3704,12 @@ def UDF : UDFType<0, "udf">; // Load instructions. 
//===----------------------------------------------------------------------===// +let Predicates = [HasLSCP] in { +defm LDAP : LoadAcquirePairOffset<0b0101, "ldap">; +defm LDAPP : LoadAcquirePairOffset<0b0111, "ldapp">; +defm STLP : StoreAcquirePairOffset<0b0101, "stlp">; +} + // Pair (indexed, offset) defm LDPW : LoadPairOffset<0b00, 0, GPR32z, simm7s4, "ldp">; defm LDPX : LoadPairOffset<0b10, 0, GPR64z, simm7s8, "ldp">; @@ -4004,22 +4022,6 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; -// load zero-extended i32, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -// load zero-extended i16, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -// load zero-extended i8, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -// load zero-extended i16, bitcast to f32 -def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -// load zero-extended i8, bitcast to f32 -def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; - // Pre-fetch. def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", [(AArch64Prefetch timm:$Rt, @@ -4371,6 +4373,64 @@ def : Pat <(v1i64 (scalar_to_vector (i64 (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))), (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>; +// Patterns for bitconvert or scalar_to_vector of load operations. +// Enables direct SIMD register loads for small integer types (i8/i16) that are +// naturally zero-extended to i32/i64. +multiclass ExtLoad8_16AllModes<ValueType OutTy, ValueType InnerTy, + SDPatternOperator OuterOp, + PatFrags LoadOp8, PatFrags LoadOp16> { + // 8-bit loads. + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (SUBREG_TO_REG (i64 0), (LDURBi GPR64sp:$Rn, simm9:$offset), bsub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend))))), + (SUBREG_TO_REG (i64 0), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend), bsub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend))))), + (SUBREG_TO_REG (i64 0), (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend), bsub)>; + + // 16-bit loads. 
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (SUBREG_TO_REG (i64 0), (LDURHi GPR64sp:$Rn, simm9:$offset), hsub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend))))), + (SUBREG_TO_REG (i64 0), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend), hsub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend))))), + (SUBREG_TO_REG (i64 0), (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend), hsub)>; +} + +// Extended multiclass that includes 32-bit loads in addition to 8-bit and 16-bit. +multiclass ExtLoad8_16_32AllModes<ValueType OutTy, ValueType InnerTy, + SDPatternOperator OuterOp, + PatFrags LoadOp8, PatFrags LoadOp16, PatFrags LoadOp32> { + defm : ExtLoad8_16AllModes<OutTy, InnerTy, OuterOp, LoadOp8, LoadOp16>; + + // 32-bit loads. + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), + (SUBREG_TO_REG (i64 0), (LDURSi GPR64sp:$Rn, simm9:$offset), ssub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend))))), + (SUBREG_TO_REG (i64 0), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend), ssub)>; + def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend))))), + (SUBREG_TO_REG (i64 0), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend), ssub)>; +} + +// Instantiate bitconvert patterns for floating-point types. +defm : ExtLoad8_16AllModes<f32, i32, bitconvert, zextloadi8, zextloadi16>; +defm : ExtLoad8_16_32AllModes<f64, i64, bitconvert, zextloadi8, zextloadi16, zextloadi32>; + +// Instantiate scalar_to_vector patterns for all vector types. +defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, zextloadi8, zextloadi16, zextloadi32>; +defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, extloadi8, extloadi16, extloadi32>; + // Pre-fetch. defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", [(AArch64Prefetch timm:$Rt, @@ -5235,113 +5295,10 @@ let Predicates = [HasNEON, HasFPRCVT] in{ defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu", int_aarch64_neon_fcvtnu>; defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps", int_aarch64_neon_fcvtps>; defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu", int_aarch64_neon_fcvtpu>; - defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">; - defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">; -} - - -// AArch64's FCVT instructions saturate when out of range. 
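As a hypothetical illustration (not from the patch) of the f32 bitconvert case covered by ExtLoad8_16AllModes above: a zero-extended 16-bit load whose bits are reinterpreted as float matches (f32 (bitconvert (i32 (zextloadi16 ...)))) and can be selected as a single LDRH straight into the FPR's hsub subregister, rather than a GPR load followed by an fmov.

#include <cstdint>
#include <cstring>

// Hypothetical example matching the
// ExtLoad8_16AllModes<f32, i32, bitconvert, zextloadi8, zextloadi16>
// instantiation above.
float load_u16_bits_as_float(const uint16_t *p) {
  uint32_t bits = *p;                // zextloadi16 -> i32
  float f;
  std::memcpy(&f, &bits, sizeof f);  // bitconvert i32 -> f32
  return f;
}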
-multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> { - let Predicates = [HasFullFP16] in { - def : Pat<(i32 (to_int_sat f16:$Rn, i32)), - (!cast<Instruction>(INST # UWHr) f16:$Rn)>; - def : Pat<(i64 (to_int_sat f16:$Rn, i64)), - (!cast<Instruction>(INST # UXHr) f16:$Rn)>; - } - def : Pat<(i32 (to_int_sat f32:$Rn, i32)), - (!cast<Instruction>(INST # UWSr) f32:$Rn)>; - def : Pat<(i64 (to_int_sat f32:$Rn, i64)), - (!cast<Instruction>(INST # UXSr) f32:$Rn)>; - def : Pat<(i32 (to_int_sat f64:$Rn, i32)), - (!cast<Instruction>(INST # UWDr) f64:$Rn)>; - def : Pat<(i64 (to_int_sat f64:$Rn, i64)), - (!cast<Instruction>(INST # UXDr) f64:$Rn)>; - - let Predicates = [HasFullFP16] in { - def : Pat<(i32 (to_int_sat_gi f16:$Rn)), - (!cast<Instruction>(INST # UWHr) f16:$Rn)>; - def : Pat<(i64 (to_int_sat_gi f16:$Rn)), - (!cast<Instruction>(INST # UXHr) f16:$Rn)>; - } - def : Pat<(i32 (to_int_sat_gi f32:$Rn)), - (!cast<Instruction>(INST # UWSr) f32:$Rn)>; - def : Pat<(i64 (to_int_sat_gi f32:$Rn)), - (!cast<Instruction>(INST # UXSr) f32:$Rn)>; - def : Pat<(i32 (to_int_sat_gi f64:$Rn)), - (!cast<Instruction>(INST # UWDr) f64:$Rn)>; - def : Pat<(i64 (to_int_sat_gi f64:$Rn)), - (!cast<Instruction>(INST # UXDr) f64:$Rn)>; - - let Predicates = [HasFullFP16] in { - def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)), - (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; - def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)), - (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; - } - def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)), - (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; - def : Pat<(i64 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i64:$scale), i64)), - (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; - def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)), - (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; - def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)), - (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; - - let Predicates = [HasFullFP16] in { - def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), - (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; - def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), - (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; - } - def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), - (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; - def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), - (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; - def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), - (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; - def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), - (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; -} - -defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">; -defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">; - -multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, string INST> { - def : Pat<(i32 (to_int (round f32:$Rn))), - (!cast<Instruction>(INST # UWSr) f32:$Rn)>; - def : Pat<(i64 (to_int (round f32:$Rn))), - (!cast<Instruction>(INST # UXSr) f32:$Rn)>; - def : Pat<(i32 (to_int (round f64:$Rn))), - (!cast<Instruction>(INST # UWDr) f64:$Rn)>; - def : Pat<(i64 (to_int (round f64:$Rn))), - (!cast<Instruction>(INST # UXDr) f64:$Rn)>; - - // These instructions saturate like fp_to_[su]int_sat. 
- let Predicates = [HasFullFP16] in { - def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)), - (!cast<Instruction>(INST # UWHr) f16:$Rn)>; - def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)), - (!cast<Instruction>(INST # UXHr) f16:$Rn)>; - } - def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)), - (!cast<Instruction>(INST # UWSr) f32:$Rn)>; - def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)), - (!cast<Instruction>(INST # UXSr) f32:$Rn)>; - def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)), - (!cast<Instruction>(INST # UWDr) f64:$Rn)>; - def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)), - (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs", any_fp_to_sint>; + defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu", any_fp_to_uint>; } -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fceil, "FCVTPS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fceil, "FCVTPU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ffloor, "FCVTMS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ffloor, "FCVTMU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ftrunc, "FCVTZS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ftrunc, "FCVTZU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fround, "FCVTAS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">; - let Predicates = [HasFullFP16] in { @@ -6549,8 +6506,8 @@ defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu", int_aarch64_neon_fcvtn defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps", int_aarch64_neon_fcvtps>; defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu", int_aarch64_neon_fcvtpu>; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; +defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu", any_fp_to_uint>; defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">; defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; @@ -6570,6 +6527,7 @@ defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", // Floating-point conversion patterns. 
multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> { + let Predicates = [HasFPRCVT] in { def : Pat<(f32 (bitconvert (i32 (OpN (f64 FPR64:$Rn))))), (!cast<Instruction>(INST # SDr) FPR64:$Rn)>; def : Pat<(f32 (bitconvert (i32 (OpN (f16 FPR16:$Rn))))), @@ -6578,6 +6536,7 @@ multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> { (!cast<Instruction>(INST # DHr) FPR16:$Rn)>; def : Pat<(f64 (bitconvert (i64 (OpN (f32 FPR32:$Rn))))), (!cast<Instruction>(INST # DSr) FPR32:$Rn)>; + } def : Pat<(f32 (bitconvert (i32 (OpN (f32 FPR32:$Rn))))), (!cast<Instruction>(INST # v1i32) FPR32:$Rn)>; def : Pat<(f64 (bitconvert (i64 (OpN (f64 FPR64:$Rn))))), @@ -6592,6 +6551,8 @@ defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtns, "FCVTNS">; defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtnu, "FCVTNU">; defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtps, "FCVTPS">; defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtpu, "FCVTPU">; +defm: FPToIntegerSIMDScalarPatterns<any_fp_to_sint, "FCVTZS">; +defm: FPToIntegerSIMDScalarPatterns<any_fp_to_uint, "FCVTZU">; multiclass FPToIntegerIntPats<Intrinsic round, string INST> { let Predicates = [HasFullFP16] in { @@ -6648,6 +6609,196 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> { defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">; defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">; +// AArch64's FCVT instructions saturate when out of range. +multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> { + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat f16:$Rn, i32)), + (!cast<Instruction>(INST # UWHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat f16:$Rn, i64)), + (!cast<Instruction>(INST # UXHr) f16:$Rn)>; + } + def : Pat<(i32 (to_int_sat f32:$Rn, i32)), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int_sat f32:$Rn, i64)), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int_sat f64:$Rn, i32)), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int_sat f64:$Rn, i64)), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat_gi f16:$Rn)), + (!cast<Instruction>(INST # UWHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f16:$Rn)), + (!cast<Instruction>(INST # UXHr) f16:$Rn)>; + } + def : Pat<(i32 (to_int_sat_gi f32:$Rn)), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f32:$Rn)), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int_sat_gi f64:$Rn)), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f64:$Rn)), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + + // For global-isel we can use register classes to determine + // which FCVT instruction to use. 
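As a hypothetical illustration (not from the patch) of what the any_fp_to_sint instantiation above enables under HasFPRCVT: when a float-to-int result is immediately bit-reinterpreted as a floating-point value, the sequence matches the (f32 (bitconvert (i32 (OpN (f64 ...))))) pattern and can stay in SIMD/FP registers, roughly selecting to a single fcvtzs s0, d0 instead of a GPR-destination fcvtzs plus an fmov.

#include <cstdint>
#include <cstring>

// Hypothetical example: fp_to_sint of a double followed by a bitcast of the
// i32 result to f32, matching the FCVTZS SDr pattern above.
float fcvt_bits(double d) {
  int32_t i = static_cast<int32_t>(d);  // any_fp_to_sint (fcvtzs)
  float f;
  std::memcpy(&f, &i, sizeof f);        // bitconvert i32 -> f32
  return f;
}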
+ let Predicates = [HasFPRCVT] in { + def : Pat<(i32 (to_int_sat_gi f16:$Rn)), + (!cast<Instruction>(INST # SHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f16:$Rn)), + (!cast<Instruction>(INST # DHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f32:$Rn)), + (!cast<Instruction>(INST # DSr) f32:$Rn)>; + def : Pat<(i32 (to_int_sat_gi f64:$Rn)), + (!cast<Instruction>(INST # SDr) f64:$Rn)>; + } + def : Pat<(i32 (to_int_sat_gi f32:$Rn)), + (!cast<Instruction>(INST # v1i32) f32:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f64:$Rn)), + (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + + let Predicates = [HasFPRCVT] in { + def : Pat<(f32 (bitconvert (i32 (to_int_sat f16:$Rn, i32)))), + (!cast<Instruction>(INST # SHr) f16:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (to_int_sat f16:$Rn, i64)))), + (!cast<Instruction>(INST # DHr) f16:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (to_int_sat f32:$Rn, i64)))), + (!cast<Instruction>(INST # DSr) f32:$Rn)>; + def : Pat<(f32 (bitconvert (i32 (to_int_sat f64:$Rn, i32)))), + (!cast<Instruction>(INST # SDr) f64:$Rn)>; + } + def : Pat<(f32 (bitconvert (i32 (to_int_sat f32:$Rn, i32)))), + (!cast<Instruction>(INST # v1i32) f32:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (to_int_sat f64:$Rn, i64)))), + (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)), + (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)), + (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; + } + def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)), + (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i64:$scale), i64)), + (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; + def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)), + (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)), + (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; + + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), + (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), + (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; + } + def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), + (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), + (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; + def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), + (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), + (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; +} + +defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">; +defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">; + +multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode to_int_sat_gi, SDNode round, string INST> { + def : Pat<(i32 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + + // For 
global-isel we can use register classes to determine + // which FCVT instruction to use. + let Predicates = [HasFPRCVT] in { + def : Pat<(i64 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # DSr) f32:$Rn)>; + def : Pat<(i32 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # SDr) f64:$Rn)>; + } + def : Pat<(i32 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # v1i32) f32:$Rn)>; + def : Pat<(i64 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + + let Predicates = [HasFPRCVT] in { + def : Pat<(f64 (bitconvert (i64 (to_int (round f32:$Rn))))), + (!cast<Instruction>(INST # DSr) f32:$Rn)>; + def : Pat<(f32 (bitconvert (i32 (to_int (round f64:$Rn))))), + (!cast<Instruction>(INST # SDr) f64:$Rn)>; + } + def : Pat<(f32 (bitconvert (i32 (to_int (round f32:$Rn))))), + (!cast<Instruction>(INST # v1i32) f32:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (to_int (round f64:$Rn))))), + (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + + // These instructions saturate like fp_to_[su]int_sat. + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)), + (!cast<Instruction>(INST # UWHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)), + (!cast<Instruction>(INST # UXHr) f16:$Rn)>; + } + def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + + // For global-isel we can use register classes to determine + // which FCVT instruction to use. + let Predicates = [HasFPRCVT] in { + def : Pat<(i32 (to_int_sat_gi (round f16:$Rn))), + (!cast<Instruction>(INST # SHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat_gi (round f16:$Rn))), + (!cast<Instruction>(INST # DHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat_gi (round f32:$Rn))), + (!cast<Instruction>(INST # DSr) f32:$Rn)>; + def : Pat<(i32 (to_int_sat_gi (round f64:$Rn))), + (!cast<Instruction>(INST # SDr) f64:$Rn)>; + } + def : Pat<(i32 (to_int_sat_gi (round f32:$Rn))), + (!cast<Instruction>(INST # v1i32) f32:$Rn)>; + def : Pat<(i64 (to_int_sat_gi (round f64:$Rn))), + (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + + let Predicates = [HasFPRCVT] in { + def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f16:$Rn), i32)))), + (!cast<Instruction>(INST # SHr) f16:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f16:$Rn), i64)))), + (!cast<Instruction>(INST # DHr) f16:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f32:$Rn), i64)))), + (!cast<Instruction>(INST # DSr) f32:$Rn)>; + def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f64:$Rn), i32)))), + (!cast<Instruction>(INST # SDr) f64:$Rn)>; + } + def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f32:$Rn), i32)))), + (!cast<Instruction>(INST # v1i32) f32:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f64:$Rn), i64)))), + (!cast<Instruction>(INST # v1i64) f64:$Rn)>; +} + +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, fceil, "FCVTPS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, fceil, "FCVTPU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, ffloor, "FCVTMS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, ffloor, "FCVTMU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, 
fp_to_sint_sat_gi, ftrunc, "FCVTZS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, ftrunc, "FCVTZU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, fround, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, fround, "FCVTAU">; + // f16 -> s16 conversions let Predicates = [HasFullFP16] in { def : Pat<(i16(fp_to_sint_sat_gi f16:$Rn)), (FCVTZSv1f16 f16:$Rn)>; @@ -11244,8 +11395,28 @@ let Predicates = [HasLSFE] in { def STBFMINNML : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b111, "stbfminnml">; } +let Predicates = [HasF16F32DOT] in { + defm FDOT :SIMDThreeSameVectorFDot<"fdot">; + defm FDOTlane: SIMDThreeSameVectorFDOTIndex<"fdot">; +} + +let Predicates = [HasF16MM] in + defm FMMLA : SIMDThreeSameVectorFMLA<"fmmla">; + +let Predicates = [HasF16F32MM] in + defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">; + let Uses = [FPMR, FPCR] in -defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">; + defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">; + +//===----------------------------------------------------------------------===// +// Contention Management Hints (FEAT_CMH) +//===----------------------------------------------------------------------===// + +let Predicates = [HasCMH] in { + defm SHUH : SHUH<"shuh">; // Shared Update Hint instruction + def STCPH : STCPHInst<"stcph">; // Store Concurrent Priority Hint instruction +} include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 47144c7..cd94a25 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1341,6 +1341,10 @@ def Z_q : RegisterOperand<ZPR, "printTypedVectorList<0,'q'>"> { let ParserMatchClass = ZPRVectorList<128, 1>; } +def ZZ_Any : RegisterOperand<ZPR2, "printTypedVectorList<0,0>"> { + let ParserMatchClass = ZPRVectorList<0, 2>; +} + def ZZ_b : RegisterOperand<ZPR2, "printTypedVectorList<0,'b'>"> { let ParserMatchClass = ZPRVectorList<8, 2>; } @@ -1361,6 +1365,10 @@ def ZZ_q : RegisterOperand<ZPR2, "printTypedVectorList<0,'q'>"> { let ParserMatchClass = ZPRVectorList<128, 2>; } +def ZZZ_Any : RegisterOperand<ZPR3, "printTypedVectorList<0,0>"> { + let ParserMatchClass = ZPRVectorList<0, 3>; +} + def ZZZ_b : RegisterOperand<ZPR3, "printTypedVectorList<0,'b'>"> { let ParserMatchClass = ZPRVectorList<8, 3>; } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index e552afe..752b185 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -1173,3 +1173,14 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in { defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">; defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">; } + +//===----------------------------------------------------------------------===// +// SME2.3 instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSME2p3] in { + def LUTI6_ZTZ : sme2_lut_single<"luti6">; + def LUTI6_4ZT3Z : sme2_luti6_zt_consecutive<"luti6">; + def LUTI6_S_4ZT3Z : sme2_luti6_zt_strided<"luti6">; + def LUTI6_4Z2Z2ZI : sme2_luti6_vector_vg4_consecutive<"luti6">; + def LUTI6_S_4Z2Z2ZI : sme2_luti6_vector_vg4_strided<"luti6">; +} // [HasSME2p3] diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td 
b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 98a128e..3b268dc 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2569,7 +2569,7 @@ let Predicates = [HasBF16, HasSVE_or_SME] in { } // End HasBF16, HasSVE_or_SME let Predicates = [HasBF16, HasSVE] in { - defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b01, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>; + defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b011, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>; } // End HasBF16, HasSVE let Predicates = [HasBF16, HasSVE_or_SME] in { @@ -3680,15 +3680,15 @@ let Predicates = [HasSVE_or_SME, HasMatMulInt8] in { } // End HasSVE_or_SME, HasMatMulInt8 let Predicates = [HasSVE, HasMatMulFP32] in { - defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0b10, "fmmla", ZPR32, ZPR32, int_aarch64_sve_fmmla, nxv4f32, nxv4f32>; + defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0b101, "fmmla", ZPR32, ZPR32, int_aarch64_sve_fmmla, nxv4f32, nxv4f32>; } // End HasSVE, HasMatMulFP32 let Predicates = [HasSVE_F16F32MM] in { - def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b00, "fmmla", ZPR32, ZPR16>; + def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>; } // End HasSVE_F16F32MM let Predicates = [HasSVE, HasMatMulFP64] in { - defm FMMLA_ZZZ_D : sve_fp_matrix_mla<0b11, "fmmla", ZPR64, ZPR64, int_aarch64_sve_fmmla, nxv2f64, nxv2f64>; + defm FMMLA_ZZZ_D : sve_fp_matrix_mla<0b111, "fmmla", ZPR64, ZPR64, int_aarch64_sve_fmmla, nxv2f64, nxv2f64>; defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>; defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>; defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>; @@ -4272,9 +4272,9 @@ def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$ defm SQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtn", 0b00, int_aarch64_sve_sqcvtn_x2>; defm UQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"uqcvtn", 0b01, int_aarch64_sve_uqcvtn_x2>; defm SQCVTUN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtun", 0b10, int_aarch64_sve_sqcvtun_x2>; -defm SQRSHRN_Z2ZI_StoH : sve2p1_multi_vec_shift_narrow<"sqrshrn", 0b101, int_aarch64_sve_sqrshrn_x2>; -defm UQRSHRN_Z2ZI_StoH : sve2p1_multi_vec_shift_narrow<"uqrshrn", 0b111, int_aarch64_sve_uqrshrn_x2>; -defm SQRSHRUN_Z2ZI_StoH : sve2p1_multi_vec_shift_narrow<"sqrshrun", 0b001, int_aarch64_sve_sqrshrun_x2>; +defm SQRSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqrshrn", 0b101, int_aarch64_sve_sqrshrn_x2>; +defm UQRSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"uqrshrn", 0b111, int_aarch64_sve_uqrshrn_x2>; +defm SQRSHRUN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqrshrun", 0b001, int_aarch64_sve_sqrshrun_x2>; defm WHILEGE_2PXX : sve2p1_int_while_rr_pair<"whilege", 0b000>; defm WHILEGT_2PXX : sve2p1_int_while_rr_pair<"whilegt", 0b001>; @@ -4615,6 +4615,75 @@ let Predicates = [HasSVE2p2_or_SME2p2] in { defm REVD_ZPzZ : sve_int_perm_rev_revd_z<"revd", AArch64revd_mt>; } // End HasSME2p2orSVE2p2 + +//===----------------------------------------------------------------------===// +// SME2.3 or SVE2.3 instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSVE2p3_or_SME2p3] in { + // SVE2 Add pairwise within quadword vector segments (unpredicated) + defm ADDQP_ZZZ : sve2_int_mul<0b110, "addqp", null_frag>; + + // SVE2 Add subtract/subtract pairwise + defm ADDSUBP_ZZZ 
: sve2_int_mul<0b111, "addsubp", null_frag>; + defm SUBP_ZPmZZ : sve2_int_arith_pred<0b100001, "subp", null_frag>; + + // SVE2 integer absolute difference and accumulate long + defm SABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b0, "sabal">; + defm UABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b1, "uabal">; + + // SVE2 integer dot product + def SDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b0, "sdot", ZPR16, ZPR8>; + def UDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b1, "udot", ZPR16, ZPR8>; + + // SVE2 integer indexed dot product + def SDOT_ZZZI_BtoH : sve_intx_dot_by_indexed_elem_x<0b0, "sdot">; + def UDOT_ZZZI_BtoH : sve_intx_dot_by_indexed_elem_x<0b1, "udot">; + + // SVE2 fp convert, narrow and interleave to integer, rounding toward zero + defm FCVTZSN_Z2Z : sve2_fp_to_int_downcvt<"fcvtzsn", 0b0>; + defm FCVTZUN_Z2Z : sve2_fp_to_int_downcvt<"fcvtzun", 0b1>; + + // SVE2 signed/unsigned integer convert to floating-point + defm SCVTF_ZZ : sve2_int_to_fp_upcvt<"scvtf", 0b00>; + defm SCVTFLT_ZZ : sve2_int_to_fp_upcvt<"scvtflt", 0b10>; + defm UCVTF_ZZ : sve2_int_to_fp_upcvt<"ucvtf", 0b01>; + defm UCVTFLT_ZZ : sve2_int_to_fp_upcvt<"ucvtflt", 0b11>; + + // SVE2 saturating shift right narrow by immediate and interleave + defm SQRSHRN_Z2ZI_HtoB : sve_multi_vec_round_shift_narrow<"sqrshrn", 0b101>; + defm SQRSHRUN_Z2ZI_HtoB : sve_multi_vec_round_shift_narrow<"sqrshrun", 0b001>; + defm SQSHRN_Z2ZI_HtoB : sve_multi_vec_round_shift_narrow<"sqshrn", 0b000>; + defm SQSHRUN_Z2ZI_HtoB : sve_multi_vec_round_shift_narrow<"sqshrun", 0b100>; + defm UQRSHRN_Z2ZI_HtoB : sve_multi_vec_round_shift_narrow<"uqrshrn", 0b111>; + defm UQSHRN_Z2ZI_HtoB : sve_multi_vec_round_shift_narrow<"uqshrn", 0b010>; + defm SQSHRUN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqshrun", 0b100, null_frag>; + defm SQSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqshrn", 0b000, null_frag>; + defm UQSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"uqshrn", 0b010, null_frag>; + + defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">; +} // End HasSME2p3orSVE2p3 + +//===----------------------------------------------------------------------===// +// SVE2.3 instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSVE2p3] in { + def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">; +} + +//===----------------------------------------------------------------------===// +// SVE_B16MM Instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSVE_B16MM] in { + def BFMMLA_ZZZ_H : sve_fp_matrix_mla<0b110, "bfmmla", ZPR16, ZPR16>; +} + +//===----------------------------------------------------------------------===// +// F16MM Instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSVE2p2, HasF16MM] in { + def FMMLA_ZZZ_H : sve_fp_matrix_mla<0b100, "fmmla", ZPR16, ZPR16>; +} + //===----------------------------------------------------------------------===// // SME2.2 or SVE2.2 instructions - Legal in streaming mode iff target has SME2p2 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index bdde8e3..2387f17 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -2762,11 +2762,11 @@ def : InstRW<[V2Write_11c_18L01_18V01], (instregex "^ST4[BHWD]_IMM$")>; def : 
InstRW<[V2Write_11c_18L01_18S_18V01], (instregex "^ST4[BHWD]$")>; // Non temporal store, scalar + imm -def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>; +def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BHWD]_ZRI$")>; // Non temporal store, scalar + scalar -def : InstRW<[V2Write_2c_1L01_1S_1V], (instrs STNT1H_ZRR)>; -def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>; +def : InstRW<[V2Write_2c_1L01_1S_1V01], (instrs STNT1H_ZRR)>; +def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BWD]_ZRR$")>; // Scatter non temporal store, vector + scalar 32-bit element size def : InstRW<[V2Write_4c_4L01_4V01], (instregex "^STNT1[BHW]_ZZR_S")>; diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 9438917..ae46d71 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -205,6 +205,7 @@ def lookupDCByName : SearchIndex { let Key = ["Name"]; } +// Op1 CRn CRm Op2 def : DC<"ZVA", 0b011, 0b0111, 0b0100, 0b001>; def : DC<"IVAC", 0b000, 0b0111, 0b0110, 0b001>; def : DC<"ISW", 0b000, 0b0111, 0b0110, 0b010>; @@ -241,6 +242,11 @@ def : DC<"CIGDVAC", 0b011, 0b0111, 0b1110, 0b101>; def : DC<"GZVA", 0b011, 0b0111, 0b0100, 0b100>; } +let Requires = [{ {AArch64::FeatureMTETC} }] in { +def : DC<"ZGBVA", 0b011, 0b0111, 0b0100, 0b101>; +def : DC<"GBVA", 0b011, 0b0111, 0b0100, 0b111>; +} + let Requires = [{ {AArch64::FeatureMEC} }] in { def : DC<"CIPAE", 0b100, 0b0111, 0b1110, 0b000>; def : DC<"CIGDPAE", 0b100, 0b0111, 0b1110, 0b111>; @@ -813,11 +819,26 @@ def : BTI<"j", 0b100>; def : BTI<"jc", 0b110>; //===----------------------------------------------------------------------===// +// CMHPriority instruction options. +//===----------------------------------------------------------------------===// + +class CMHPriorityHint<string name, bits<1> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<1> Encoding; + let Encoding = encoding; +} + +def : CMHPriorityHint<"ph", 0b1>; + +//===----------------------------------------------------------------------===// // TLBI (translation lookaside buffer invalidate) instruction options. 
//===----------------------------------------------------------------------===// class TLBICommon<string name, bits<3> op1, bits<4> crn, bits<4> crm, - bits<3> op2, bit needsreg> { + bits<3> op2, bit needsreg, bit optionalreg> { string Name = name; bits<14> Encoding; let Encoding{13-11} = op1; @@ -825,24 +846,25 @@ class TLBICommon<string name, bits<3> op1, bits<4> crn, bits<4> crm, let Encoding{6-3} = crm; let Encoding{2-0} = op2; bit NeedsReg = needsreg; + bit OptionalReg = optionalreg; list<string> Requires = []; list<string> ExtraRequires = []; code RequiresStr = [{ { }] # !interleave(Requires # ExtraRequires, [{, }]) # [{ } }]; } class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm, - bits<3> op2, bit needsreg> - : TLBICommon<name, op1, crn, crm, op2, needsreg>; + bits<3> op2, bit needsreg, bit optionalreg> + : TLBICommon<name, op1, crn, crm, op2, needsreg, optionalreg>; class TLBIPEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm, - bits<3> op2, bit needsreg> - : TLBICommon<name, op1, crn, crm, op2, needsreg>; + bits<3> op2, bit needsreg, bit optionalreg> + : TLBICommon<name, op1, crn, crm, op2, needsreg, optionalreg>; multiclass TLBITableBase { def NAME # Table : GenericTable { let FilterClass = NAME # "Entry"; let CppTypeName = NAME; - let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; + let Fields = ["Name", "Encoding", "NeedsReg", "OptionalReg", "RequiresStr"]; let PrimaryKey = ["Encoding"]; let PrimaryKeyName = "lookup" # NAME # "ByEncoding"; } @@ -856,60 +878,60 @@ defm TLBI : TLBITableBase; defm TLBIP : TLBITableBase; multiclass TLBI<string name, bit hasTLBIP, bits<3> op1, bits<4> crn, bits<4> crm, - bits<3> op2, bit needsreg = 1> { - def : TLBIEntry<name, op1, crn, crm, op2, needsreg>; - def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> { + bits<3> op2, bit needsreg = 1, bit optionalreg = 0> { + def : TLBIEntry<name, op1, crn, crm, op2, needsreg, optionalreg>; + def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg, optionalreg> { let Encoding{7} = 1; let ExtraRequires = ["AArch64::FeatureXS"]; } if !eq(hasTLBIP, true) then { - def : TLBIPEntry<name, op1, crn, crm, op2, needsreg>; - def : TLBIPEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> { + def : TLBIPEntry<name, op1, crn, crm, op2, needsreg, optionalreg>; + def : TLBIPEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg, optionalreg> { let Encoding{7} = 1; let ExtraRequires = ["AArch64::FeatureXS"]; } } } -// hasTLBIP op1 CRn CRm op2 needsreg +// hasTLBIP op1 CRn CRm op2 needsreg, optreg defm : TLBI<"IPAS2E1IS", 1, 0b100, 0b1000, 0b0000, 0b001>; defm : TLBI<"IPAS2LE1IS", 1, 0b100, 0b1000, 0b0000, 0b101>; -defm : TLBI<"VMALLE1IS", 0, 0b000, 0b1000, 0b0011, 0b000, 0>; -defm : TLBI<"ALLE2IS", 0, 0b100, 0b1000, 0b0011, 0b000, 0>; -defm : TLBI<"ALLE3IS", 0, 0b110, 0b1000, 0b0011, 0b000, 0>; +defm : TLBI<"VMALLE1IS", 0, 0b000, 0b1000, 0b0011, 0b000, 0, 1>; +defm : TLBI<"ALLE2IS", 0, 0b100, 0b1000, 0b0011, 0b000, 0, 1>; +defm : TLBI<"ALLE3IS", 0, 0b110, 0b1000, 0b0011, 0b000, 0, 1>; defm : TLBI<"VAE1IS", 1, 0b000, 0b1000, 0b0011, 0b001>; defm : TLBI<"VAE2IS", 1, 0b100, 0b1000, 0b0011, 0b001>; defm : TLBI<"VAE3IS", 1, 0b110, 0b1000, 0b0011, 0b001>; defm : TLBI<"ASIDE1IS", 0, 0b000, 0b1000, 0b0011, 0b010>; defm : TLBI<"VAAE1IS", 1, 0b000, 0b1000, 0b0011, 0b011>; -defm : TLBI<"ALLE1IS", 0, 0b100, 0b1000, 0b0011, 0b100, 0>; +defm : TLBI<"ALLE1IS", 0, 0b100, 0b1000, 0b0011, 0b100, 0, 1>; defm : TLBI<"VALE1IS", 1, 0b000, 0b1000, 0b0011, 0b101>; 
defm : TLBI<"VALE2IS", 1, 0b100, 0b1000, 0b0011, 0b101>; defm : TLBI<"VALE3IS", 1, 0b110, 0b1000, 0b0011, 0b101>; -defm : TLBI<"VMALLS12E1IS", 0, 0b100, 0b1000, 0b0011, 0b110, 0>; +defm : TLBI<"VMALLS12E1IS", 0, 0b100, 0b1000, 0b0011, 0b110, 0, 1>; defm : TLBI<"VAALE1IS", 1, 0b000, 0b1000, 0b0011, 0b111>; defm : TLBI<"IPAS2E1", 1, 0b100, 0b1000, 0b0100, 0b001>; defm : TLBI<"IPAS2LE1", 1, 0b100, 0b1000, 0b0100, 0b101>; -defm : TLBI<"VMALLE1", 0, 0b000, 0b1000, 0b0111, 0b000, 0>; -defm : TLBI<"ALLE2", 0, 0b100, 0b1000, 0b0111, 0b000, 0>; -defm : TLBI<"ALLE3", 0, 0b110, 0b1000, 0b0111, 0b000, 0>; +defm : TLBI<"VMALLE1", 0, 0b000, 0b1000, 0b0111, 0b000, 0, 0>; +defm : TLBI<"ALLE2", 0, 0b100, 0b1000, 0b0111, 0b000, 0, 0>; +defm : TLBI<"ALLE3", 0, 0b110, 0b1000, 0b0111, 0b000, 0, 0>; defm : TLBI<"VAE1", 1, 0b000, 0b1000, 0b0111, 0b001>; defm : TLBI<"VAE2", 1, 0b100, 0b1000, 0b0111, 0b001>; defm : TLBI<"VAE3", 1, 0b110, 0b1000, 0b0111, 0b001>; defm : TLBI<"ASIDE1", 0, 0b000, 0b1000, 0b0111, 0b010>; defm : TLBI<"VAAE1", 1, 0b000, 0b1000, 0b0111, 0b011>; -defm : TLBI<"ALLE1", 0, 0b100, 0b1000, 0b0111, 0b100, 0>; +defm : TLBI<"ALLE1", 0, 0b100, 0b1000, 0b0111, 0b100, 0, 0>; defm : TLBI<"VALE1", 1, 0b000, 0b1000, 0b0111, 0b101>; defm : TLBI<"VALE2", 1, 0b100, 0b1000, 0b0111, 0b101>; defm : TLBI<"VALE3", 1, 0b110, 0b1000, 0b0111, 0b101>; -defm : TLBI<"VMALLS12E1", 0, 0b100, 0b1000, 0b0111, 0b110, 0>; +defm : TLBI<"VMALLS12E1", 0, 0b100, 0b1000, 0b0111, 0b110, 0, 0>; defm : TLBI<"VAALE1", 1, 0b000, 0b1000, 0b0111, 0b111>; // Armv8.4-A Translation Lookaside Buffer Instructions (TLBI) let Requires = ["AArch64::FeatureTLB_RMI"] in { // Armv8.4-A Outer Sharable TLB Maintenance instructions: -// hasTLBIP op1 CRn CRm op2 needsreg -defm : TLBI<"VMALLE1OS", 0, 0b000, 0b1000, 0b0001, 0b000, 0>; +// hasTLBIP op1 CRn CRm op2 needsreg, optreg +defm : TLBI<"VMALLE1OS", 0, 0b000, 0b1000, 0b0001, 0b000, 0, 1>; defm : TLBI<"VAE1OS", 1, 0b000, 0b1000, 0b0001, 0b001>; defm : TLBI<"ASIDE1OS", 0, 0b000, 0b1000, 0b0001, 0b010>; defm : TLBI<"VAAE1OS", 1, 0b000, 0b1000, 0b0001, 0b011>; @@ -919,15 +941,15 @@ defm : TLBI<"IPAS2E1OS", 1, 0b100, 0b1000, 0b0100, 0b000>; defm : TLBI<"IPAS2LE1OS", 1, 0b100, 0b1000, 0b0100, 0b100>; defm : TLBI<"VAE2OS", 1, 0b100, 0b1000, 0b0001, 0b001>; defm : TLBI<"VALE2OS", 1, 0b100, 0b1000, 0b0001, 0b101>; -defm : TLBI<"VMALLS12E1OS", 0, 0b100, 0b1000, 0b0001, 0b110, 0>; +defm : TLBI<"VMALLS12E1OS", 0, 0b100, 0b1000, 0b0001, 0b110, 0, 1>; defm : TLBI<"VAE3OS", 1, 0b110, 0b1000, 0b0001, 0b001>; defm : TLBI<"VALE3OS", 1, 0b110, 0b1000, 0b0001, 0b101>; -defm : TLBI<"ALLE2OS", 0, 0b100, 0b1000, 0b0001, 0b000, 0>; -defm : TLBI<"ALLE1OS", 0, 0b100, 0b1000, 0b0001, 0b100, 0>; -defm : TLBI<"ALLE3OS", 0, 0b110, 0b1000, 0b0001, 0b000, 0>; +defm : TLBI<"ALLE2OS", 0, 0b100, 0b1000, 0b0001, 0b000, 0, 1>; +defm : TLBI<"ALLE1OS", 0, 0b100, 0b1000, 0b0001, 0b100, 0, 1>; +defm : TLBI<"ALLE3OS", 0, 0b110, 0b1000, 0b0001, 0b000, 0, 1>; // Armv8.4-A TLB Range Maintenance instructions: -// hasTLBIP op1 CRn CRm op2 needsreg +// hasTLBIP op1 CRn CRm op2 defm : TLBI<"RVAE1", 1, 0b000, 0b1000, 0b0110, 0b001>; defm : TLBI<"RVAAE1", 1, 0b000, 0b1000, 0b0110, 0b011>; defm : TLBI<"RVALE1", 1, 0b000, 0b1000, 0b0110, 0b101>; @@ -962,18 +984,19 @@ defm : TLBI<"RVALE3OS", 1, 0b110, 0b1000, 0b0101, 0b101>; // Armv9-A Realm Management Extension TLBI Instructions let Requires = ["AArch64::FeatureRME"] in { +// hasTLBIP op1 CRn CRm op2 needsreg defm : TLBI<"RPAOS", 0, 0b110, 0b1000, 0b0100, 0b011>; defm : TLBI<"RPALOS", 0, 0b110, 
0b1000, 0b0100, 0b111>; -defm : TLBI<"PAALLOS", 0, 0b110, 0b1000, 0b0001, 0b100, 0>; -defm : TLBI<"PAALL", 0, 0b110, 0b1000, 0b0111, 0b100, 0>; +defm : TLBI<"PAALLOS", 0, 0b110, 0b1000, 0b0001, 0b100, 0, 0>; +defm : TLBI<"PAALL", 0, 0b110, 0b1000, 0b0111, 0b100, 0, 0>; } // Armv9.5-A TLBI VMALL for Dirty State let Requires = ["AArch64::FeatureTLBIW"] in { -// op1, CRn, CRm, op2, needsreg -defm : TLBI<"VMALLWS2E1", 0, 0b100, 0b1000, 0b0110, 0b010, 0>; -defm : TLBI<"VMALLWS2E1IS", 0, 0b100, 0b1000, 0b0010, 0b010, 0>; -defm : TLBI<"VMALLWS2E1OS", 0, 0b100, 0b1000, 0b0101, 0b010, 0>; +// hasTLBIP op1 CRn CRm op2 needsreg, optreg +defm : TLBI<"VMALLWS2E1", 0, 0b100, 0b1000, 0b0110, 0b010, 0, 0>; +defm : TLBI<"VMALLWS2E1IS", 0, 0b100, 0b1000, 0b0010, 0b010, 0, 1>; +defm : TLBI<"VMALLWS2E1OS", 0, 0b100, 0b1000, 0b0101, 0b010, 0, 1>; } //===----------------------------------------------------------------------===// @@ -1862,13 +1885,6 @@ def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>; // v8.4a MPAM registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::FeatureMPAM} }] in { -def : RWSysReg<"MPAM0_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b001>; -def : RWSysReg<"MPAM1_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b000>; -def : RWSysReg<"MPAM2_EL2", 0b11, 0b100, 0b1010, 0b0101, 0b000>; -def : RWSysReg<"MPAM3_EL3", 0b11, 0b110, 0b1010, 0b0101, 0b000>; -def : RWSysReg<"MPAM1_EL12", 0b11, 0b101, 0b1010, 0b0101, 0b000>; -def : RWSysReg<"MPAMHCR_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b000>; def : RWSysReg<"MPAMVPMV_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b001>; def : RWSysReg<"MPAMVPM0_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b000>; def : RWSysReg<"MPAMVPM1_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b001>; @@ -1878,8 +1894,6 @@ def : RWSysReg<"MPAMVPM4_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b100>; def : RWSysReg<"MPAMVPM5_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b101>; def : RWSysReg<"MPAMVPM6_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b110>; def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>; -def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>; -} //FeatureMPAM // v8.4a Activity Monitor registers // Op0 Op1 CRn CRm Op2 @@ -2319,6 +2333,26 @@ def : RWSysReg<"MPAMBW0_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b101>; def : RWSysReg<"MPAMBWCAP_EL2", 0b11, 0b100, 0b1010, 0b0101, 0b110>; def : RWSysReg<"MPAMBWSM_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b111>; +// v9.7a Memory partitioning and monitoring version 2 +// (FEAT_MPAMv2) registers +// Op0 Op1 CRn CRm Op2 +// MPAM system registers that are also available for MPAMv2 +def : RWSysReg<"MPAM0_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b001>; +def : RWSysReg<"MPAM1_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b000>; +def : RWSysReg<"MPAM1_EL12", 0b11, 0b101, 0b1010, 0b0101, 0b000>; +def : RWSysReg<"MPAM2_EL2", 0b11, 0b100, 0b1010, 0b0101, 0b000>; +def : RWSysReg<"MPAM3_EL3", 0b11, 0b110, 0b1010, 0b0101, 0b000>; +def : RWSysReg<"MPAMHCR_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b000>; +def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>; +// Only MPAMv2 registers +def : RWSysReg<"MPAMCTL_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b010>; +def : RWSysReg<"MPAMCTL_EL12", 0b11, 0b101, 0b1010, 0b0101, 0b010>; +def : RWSysReg<"MPAMCTL_EL2", 0b11, 0b100, 0b1010, 0b0101, 0b010>; +def : RWSysReg<"MPAMCTL_EL3", 0b11, 0b110, 0b1010, 0b0101, 0b010>; +def : RWSysReg<"MPAMVIDCR_EL2", 0b11, 0b100, 0b1010, 0b0111, 0b000>; +def : RWSysReg<"MPAMVIDSR_EL2", 0b11, 0b100, 0b1010, 0b0111, 0b001>; +def : RWSysReg<"MPAMVIDSR_EL3", 0b11, 0b110, 0b1010, 0b0111, 0b001>; + 
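// Reviewer note, not part of the patch: the MPAMv2 definitions above only list the
// (Op0, Op1, CRn, CRm, Op2) operand tuples; the 16-bit MRS/MSR encoding is assembled by
// the SysReg TableGen class in this same file (bits 15-14 = Op0, 13-11 = Op1, 10-7 = CRn,
// 6-3 = CRm, 2-0 = Op2). A minimal, hedged cross-check sketch follows; packSysReg is a
// hypothetical helper written for illustration only, not an LLVM API.

#include <cstdint>

constexpr uint16_t packSysReg(uint16_t Op0, uint16_t Op1, uint16_t CRn,
                              uint16_t CRm, uint16_t Op2) {
  // Same bit layout as the TableGen SysReg Encoding field.
  return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
}

// MPAM2_EL2 from the block above: Op0=0b11 Op1=0b100 CRn=0b1010 CRm=0b0101 Op2=0b000,
// i.e. the generic register name S3_4_C10_C5_0.
static_assert(packSysReg(0b11, 0b100, 0b1010, 0b0101, 0b000) == 0xE528,
              "MPAM2_EL2 encodes as S3_4_C10_C5_0");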
//===----------------------------------------------------------------------===// // FEAT_SRMASK v9.6a registers //===----------------------------------------------------------------------===// @@ -2412,3 +2446,251 @@ def : DC<"CIVAPS", 0b000, 0b0111, 0b1111, 0b001>; let Requires = [{ {AArch64::FeaturePoPS, AArch64::FeatureMTE} }] in { def : DC<"CIGDVAPS", 0b000, 0b0111, 0b1111, 0b101>; } + +// v9.7a TLBI domains system registers (MemSys) +foreach n = 0-3 in { + defvar nb = !cast<bits<3>>(n); + def : RWSysReg<"VTLBID"#n#"_EL2", 0b11, 0b100, 0b0010, 0b1000, nb>; +} + +foreach n = 0-3 in { + defvar nb = !cast<bits<3>>(n); + def : RWSysReg<"VTLBIDOS"#n#"_EL2", 0b11, 0b100, 0b0010, 0b1001, nb>; +} + +def : ROSysReg<"TLBIDIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b110>; + +// MPAM Lookaside Buffer Invalidate (MLBI) instructions +class MLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, bit needsreg> { + string Name = name; + bits<14> Encoding; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = needsreg; + string RequiresStr = [{ {AArch64::FeatureMPAMv2} }]; +} + +def MLBITable : GenericTable { + let FilterClass = "MLBI"; + let CppTypeName = "MLBI"; + let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupMLBIByEncoding"; +} + +def lookupMLBIByName : SearchIndex { + let Table = MLBITable; + let Key = ["Name"]; +} + +// Op1 CRn CRm Op2 needsReg +def : MLBI<"ALLE1", 0b100, 0b0111, 0b0000, 0b100, 0>; +def : MLBI<"VMALLE1", 0b100, 0b0111, 0b0000, 0b101, 0>; +def : MLBI<"VPIDE1", 0b100, 0b0111, 0b0000, 0b110, 1>; +def : MLBI<"VPMGE1", 0b100, 0b0111, 0b0000, 0b111, 1>; + + +// v9.7-A GICv5 (FEAT_GCIE) +// CPU Interface Registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"ICC_APR_EL1", 0b11, 0b001, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"ICC_APR_EL3", 0b11, 0b110, 0b1100, 0b1000, 0b000>; +def : RWSysReg<"ICC_CR0_EL1", 0b11, 0b001, 0b1100, 0b0000, 0b001>; +def : RWSysReg<"ICC_CR0_EL3", 0b11, 0b110, 0b1100, 0b1001, 0b000>; +def : ROSysReg<"ICC_DOMHPPIR_EL3", 0b11, 0b110, 0b1100, 0b1000, 0b010>; +def : ROSysReg<"ICC_HAPR_EL1", 0b11, 0b001, 0b1100, 0b0000, 0b011>; +def : ROSysReg<"ICC_HPPIR_EL1", 0b11, 0b000, 0b1100, 0b1010, 0b011>; +def : ROSysReg<"ICC_HPPIR_EL3", 0b11, 0b110, 0b1100, 0b1001, 0b001>; +def : ROSysReg<"ICC_IAFFIDR_EL1", 0b11, 0b000, 0b1100, 0b1010, 0b101>; +def : RWSysReg<"ICC_ICSR_EL1", 0b11, 0b000, 0b1100, 0b1010, 0b100>; +def : ROSysReg<"ICC_IDR0_EL1", 0b11, 0b000, 0b1100, 0b1010, 0b010>; +def : RWSysReg<"ICC_PCR_EL1", 0b11, 0b001, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"ICC_PCR_EL3", 0b11, 0b110, 0b1100, 0b1000, 0b001>; + +// Virtual CPU Interface Registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"ICV_APR_EL1", 0b11, 0b001, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"ICV_CR0_EL1", 0b11, 0b001, 0b1100, 0b0000, 0b001>; +def : RWSysReg<"ICV_HAPR_EL1", 0b11, 0b001, 0b1100, 0b0000, 0b011>; +def : RWSysReg<"ICV_HPPIR_EL1", 0b11, 0b000, 0b1100, 0b1010, 0b011>; +def : RWSysReg<"ICV_PCR_EL1", 0b11, 0b001, 0b1100, 0b0000, 0b010>; + +foreach n=0-3 in { + defvar nb = !cast<bits<2>>(n); +// Op0 Op1 CRn CRm Op2 + def : RWSysReg<"ICC_PPI_DOMAINR"#n#"_EL3", 0b11, 0b110, 0b1100, 0b1000, {0b1,nb{1-0}}>; + +} + +foreach n=0-15 in{ + defvar nb = !cast<bits<4>>(n); +// Op0 Op1 CRn CRm Op2 + def : RWSysReg<"ICC_PPI_PRIORITYR"#n#"_EL1", 0b11, 0b000, 0b1100, {0b111,nb{3}}, nb{2-0}>; +} + +// PPI and Virtual PPI Registers +multiclass PPIRegisters<string 
prefix> { + foreach n=0-1 in { + defvar nb = !cast<bit>(n); +// Op0 Op1 CRn CRm Op2 + def : RWSysReg<prefix#"_PPI_CACTIVER"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1101, {0b00,nb}>; + def : RWSysReg<prefix#"_PPI_CPENDR"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1101, {0b10,nb}>; + def : RWSysReg<prefix#"_PPI_ENABLER"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1010, {0b11,nb}>; + def : RWSysReg<prefix#"_PPI_SACTIVER"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1101, {0b01,nb}>; + def : RWSysReg<prefix#"_PPI_SPENDR"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1101, {0b11,nb}>; + def : RWSysReg<prefix#"_PPI_HMR"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1010, {0b00,nb}>; + } +} + +defm : PPIRegisters<"ICC">; // PPI Registers +defm : PPIRegisters<"ICV">; // Virtual PPI Registers + +foreach n=0-15 in { + defvar nb = !cast<bits<4>>(n); +// Op0 Op1 CRn CRm Op2 + def : RWSysReg<"ICV_PPI_PRIORITYR"#n#"_EL1", 0b11, 0b000, 0b1100, {0b111,nb{3}}, nb{2-0}>; +} + +// Hypervisor Control Registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"ICH_APR_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b100>; +def : RWSysReg<"ICH_CONTEXTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b110>; +def : RWSysReg<"ICH_HFGITR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b111>; +def : RWSysReg<"ICH_HFGRTR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>; +def : RWSysReg<"ICH_HFGWTR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b110>; +def : ROSysReg<"ICH_HPPIR_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b101>; +def : RWSysReg<"ICH_VCTLR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b100>; + +foreach n=0-1 in { + defvar nb = !cast<bit>(n); +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"ICH_PPI_ACTIVER"#n#"_EL2", 0b11, 0b100, 0b1100, 0b1010, {0b11,nb}>; +def : RWSysReg<"ICH_PPI_DVIR"#n#"_EL2", 0b11, 0b100, 0b1100, 0b1010, {0b00,nb}>; +def : RWSysReg<"ICH_PPI_ENABLER"#n#"_EL2", 0b11, 0b100, 0b1100, 0b1010, {0b01,nb}>; +def : RWSysReg<"ICH_PPI_PENDR"#n#"_EL2", 0b11, 0b100, 0b1100, 0b1010, {0b10,nb}>; +} + +foreach n=0-15 in { + defvar nb = !cast<bits<4>>(n); +// Op0 Op1 CRn CRm Op2 + def : RWSysReg<"ICH_PPI_PRIORITYR"#n#"_EL2", 0b11, 0b100, 0b1100, {0b111,nb{3}}, nb{2-0}>; +} + +//===----------------------------------------------------------------------===// +// GICv5 instruction options. 
+//===----------------------------------------------------------------------===// + +// GIC +class GIC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> { + string Name = name; + bits<14> Encoding; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = 1; + string RequiresStr = [{ {AArch64::FeatureGCIE} }]; +} + +// GSB +class GSB<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> { + string Name = name; + bits<14> Encoding; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + string RequiresStr = [{ {AArch64::FeatureGCIE} }]; +} + +// GICR +class GICR<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> { + string Name = name; + bits<14> Encoding; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = 1; + string RequiresStr = [{ {AArch64::FeatureGCIE} }]; +} + +def GICTable : GenericTable { + let FilterClass = "GIC"; + let CppTypeName = "GIC"; + let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupGICByEncoding"; +} + +def GSBTable : GenericTable { + let FilterClass = "GSB"; + let CppTypeName = "GSB"; + let Fields = ["Name", "Encoding", "RequiresStr"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupGSBByEncoding"; +} + +def GICRTable : GenericTable { + let FilterClass = "GICR"; + let CppTypeName = "GICR"; + let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; + + let PrimaryKey = ["Encoding"]; + let PrimaryKeyName = "lookupGICRByEncoding"; +} + +def lookupGICByName : SearchIndex { + let Table = GICTable; + let Key = ["Name"]; +} + +def lookupGSBByName : SearchIndex { + let Table = GSBTable; + let Key = ["Name"]; +} + +def lookupGICRByName : SearchIndex { + let Table = GICRTable; + let Key = ["Name"]; +} + +// Op1 CRn CRm Op2 +def : GSB<"sys", 0b000, 0b1100, 0b0000, 0b000>; +def : GSB<"ack", 0b000, 0b1100, 0b0000, 0b001>; + +// Op1 CRn CRm Op2 +def : GICR<"cdia", 0b000, 0b1100, 0b0011, 0b000>; +def : GICR<"cdnmia", 0b000, 0b1100, 0b0011, 0b001>; + +// Op1 CRn CRm Op2 +def : GIC<"cdaff", 0b000, 0b1100, 0b0001, 0b011>; +def : GIC<"cddi", 0b000, 0b1100, 0b0010, 0b000>; +def : GIC<"cddis", 0b000, 0b1100, 0b0001, 0b000>; +def : GIC<"cden", 0b000, 0b1100, 0b0001, 0b001>; +def : GIC<"cdeoi", 0b000, 0b1100, 0b0001, 0b111>; +def : GIC<"cdhm", 0b000, 0b1100, 0b0010, 0b001>; +def : GIC<"cdpend", 0b000, 0b1100, 0b0001, 0b100>; +def : GIC<"cdpri", 0b000, 0b1100, 0b0001, 0b010>; +def : GIC<"cdrcfg", 0b000, 0b1100, 0b0001, 0b101>; +def : GIC<"vdaff", 0b100, 0b1100, 0b0001, 0b011>; +def : GIC<"vddi", 0b100, 0b1100, 0b0010, 0b000>; +def : GIC<"vddis", 0b100, 0b1100, 0b0001, 0b000>; +def : GIC<"vden", 0b100, 0b1100, 0b0001, 0b001>; +def : GIC<"vdhm", 0b100, 0b1100, 0b0010, 0b001>; +def : GIC<"vdpend", 0b100, 0b1100, 0b0001, 0b100>; +def : GIC<"vdpri", 0b100, 0b1100, 0b0001, 0b010>; +def : GIC<"vdrcfg", 0b100, 0b1100, 0b0001, 0b101>; +def : GIC<"ldaff", 0b110, 0b1100, 0b0001, 0b011>; +def : GIC<"lddi", 0b110, 0b1100, 0b0010, 0b000>; +def : GIC<"lddis", 0b110, 0b1100, 0b0001, 0b000>; +def : GIC<"lden", 0b110, 0b1100, 0b0001, 0b001>; +def : GIC<"ldhm", 0b110, 0b1100, 0b0010, 0b001>; +def : GIC<"ldpend", 0b110, 0b1100, 0b0001, 0b100>; +def : GIC<"ldpri", 0b110, 0b1100, 0b0001, 0b010>; +def : GIC<"ldrcfg", 0b110, 0b1100, 0b0001, 0b101>; diff --git 
a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 2053fc4..fede586 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -224,7 +224,8 @@ static cl::opt<bool> EnableScalableAutovecInStreamingMode( static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI) { const auto *F = CI.getCalledFunction(); - return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine(); + return F && + SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine(); } /// Returns true if the function has explicit operations that can only be @@ -355,7 +356,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, // change only once and avoid inlining of G into F. SMEAttrs FAttrs(*F); - SMECallAttrs CallAttrs(Call, getTLI()); + SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo()); if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) { if (F == Call.getCaller()) // (1) @@ -957,23 +958,50 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return TyL.first + ExtraCost; } case Intrinsic::get_active_lane_mask: { - auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType()); - if (RetTy) { - EVT RetVT = getTLI()->getValueType(DL, RetTy); - EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); - if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) && - !getTLI()->isTypeLegal(RetVT)) { - // We don't have enough context at this point to determine if the mask - // is going to be kept live after the block, which will force the vXi1 - // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. - // For now, we just assume the vectorizer created this intrinsic and - // the result will be the input for a PHI. In this case the cost will - // be extremely high for fixed-width vectors. - // NOTE: getScalarizationOverhead returns a cost that's far too - // pessimistic for the actual generated codegen. In reality there are - // two instructions generated per lane. - return RetTy->getNumElements() * 2; + auto RetTy = cast<VectorType>(ICA.getReturnType()); + EVT RetVT = getTLI()->getValueType(DL, RetTy); + EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); + if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT)) + break; + + if (RetTy->isScalableTy()) { + if (TLI->getTypeAction(RetTy->getContext(), RetVT) != + TargetLowering::TypeSplitVector) + break; + + auto LT = getTypeLegalizationCost(RetTy); + InstructionCost Cost = LT.first; + // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost + // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g. + // nxv32i1 = get_active_lane_mask(base, idx) -> + // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx) + if (ST->hasSVE2p1() || ST->hasSME2()) { + Cost /= 2; + if (Cost == 1) + return Cost; } + + // If more than one whilelo intrinsic is required, include the extra cost + // required by the saturating add & select required to increment the + // start value after the first intrinsic call. 
+ Type *OpTy = ICA.getArgTypes()[0]; + IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy}); + InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind); + Type *CondTy = OpTy->getWithNewBitWidth(1); + SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy, + CmpInst::ICMP_UGT, CostKind); + return Cost + (SplitCost * (Cost - 1)); + } else if (!getTLI()->isTypeLegal(RetVT)) { + // We don't have enough context at this point to determine if the mask + // is going to be kept live after the block, which will force the vXi1 + // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. + // For now, we just assume the vectorizer created this intrinsic and + // the result will be the input for a PHI. In this case the cost will + // be extremely high for fixed-width vectors. + // NOTE: getScalarizationOverhead returns a cost that's far too + // pessimistic for the actual generated codegen. In reality there are + // two instructions generated per lane. + return cast<FixedVectorType>(RetTy)->getNumElements() * 2; } break; } diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 636d4f8a..6273cfc 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -159,6 +159,7 @@ private: SMLoc getLoc() const { return getParser().getTok().getLoc(); } bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); + bool parseSyslAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); bool parseSyspAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S); AArch64CC::CondCode parseCondCodeString(StringRef Cond, @@ -266,6 +267,7 @@ private: ParseStatus tryParseRPRFMOperand(OperandVector &Operands); ParseStatus tryParsePSBHint(OperandVector &Operands); ParseStatus tryParseBTIHint(OperandVector &Operands); + ParseStatus tryParseCMHPriorityHint(OperandVector &Operands); ParseStatus tryParseAdrpLabel(OperandVector &Operands); ParseStatus tryParseAdrLabel(OperandVector &Operands); template <bool AddFPZeroAsLiteral> @@ -370,6 +372,7 @@ private: k_PSBHint, k_PHint, k_BTIHint, + k_CMHPriorityHint, } Kind; SMLoc StartLoc, EndLoc; @@ -499,6 +502,11 @@ private: unsigned Length; unsigned Val; }; + struct CMHPriorityHintOp { + const char *Data; + unsigned Length; + unsigned Val; + }; struct SVCROp { const char *Data; @@ -525,6 +533,7 @@ private: struct PSBHintOp PSBHint; struct PHintOp PHint; struct BTIHintOp BTIHint; + struct CMHPriorityHintOp CMHPriorityHint; struct ShiftExtendOp ShiftExtend; struct SVCROp SVCR; }; @@ -595,6 +604,9 @@ public: case k_BTIHint: BTIHint = o.BTIHint; break; + case k_CMHPriorityHint: + CMHPriorityHint = o.CMHPriorityHint; + break; case k_ShiftExtend: ShiftExtend = o.ShiftExtend; break; @@ -769,6 +781,16 @@ public: return StringRef(BTIHint.Data, BTIHint.Length); } + unsigned getCMHPriorityHint() const { + assert(Kind == k_CMHPriorityHint && "Invalid access!"); + return CMHPriorityHint.Val; + } + + StringRef getCMHPriorityHintName() const { + assert(Kind == k_CMHPriorityHint && "Invalid access!"); + return StringRef(CMHPriorityHint.Data, CMHPriorityHint.Length); + } + StringRef getSVCR() const { assert(Kind == k_SVCR && "Invalid access!"); return StringRef(SVCR.Data, SVCR.Length); @@ -1511,6 +1533,7 @@ public: bool isPSBHint() const { return Kind == k_PSBHint; } bool isPHint() const { return Kind == 
k_PHint; } bool isBTIHint() const { return Kind == k_BTIHint; } + bool isCMHPriorityHint() const { return Kind == k_CMHPriorityHint; } bool isShiftExtend() const { return Kind == k_ShiftExtend; } bool isShifter() const { if (!isShiftExtend()) @@ -2196,6 +2219,11 @@ public: Inst.addOperand(MCOperand::createImm(getBTIHint())); } + void addCMHPriorityHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getCMHPriorityHint())); + } + void addShifterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); unsigned Imm = @@ -2547,6 +2575,17 @@ public: } static std::unique_ptr<AArch64Operand> + CreateCMHPriorityHint(unsigned Val, StringRef Str, SMLoc S, MCContext &Ctx) { + auto Op = std::make_unique<AArch64Operand>(k_CMHPriorityHint, Ctx); + Op->CMHPriorityHint.Val = Val; + Op->CMHPriorityHint.Data = Str.data(); + Op->CMHPriorityHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreateMatrixRegister(unsigned RegNum, unsigned ElementWidth, MatrixKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique<AArch64Operand>(k_MatrixRegister, Ctx); @@ -2656,6 +2695,9 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { case k_BTIHint: OS << getBTIHintName(); break; + case k_CMHPriorityHint: + OS << getCMHPriorityHintName(); + break; case k_MatrixRegister: OS << "<matrix " << getMatrixReg() << ">"; break; @@ -3279,6 +3321,24 @@ ParseStatus AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) { return ParseStatus::Success; } +/// tryParseCMHPriorityHint - Try to parse a CMHPriority operand +ParseStatus AArch64AsmParser::tryParseCMHPriorityHint(OperandVector &Operands) { + SMLoc S = getLoc(); + const AsmToken &Tok = getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return TokError("invalid operand for instruction"); + + auto CMHPriority = + AArch64CMHPriorityHint::lookupCMHPriorityHintByName(Tok.getString()); + if (!CMHPriority) + return TokError("invalid operand for instruction"); + + Operands.push_back(AArch64Operand::CreateCMHPriorityHint( + CMHPriority->Encoding, Tok.getString(), S, getContext())); + Lex(); // Eat identifier token. + return ParseStatus::Success; +} + /// tryParseAdrpLabel - Parse and validate a source label for the ADRP /// instruction. 
ParseStatus AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { @@ -3824,6 +3884,18 @@ static const struct Extension { {"ssve-bitperm", {AArch64::FeatureSSVE_BitPerm}}, {"sme-mop4", {AArch64::FeatureSME_MOP4}}, {"sme-tmop", {AArch64::FeatureSME_TMOP}}, + {"cmh", {AArch64::FeatureCMH}}, + {"lscp", {AArch64::FeatureLSCP}}, + {"tlbid", {AArch64::FeatureTLBID}}, + {"mpamv2", {AArch64::FeatureMPAMv2}}, + {"mtetc", {AArch64::FeatureMTETC}}, + {"gcie", {AArch64::FeatureGCIE}}, + {"sme2p3", {AArch64::FeatureSME2p3}}, + {"sve2p3", {AArch64::FeatureSVE2p3}}, + {"sve-b16mm", {AArch64::FeatureSVE_B16MM}}, + {"f16mm", {AArch64::FeatureF16MM}}, + {"f16f32dot", {AArch64::FeatureF16F32DOT}}, + {"f16f32mm", {AArch64::FeatureF16F32MM}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { @@ -3861,6 +3933,8 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { Str += "ARMv9.5a"; else if (FBS[AArch64::HasV9_6aOps]) Str += "ARMv9.6a"; + else if (FBS[AArch64::HasV9_7aOps]) + Str += "ARMv9.7a"; else if (FBS[AArch64::HasV8_0rOps]) Str += "ARMv8r"; else { @@ -3894,8 +3968,9 @@ void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); } -/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for -/// the SYS instruction. Parse them specially so that we create a SYS MCInst. +/// parseSysAlias - The IC, DC, AT, TLBI, MLBI and GIC{R} and GSB instructions +/// are simple aliases for the SYS instruction. Parse them specially so that +/// we create a SYS MCInst. bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands) { if (Name.contains('.')) @@ -3908,6 +3983,8 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, StringRef Op = Tok.getString(); SMLoc S = Tok.getLoc(); bool ExpectRegister = true; + bool OptionalRegister = false; + bool hasAll = getSTI().hasFeature(AArch64::FeatureAll); if (Mnemonic == "ic") { const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op); @@ -3950,13 +4027,50 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, return TokError(Str); } ExpectRegister = TLBI->NeedsReg; + bool hasTLBID = getSTI().hasFeature(AArch64::FeatureTLBID); + if (hasAll || hasTLBID) { + OptionalRegister = TLBI->OptionalReg; + } createSysAlias(TLBI->Encoding, Operands, S); - } else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp" || Mnemonic == "cosp") { + } else if (Mnemonic == "mlbi") { + const AArch64MLBI::MLBI *MLBI = AArch64MLBI::lookupMLBIByName(Op); + if (!MLBI) + return TokError("invalid operand for MLBI instruction"); + else if (!MLBI->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("MLBI " + std::string(MLBI->Name) + " requires: "); + setRequiredFeatureString(MLBI->getRequiredFeatures(), Str); + return TokError(Str); + } + ExpectRegister = MLBI->NeedsReg; + createSysAlias(MLBI->Encoding, Operands, S); + } else if (Mnemonic == "gic") { + const AArch64GIC::GIC *GIC = AArch64GIC::lookupGICByName(Op); + if (!GIC) + return TokError("invalid operand for GIC instruction"); + else if (!GIC->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("GIC " + std::string(GIC->Name) + " requires: "); + setRequiredFeatureString(GIC->getRequiredFeatures(), Str); + return TokError(Str); + } + ExpectRegister = true; + createSysAlias(GIC->Encoding, Operands, S); + } else if (Mnemonic == "gsb") { + const AArch64GSB::GSB *GSB = AArch64GSB::lookupGSBByName(Op); + if 
(!GSB) + return TokError("invalid operand for GSB instruction"); + else if (!GSB->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("GSB " + std::string(GSB->Name) + " requires: "); + setRequiredFeatureString(GSB->getRequiredFeatures(), Str); + return TokError(Str); + } + ExpectRegister = false; + createSysAlias(GSB->Encoding, Operands, S); + } else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp" || + Mnemonic == "cosp") { if (Op.lower() != "rctx") return TokError("invalid operand for prediction restriction instruction"); - bool hasAll = getSTI().hasFeature(AArch64::FeatureAll); bool hasPredres = hasAll || getSTI().hasFeature(AArch64::FeaturePredRes); bool hasSpecres2 = hasAll || getSTI().hasFeature(AArch64::FeatureSPECRES2); @@ -3989,10 +4103,61 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, HasRegister = true; } - if (ExpectRegister && !HasRegister) - return TokError("specified " + Mnemonic + " op requires a register"); - else if (!ExpectRegister && HasRegister) - return TokError("specified " + Mnemonic + " op does not use a register"); + if (!OptionalRegister) { + if (ExpectRegister && !HasRegister) + return TokError("specified " + Mnemonic + " op requires a register"); + else if (!ExpectRegister && HasRegister) + return TokError("specified " + Mnemonic + " op does not use a register"); + } + + if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list")) + return true; + + return false; +} + +/// parseSyslAlias - The GICR instructions are simple aliases for +/// the SYSL instruction. Parse them specially so that we create a +/// SYS MCInst. +bool AArch64AsmParser::parseSyslAlias(StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + + Mnemonic = Name; + Operands.push_back( + AArch64Operand::CreateToken("sysl", NameLoc, getContext())); + + // Now expect two operands (identifier + register) + SMLoc startLoc = getLoc(); + const AsmToken ®Tok = getTok(); + StringRef reg = regTok.getString(); + unsigned RegNum = matchRegisterNameAlias(reg.lower(), RegKind::Scalar); + if (!RegNum) + return TokError("expected register operand"); + + Operands.push_back(AArch64Operand::CreateReg( + RegNum, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg)); + + Lex(); // Eat token + if (parseToken(AsmToken::Comma)) + return true; + + // Check for identifier + const AsmToken &operandTok = getTok(); + StringRef Op = operandTok.getString(); + SMLoc S2 = operandTok.getLoc(); + Lex(); // Eat token + + if (Mnemonic == "gicr") { + const AArch64GICR::GICR *GICR = AArch64GICR::lookupGICRByName(Op); + if (!GICR) + return Error(S2, "invalid operand for GICR instruction"); + else if (!GICR->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("GICR " + std::string(GICR->Name) + " requires: "); + setRequiredFeatureString(GICR->getRequiredFeatures(), Str); + return Error(S2, Str); + } + createSysAlias(GICR->Encoding, Operands, S2); + } if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list")) return true; @@ -4025,7 +4190,7 @@ bool AArch64AsmParser::parseSyspAlias(StringRef Name, SMLoc NameLoc, return TokError("invalid operand for TLBIP instruction"); const AArch64TLBIP::TLBIP TLBIP( TLBIPorig->Name, TLBIPorig->Encoding | (HasnXSQualifier ? (1 << 7) : 0), - TLBIPorig->NeedsReg, + TLBIPorig->NeedsReg, TLBIPorig->OptionalReg, HasnXSQualifier ? 
TLBIPorig->FeaturesRequired | FeatureBitset({AArch64::FeatureXS}) : TLBIPorig->FeaturesRequired); @@ -4719,6 +4884,13 @@ ParseStatus AArch64AsmParser::tryParseVectorList(OperandVector &Operands, FirstReg, Count, Stride, NumElements, ElementWidth, VectorKind, S, getLoc(), getContext())); + if (getTok().is(AsmToken::LBrac)) { + ParseStatus Res = tryParseVectorIndex(Operands); + if (Res.isFailure()) + return ParseStatus::Failure; + return ParseStatus::Success; + } + return ParseStatus::Success; } @@ -5267,12 +5439,17 @@ bool AArch64AsmParser::parseInstruction(ParseInstructionInfo &Info, size_t Start = 0, Next = Name.find('.'); StringRef Head = Name.slice(Start, Next); - // IC, DC, AT, TLBI and Prediction invalidation instructions are aliases for - // the SYS instruction. + // IC, DC, AT, TLBI, MLBI, GIC{R}, GSB and Prediction invalidation + // instructions are aliases for the SYS instruction. if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi" || - Head == "cfp" || Head == "dvp" || Head == "cpp" || Head == "cosp") + Head == "cfp" || Head == "dvp" || Head == "cpp" || Head == "cosp" || + Head == "mlbi" || Head == "gic" || Head == "gsb") return parseSysAlias(Head, NameLoc, Operands); + // GICR instructions are aliases for the SYSL instruction. + if (Head == "gicr") + return parseSyslAlias(Head, NameLoc, Operands); + // TLBIP instructions are aliases for the SYSP instruction. if (Head == "tlbip") return parseSyspAlias(Head, NameLoc, Operands); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 3e55b76..14b0f9a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -5126,23 +5126,13 @@ bool AArch64InstructionSelector::selectShuffleVector( MachineInstr &I, MachineRegisterInfo &MRI) { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); Register Src1Reg = I.getOperand(1).getReg(); - const LLT Src1Ty = MRI.getType(Src1Reg); Register Src2Reg = I.getOperand(2).getReg(); - const LLT Src2Ty = MRI.getType(Src2Reg); ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); LLVMContext &Ctx = MF.getFunction().getContext(); - // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if - // it's originated from a <1 x T> type. Those should have been lowered into - // G_BUILD_VECTOR earlier. - if (!Src1Ty.isVector() || !Src2Ty.isVector()) { - LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); - return false; - } - unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; SmallVector<Constant *, 64> CstIdxs; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 05a4313..5f93847 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1201,25 +1201,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return llvm::is_contained( {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64}, DstTy); }) - // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar - // destinations, we just want those lowered into G_BUILD_VECTOR or - // G_EXTRACT_ELEMENT. 
- .lowerIf([=](const LegalityQuery &Query) { - return !Query.Types[0].isVector() || !Query.Types[1].isVector(); - }) .moreElementsIf( [](const LegalityQuery &Query) { - return Query.Types[0].isVector() && Query.Types[1].isVector() && - Query.Types[0].getNumElements() > - Query.Types[1].getNumElements(); + return Query.Types[0].getNumElements() > + Query.Types[1].getNumElements(); }, changeTo(1, 0)) .moreElementsToNextPow2(0) .moreElementsIf( [](const LegalityQuery &Query) { - return Query.Types[0].isVector() && Query.Types[1].isVector() && - Query.Types[0].getNumElements() < - Query.Types[1].getNumElements(); + return Query.Types[0].getNumElements() < + Query.Types[1].getNumElements(); }, changeTo(0, 1)) .widenScalarOrEltToNextPow2OrMinSize(0, 8) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 830a35bb..6d2d705 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -856,7 +856,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case TargetOpcode::G_FPTOSI_SAT: - case TargetOpcode::G_FPTOUI_SAT: { + case TargetOpcode::G_FPTOUI_SAT: + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: { LLT DstType = MRI.getType(MI.getOperand(0).getReg()); if (DstType.isVector()) break; @@ -864,11 +866,19 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; break; } - OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; + TypeSize DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + TypeSize SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, TRI); + if (((DstSize == SrcSize) || STI.hasFeature(AArch64::FeatureFPRCVT)) && + all_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), + [&](const MachineInstr &UseMI) { + return onlyUsesFP(UseMI, MRI, TRI) || + prefersFPUse(UseMI, MRI, TRI); + })) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + else + OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; break; } - case TargetOpcode::G_FPTOSI: - case TargetOpcode::G_FPTOUI: case TargetOpcode::G_INTRINSIC_LRINT: case TargetOpcode::G_INTRINSIC_LLRINT: if (MRI.getType(MI.getOperand(0).getReg()).isVector()) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 35bd244..5c3e26e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -84,6 +84,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address, return; } + if (Opcode == AArch64::SYSLxt) + if (printSyslAlias(MI, STI, O)) { + printAnnotation(O, Annot); + return; + } + if (Opcode == AArch64::SYSPxt || Opcode == AArch64::SYSPxt_XZR) if (printSyspAlias(MI, STI, O)) { printAnnotation(O, Annot); @@ -909,13 +915,25 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, Encoding |= CnVal << 7; Encoding |= Op1Val << 11; - bool NeedsReg; + bool NeedsReg = false; + bool OptionalReg = false; std::string Ins; std::string Name; if (CnVal == 7) { switch (CmVal) { default: return false; + // MLBI aliases + case 0: { + const AArch64MLBI::MLBI *MLBI = + AArch64MLBI::lookupMLBIByEncoding(Encoding); + if (!MLBI || !MLBI->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = MLBI->NeedsReg; + Ins = "mlbi\t"; + Name = std::string(MLBI->Name); + } break; // Maybe IC, maybe Prediction Restriction case 
1: switch (Op1Val) { @@ -1004,19 +1022,41 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, return false; NeedsReg = TLBI->NeedsReg; + if (STI.hasFeature(AArch64::FeatureAll) || + STI.hasFeature(AArch64::FeatureTLBID)) + OptionalReg = TLBI->OptionalReg; Ins = "tlbi\t"; Name = std::string(TLBI->Name); - } - else + } else if (CnVal == 12) { + if (CmVal != 0) { + // GIC aliases + const AArch64GIC::GIC *GIC = AArch64GIC::lookupGICByEncoding(Encoding); + if (!GIC || !GIC->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = true; + Ins = "gic\t"; + Name = std::string(GIC->Name); + } else { + // GSB aliases + const AArch64GSB::GSB *GSB = AArch64GSB::lookupGSBByEncoding(Encoding); + if (!GSB || !GSB->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = false; + Ins = "gsb\t"; + Name = std::string(GSB->Name); + } + } else return false; StringRef Reg = getRegisterName(MI->getOperand(4).getReg()); bool NotXZR = Reg != "xzr"; - // If a mandatory is not specified in the TableGen + // If a mandatory or optional register is not specified in the TableGen // (i.e. no register operand should be present), and the register value // is not xzr/x31, then disassemble to a SYS alias instead. - if (NotXZR && !NeedsReg) + if (NotXZR && !NeedsReg && !OptionalReg) return false; std::string Str = Ins + Name; @@ -1024,12 +1064,64 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, O << '\t' << Str; - if (NeedsReg) + // For optional registers, don't print the value if it's xzr/x31 + // since this defaults to xzr/x31 if register is not specified. + if (NeedsReg || (OptionalReg && NotXZR)) O << ", " << Reg; return true; } +bool AArch64InstPrinter::printSyslAlias(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { +#ifndef NDEBUG + unsigned Opcode = MI->getOpcode(); + assert(Opcode == AArch64::SYSLxt && "Invalid opcode for SYSL alias!"); +#endif + + StringRef Reg = getRegisterName(MI->getOperand(0).getReg()); + const MCOperand &Op1 = MI->getOperand(1); + const MCOperand &Cn = MI->getOperand(2); + const MCOperand &Cm = MI->getOperand(3); + const MCOperand &Op2 = MI->getOperand(4); + + unsigned Op1Val = Op1.getImm(); + unsigned CnVal = Cn.getImm(); + unsigned CmVal = Cm.getImm(); + unsigned Op2Val = Op2.getImm(); + + uint16_t Encoding = Op2Val; + Encoding |= CmVal << 3; + Encoding |= CnVal << 7; + Encoding |= Op1Val << 11; + + std::string Ins; + std::string Name; + + if (CnVal == 12) { + if (CmVal == 3) { + // GICR aliases + const AArch64GICR::GICR *GICR = + AArch64GICR::lookupGICRByEncoding(Encoding); + if (!GICR || !GICR->haveFeatures(STI.getFeatureBits())) + return false; + + Ins = "gicr"; + Name = std::string(GICR->Name); + } else + return false; + } else + return false; + + std::string Str; + llvm::transform(Name, Name.begin(), ::tolower); + + O << '\t' << Ins << '\t' << Reg.str() << ", " << Name; + + return true; +} + bool AArch64InstPrinter::printSyspAlias(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1508,6 +1600,17 @@ void AArch64InstPrinter::printBTIHintOp(const MCInst *MI, unsigned OpNum, markup(O, Markup::Immediate) << '#' << formatImm(btihintop); } +void AArch64InstPrinter::printCMHPriorityHintOp(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned priorityhint_op = MI->getOperand(OpNum).getImm(); + auto PHint = + AArch64CMHPriorityHint::lookupCMHPriorityHintByEncoding(priorityhint_op); + if (PHint) + O << PHint->Name; +} + void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, 
unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h index 15ef2dd..307402d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h @@ -52,6 +52,8 @@ public: protected: bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O); + bool printSyslAlias(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); bool printSyspAlias(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O); bool printRangePrefetchAlias(const MCInst *MI, const MCSubtargetInfo &STI, @@ -151,6 +153,9 @@ protected: void printBTIHintOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printCMHPriorityHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 33f35ad..99836ae 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -3920,6 +3920,78 @@ multiclass sme2_luti4_vector_vg4_index<string mnemonic> { def _S : sme2_luti4_vector_vg4_index<0b10, ZZZZ_s_mul_r, mnemonic>; } +// 8-bit Look up table +class sme2_lut_single<string asm> + : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn), + asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> { + bits<0> ZTt; + bits<5> Zd; + bits<5> Zn; + let Inst{31-10} = 0b1100000011001000010000; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +//===----------------------------------------------------------------------===// +// Lookup table read with 6-bit indices (8-bit) +class sme2_luti6_zt_base<RegisterOperand zd_ty, string asm> + : I<(outs zd_ty:$Zd), (ins ZTR:$ZTt, ZZZ_Any:$Zn), + asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> { + bits<0> ZTt; + bits<3> Zd; + bits<3> Zn; + let Inst{31-21} = 0b11000000100; + let Inst{19-10} = 0b1010000000; + let Inst{9-7} = Zn; + let Inst{6-5} = 0b00; +} + +class sme2_luti6_zt_consecutive<string asm> + : sme2_luti6_zt_base<ZZZZ_b_mul_r, asm> { + let Inst{20} = 0; + let Inst{4-2} = Zd; + let Inst{1-0} = 0b00; +} + +class sme2_luti6_zt_strided<string asm> + : sme2_luti6_zt_base<ZZZZ_b_strided, asm> { + let Inst{20} = 1; + let Inst{4} = Zd{2}; + let Inst{3-2} = 0b00; + let Inst{1-0} = Zd{1-0}; +} + +//===----------------------------------------------------------------------===// +// Lookup table read with 6-bit indices (8-bit) +class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty, string asm> + : I<(outs zd_ty:$Zd), (ins ZZ_h:$Zn, ZZ_Any:$Zm, VectorIndexD:$i1), + asm, "\t$Zd, $Zn, $Zm$i1", "", []>, Sched<[]> { + bits<3> Zd; + bits<5> Zn; + bits<5> Zm; + bits<1> i1; + let Inst{31-23} = 0b110000010; + let Inst{22} = i1; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{9-5} = Zn; +} + +class sme2_luti6_vector_vg4_consecutive<string asm> + : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, asm> { + let Inst{15-10} = 0b111101; + let Inst{4-2} = Zd; + let Inst{1-0} = 0b00; +} + +class sme2_luti6_vector_vg4_strided<string asm> + : sme2_luti6_vector_vg4_base<ZZZZ_h_strided, asm> { + let Inst{15-10} = 0b111111; + let Inst{4} = Zd{2}; + let Inst{3-2} = 0b00; + let Inst{1-0} = Zd{1-0}; +} + //===----------------------------------------------------------------------===// // SME2 MOV class 
sme2_mova_vec_to_tile_vg2_multi_base<bits<2> sz, bit v, diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 3cdd505..1664f4a 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3787,7 +3787,7 @@ multiclass sve2p1_two_way_dot_vv<string mnemonic, bit u, SDPatternOperator intri // SVE Integer Dot Product Group - Indexed Group //===----------------------------------------------------------------------===// -class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm, +class sve_intx_dot_by_indexed_elem<bit U, string asm, ZPRRegOp zprty1, ZPRRegOp zprty2, ZPRRegOp zprty3, Operand itype> : I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop), @@ -3795,8 +3795,7 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm, "", []>, Sched<[]> { bits<5> Zda; bits<5> Zn; - let Inst{31-23} = 0b010001001; - let Inst{22} = sz; + let Inst{31-24} = 0b01000100; let Inst{21} = 0b1; let Inst{15-11} = 0; let Inst{10} = U; @@ -3810,16 +3809,18 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm, multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm, SDPatternOperator op> { - def _BtoS : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> { + def _BtoS : sve_intx_dot_by_indexed_elem<opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> { bits<2> iop; bits<3> Zm; + let Inst{23-22} = 0b10; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _HtoD : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> { + def _HtoD : sve_intx_dot_by_indexed_elem<opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> { bits<1> iop; bits<4> Zm; - let Inst{20} = iop; + let Inst{23-22} = 0b11; + let Inst{20} = iop; let Inst{19-16} = Zm; } @@ -3827,6 +3828,16 @@ multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm, def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _HtoD)>; } +class sve_intx_dot_by_indexed_elem_x<bit opc, string asm> +: sve_intx_dot_by_indexed_elem<opc, asm, ZPR16, ZPR8, ZPR3b8, VectorIndexH32b_timm> { + bits<3> iop; + bits<3> Zm; + let Inst{23} = 0b0; + let Inst{22} = iop{2}; + let Inst{20-19} = iop{1-0}; + let Inst{18-16} = Zm; +} + //===----------------------------------------------------------------------===// // SVE2 Complex Integer Dot Product Group //===----------------------------------------------------------------------===// @@ -4085,7 +4096,7 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm, bits<5> Zdn; let Inst{31-24} = 0b01000100; let Inst{23-22} = sz; - let Inst{21-20} = 0b01; + let Inst{21} = 0b0; let Inst{20-16} = opc{5-1}; let Inst{15-14} = 0b10; let Inst{13} = opc{0}; @@ -4590,15 +4601,15 @@ multiclass sve2_int_cadd<bit opc, string asm, SDPatternOperator op> { def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, complexrotateopodd, !cast<Instruction>(NAME # _D)>; } -class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm, +class sve2_int_absdiff_accum<bits<3> sz, bits<4> opc, string asm, ZPRRegOp zprty1, ZPRRegOp zprty2> : I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zda; bits<5> Zn; bits<5> Zm; - let Inst{31-24} = 0b01000101; - let Inst{23-22} = sz; + let Inst{31-25} = 0b0100010; + let Inst{24-22} = sz; let Inst{21} = 0b0; let Inst{20-16} = Zm; let Inst{15-14} = 0b11; @@ -4613,10 +4624,10 @@ class 
sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm, } multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> { - def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>; - def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>; - def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>; - def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>; + def _B : sve2_int_absdiff_accum<0b100, { 0b111, opc }, asm, ZPR8, ZPR8>; + def _H : sve2_int_absdiff_accum<0b101, { 0b111, opc }, asm, ZPR16, ZPR16>; + def _S : sve2_int_absdiff_accum<0b110, { 0b111, opc }, asm, ZPR32, ZPR32>; + def _D : sve2_int_absdiff_accum<0b111, { 0b111, opc }, asm, ZPR64, ZPR64>; def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; @@ -4626,20 +4637,26 @@ multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> { multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm, SDPatternOperator op> { - def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; - def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; - def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; + def _H : sve2_int_absdiff_accum<0b101, { 0b00, opc }, asm, ZPR16, ZPR8>; + def _S : sve2_int_absdiff_accum<0b110, { 0b00, opc }, asm, ZPR32, ZPR16>; + def _D : sve2_int_absdiff_accum<0b111, { 0b00, opc }, asm, ZPR64, ZPR32>; def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } +multiclass sve2_int_two_way_absdiff_accum_long<bit U, string asm> { + def _BtoH : sve2_int_absdiff_accum<0b001, { 0b01, U, 0b1 }, asm, ZPR16, ZPR8>; + def _HtoS : sve2_int_absdiff_accum<0b010, { 0b01, U, 0b1 }, asm, ZPR32, ZPR16>; + def _StoD : sve2_int_absdiff_accum<0b011, { 0b01, U, 0b1 }, asm, ZPR64, ZPR32>; +} + multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, SDPatternOperator op> { - def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm, + def _S : sve2_int_absdiff_accum<{ 0b1, opc{1}, 0b0 }, { 0b010, opc{0} }, asm, ZPR32, ZPR32>; - def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm, + def _D : sve2_int_absdiff_accum<{ 0b1, opc{1}, 0b1 }, { 0b010, opc{0} }, asm, ZPR64, ZPR64>; def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; @@ -9610,17 +9627,18 @@ multiclass sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> { // SVE Floating Point Matrix Multiply Accumulate Group //===----------------------------------------------------------------------===// -class sve_fp_matrix_mla<bits<2> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty> +class sve_fp_matrix_mla<bits<3> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty> : I<(outs zda_ty:$Zda), (ins zda_ty:$_Zda, reg_ty:$Zn, reg_ty:$Zm), asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zda; bits<5> Zn; bits<5> Zm; let Inst{31-24} = 0b01100100; - let Inst{23-22} = opc; + let Inst{23-22} = opc{2-1}; let Inst{21} = 1; let Inst{20-16} = Zm; - let Inst{15-10} = 0b111001; + let Inst{15-11} = 0b11100; + let Inst{10} = opc{0}; let Inst{9-5} = Zn; let Inst{4-0} = Zda; @@ -9630,10 +9648,12 @@ class sve_fp_matrix_mla<bits<2> opc, 
string asm, ZPRRegOp zda_ty, ZPRRegOp reg_t let mayRaiseFPException = 1; } -multiclass sve_fp_matrix_mla<bits<2> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty, SDPatternOperator op, ValueType zda_vt, ValueType reg_vt> { +multiclass sve_fp_matrix_mla<bits<3> opc, string asm, ZPRRegOp zda_ty, + ZPRRegOp reg_ty, SDPatternOperator op, + ValueType zda_vt, ValueType reg_vt> { def NAME : sve_fp_matrix_mla<opc, asm, zda_ty, reg_ty>; - def : SVE_3_Op_Pat<zda_vt, op , zda_vt, reg_vt, reg_vt, !cast<Instruction>(NAME)>; + def : SVE_3_Op_Pat<zda_vt, op, zda_vt, reg_vt, reg_vt, !cast<Instruction>(NAME)>; } //===----------------------------------------------------------------------===// @@ -10030,18 +10050,19 @@ multiclass sve2p1_multi_vec_extract_narrow<string mnemonic, bits<2> opc, SDPatte } // SVE2 multi-vec shift narrow -class sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, bits<2> tsz> - : I<(outs ZPR16:$Zd), (ins ZZ_s_mul_r:$Zn, vecshiftR16:$imm4), - mnemonic, "\t$Zd, $Zn, $imm4", +class sve2p1_multi_vec_shift_narrow<string mnemonic, ZPRRegOp ZdRC, RegisterOperand ZSrcOp, + Operand immtype, bits<3> opc, bits<2> tsz> + : I<(outs ZdRC:$Zd), (ins ZSrcOp:$Zn, immtype:$imm), + mnemonic, "\t$Zd, $Zn, $imm", "", []>, Sched<[]> { bits<5> Zd; bits<4> Zn; - bits<4> imm4; + bits<4> imm; let Inst{31-23} = 0b010001011; let Inst{22} = tsz{1}; let Inst{21} = 0b1; let Inst{20} = tsz{0}; - let Inst{19-16} = imm4; + let Inst{18-16} = imm{2-0}; // imm3 let Inst{15-14} = 0b00; let Inst{13-11} = opc; let Inst{10} = 0b0; @@ -10052,12 +10073,19 @@ class sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, bits<2> tsz> let hasSideEffects = 0; } -multiclass sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, SDPatternOperator intrinsic> { - def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, opc, 0b01>; +multiclass sve_multi_vec_shift_narrow<string mnemonic, bits<3> opc, SDPatternOperator intrinsic> { + def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, ZPR16, ZZ_s_mul_r, vecshiftR16, opc, 0b01> { + let Inst{19} = imm{3}; // imm4 + } def : SVE2p1_Sat_Shift_VG2_Pat<NAME, intrinsic, nxv8i16, nxv4i32, vecshiftR16>; } +multiclass sve_multi_vec_round_shift_narrow<string mnemonic, bits<3> opc> { + def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, ZPR8, ZZ_h_mul_r, vecshiftR8, opc, 0b00> { + let Inst{19} = 0b1; // always 1 for imm3 version + } +} // SME2 multi-vec contiguous load (scalar plus scalar, two registers) class sve2p1_mem_cld_ss_2z<string mnemonic, bits<2> msz, bit n, @@ -11164,7 +11192,7 @@ multiclass sve2_fp8_dot_indexed_s<string asm, SDPatternOperator op> { def : SVE_4_Op_Pat<nxv4f32, op, nxv4f32, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>; } -// FP8 Look up table +// Look up table class sve2_lut_vector_index<ZPRRegOp zd_ty, RegisterOperand zn_ty, Operand idx_ty, bits<4>opc, string mnemonic> : I<(outs zd_ty:$Zd), (ins zn_ty:$Zn, ZPRAny:$Zm, idx_ty:$idx), @@ -11183,7 +11211,7 @@ class sve2_lut_vector_index<ZPRRegOp zd_ty, RegisterOperand zn_ty, let Inst{4-0} = Zd; } -// FP8 Look up table read with 2-bit indices +// Look up table read with 2-bit indices multiclass sve2_luti2_vector_index<string mnemonic> { def _B : sve2_lut_vector_index<ZPR8, Z_b, VectorIndexS32b, {?, 0b100}, mnemonic> { bits<2> idx; @@ -11205,7 +11233,7 @@ multiclass sve2_luti2_vector_index<string mnemonic> { i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; } -// FP8 Look up table read with 4-bit indices +// Look up table read with 4-bit indices multiclass sve2_luti4_vector_index<string mnemonic> { def _B : 
sve2_lut_vector_index<ZPR8, Z_b, VectorIndexD32b, 0b1001, mnemonic> { bit idx; @@ -11226,7 +11254,7 @@ multiclass sve2_luti4_vector_index<string mnemonic> { i32, timm32_0_3, !cast<Instruction>(NAME # _H)>; } -// FP8 Look up table read with 4-bit indices (two contiguous registers) +// Look up table read with 4-bit indices (two contiguous registers) multiclass sve2_luti4_vector_vg2_index<string mnemonic> { def NAME : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> { bits<2> idx; @@ -11250,6 +11278,29 @@ multiclass sve2_luti4_vector_vg2_index<string mnemonic> { nxv16i8:$Op3, timm32_0_3:$Op4))>; } +// Look up table read with 6-bit indices +multiclass sve2_luti6_vector_index<string mnemonic> { + def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexD32b, 0b1011, mnemonic> { + bit idx; + let Inst{23} = idx; + } +} + +// Look up table +class sve2_luti6_vector<string mnemonic> + : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm), + mnemonic, "\t$Zd, $Zn, $Zm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000101001; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b101011; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + //===----------------------------------------------------------------------===// // Checked Pointer Arithmetic (FEAT_CPA) //===----------------------------------------------------------------------===// @@ -11280,3 +11331,49 @@ class sve_int_mla_cpa<string asm> let ElementSize = ZPR64.ElementSize; } + +//===----------------------------------------------------------------------===// +// FP to Int down-converts +//===----------------------------------------------------------------------===// +class sve2_fp_to_int_downcvt<string asm, ZPRRegOp ZdRC, RegisterOperand ZSrcOp, bits<2> size, bit U> + : I<(outs ZdRC:$Zd), (ins ZSrcOp:$Zn), + asm, "\t$Zd, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<4> Zn; + let Inst{31-24} = 0b01100101; + let Inst{23-22} = size; + let Inst{21-11} = 0b00110100110; + let Inst{10} = U; + let Inst{9-6} = Zn; + let Inst{5} = 0b0; + let Inst{4-0} = Zd; +} + +multiclass sve2_fp_to_int_downcvt<string asm, bit U> { + def _HtoB : sve2_fp_to_int_downcvt<asm, ZPR8, ZZ_h_mul_r, 0b01, U>; + def _StoH : sve2_fp_to_int_downcvt<asm, ZPR16, ZZ_s_mul_r, 0b10, U>; + def _DtoS : sve2_fp_to_int_downcvt<asm, ZPR32, ZZ_d_mul_r, 0b11, U>; +} + +//===----------------------------------------------------------------------===// +// Int to FP up-converts +//===----------------------------------------------------------------------===// +class sve2_int_to_fp_upcvt<string asm, ZPRRegOp ZdRC, ZPRRegOp ZnRC, + bits<2> size, bits<2> U> + : I<(outs ZdRC:$Zd), (ins ZnRC:$Zn), + asm, "\t$Zd, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + let Inst{31-24} = 0b01100101; + let Inst{23-22} = size; + let Inst{21-12} = 0b0011000011; + let Inst{11-10} = U; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_to_fp_upcvt<string asm, bits<2> U> { + def _BtoH : sve2_int_to_fp_upcvt<asm, ZPR16, ZPR8, 0b01, U>; + def _HtoS : sve2_int_to_fp_upcvt<asm, ZPR32, ZPR16, 0b10, U>; + def _StoD : sve2_int_to_fp_upcvt<asm, ZPR64, ZPR32, 0b11, U>; +} diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index d6cb0e8..268a229 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -139,6 +139,13 @@ namespace llvm { } namespace llvm { +namespace AArch64CMHPriorityHint { +#define GET_CMHPRIORITYHINT_IMPL 
+#include "AArch64GenSystemOperands.inc" +} // namespace AArch64CMHPriorityHint +} // namespace llvm + +namespace llvm { namespace AArch64SysReg { #define GET_SysRegsList_IMPL #include "AArch64GenSystemOperands.inc" @@ -190,6 +197,32 @@ namespace AArch64TLBIP { #define GET_TLBIPTable_IMPL #include "AArch64GenSystemOperands.inc" } // namespace AArch64TLBIP + +namespace AArch64MLBI { +#define GET_MLBITable_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64MLBI +} // namespace llvm + +namespace llvm { +namespace AArch64GIC { +#define GET_GICTable_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GIC +} // namespace llvm + +namespace llvm { +namespace AArch64GICR { +#define GET_GICRTable_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GICR +} // namespace llvm + +namespace llvm { +namespace AArch64GSB { +#define GET_GSBTable_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GSB } // namespace llvm namespace llvm { diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index fea33ef..27812e9 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -409,6 +409,16 @@ struct SysAliasReg : SysAlias { : SysAlias(N, E, F), NeedsReg(R) {} }; +struct SysAliasOptionalReg : SysAlias { + bool NeedsReg; + bool OptionalReg; + constexpr SysAliasOptionalReg(const char *N, uint16_t E, bool R, bool O) + : SysAlias(N, E), NeedsReg(R), OptionalReg(O) {} + constexpr SysAliasOptionalReg(const char *N, uint16_t E, bool R, bool O, + FeatureBitset F) + : SysAlias(N, E, F), NeedsReg(R), OptionalReg(O) {} +}; + struct SysAliasImm : SysAlias { uint16_t ImmValue; constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I) @@ -677,6 +687,14 @@ namespace AArch64BTIHint { #include "AArch64GenSystemOperands.inc" } +namespace AArch64CMHPriorityHint { +struct CMHPriorityHint : SysAlias { + using SysAlias::SysAlias; +}; +#define GET_CMHPRIORITYHINT_DECL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64CMHPriorityHint + namespace AArch64SME { enum ToggleCondition : unsigned { Always, @@ -788,21 +806,53 @@ namespace AArch64SysReg { } namespace AArch64TLBI { - struct TLBI : SysAliasReg { - using SysAliasReg::SysAliasReg; - }; - #define GET_TLBITable_DECL - #include "AArch64GenSystemOperands.inc" +struct TLBI : SysAliasOptionalReg { + using SysAliasOptionalReg::SysAliasOptionalReg; +}; +#define GET_TLBITable_DECL +#include "AArch64GenSystemOperands.inc" } namespace AArch64TLBIP { -struct TLBIP : SysAliasReg { - using SysAliasReg::SysAliasReg; +struct TLBIP : SysAliasOptionalReg { + using SysAliasOptionalReg::SysAliasOptionalReg; }; #define GET_TLBIPTable_DECL #include "AArch64GenSystemOperands.inc" } // namespace AArch64TLBIP +namespace AArch64MLBI { +struct MLBI : SysAliasReg { + using SysAliasReg::SysAliasReg; +}; +#define GET_MLBITable_DECL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64MLBI + +namespace AArch64GIC { +struct GIC : SysAliasReg { + using SysAliasReg::SysAliasReg; +}; +#define GET_GICTable_DECL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GIC + +namespace AArch64GICR { +struct GICR : SysAliasReg { + using SysAliasReg::SysAliasReg; +}; +#define GET_GICRTable_DECL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GICR + +namespace AArch64GSB { +struct GSB : SysAlias { + using SysAlias::SysAlias; +}; +#define GET_GSBTable_DECL +#include 
"AArch64GenSystemOperands.inc" +} // namespace AArch64GSB + namespace AArch64II { /// Target Operand Flag enum. enum TOF { diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index d71f728..085c8588 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -75,8 +75,8 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { } void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName, - const AArch64TargetLowering &TLI) { - RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName); + const RTLIB::RuntimeLibcallsInfo &RTLCI) { + RTLIB::LibcallImpl Impl = RTLCI.getSupportedLibcallImpl(FuncName); if (Impl == RTLIB::Unsupported) return; unsigned KnownAttrs = SMEAttrs::Normal; @@ -124,21 +124,22 @@ bool SMECallAttrs::requiresSMChange() const { return true; } -SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI) +SMECallAttrs::SMECallAttrs(const CallBase &CB, + const RTLIB::RuntimeLibcallsInfo *RTLCI) : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal), Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) { if (auto *CalledFunction = CB.getCalledFunction()) - CalledFn = SMEAttrs(*CalledFunction, TLI); - - // An `invoke` of an agnostic ZA function may not return normally (it may - // resume in an exception block). In this case, it acts like a private ZA - // callee and may require a ZA save to be set up before it is called. - if (isa<InvokeInst>(CB)) - CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false); + CalledFn = SMEAttrs(*CalledFunction, RTLCI); // FIXME: We probably should not allow SME attributes on direct calls but // clang duplicates streaming mode attributes at each callsite. assert((IsIndirect || ((Callsite.withoutPerCallsiteFlags() | CalledFn) == CalledFn)) && "SME attributes at callsite do not match declaration"); + + // An `invoke` of an agnostic ZA function may not return normally (it may + // resume in an exception block). In this case, it acts like a private ZA + // callee and may require a ZA save to be set up before it is called. 
+ if (isa<InvokeInst>(CB)) + CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false); } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index d26e3cd..28c397e 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -12,8 +12,9 @@ #include "llvm/IR/Function.h" namespace llvm { - -class AArch64TargetLowering; +namespace RTLIB { +struct RuntimeLibcallsInfo; +} class Function; class CallBase; @@ -52,14 +53,14 @@ public: SMEAttrs() = default; SMEAttrs(unsigned Mask) { set(Mask); } - SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr) + SMEAttrs(const Function &F, const RTLIB::RuntimeLibcallsInfo *RTLCI = nullptr) : SMEAttrs(F.getAttributes()) { - if (TLI) - addKnownFunctionAttrs(F.getName(), *TLI); + if (RTLCI) + addKnownFunctionAttrs(F.getName(), *RTLCI); } SMEAttrs(const AttributeList &L); - SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) { - addKnownFunctionAttrs(FuncName, TLI); + SMEAttrs(StringRef FuncName, const RTLIB::RuntimeLibcallsInfo &RTLCI) { + addKnownFunctionAttrs(FuncName, RTLCI); }; void set(unsigned M, bool Enable = true) { @@ -157,7 +158,7 @@ public: private: void addKnownFunctionAttrs(StringRef FuncName, - const AArch64TargetLowering &TLI); + const RTLIB::RuntimeLibcallsInfo &RTLCI); void validate() const; }; @@ -175,7 +176,7 @@ public: SMEAttrs Callsite = SMEAttrs::Normal) : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {} - SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI); + SMECallAttrs(const CallBase &CB, const RTLIB::RuntimeLibcallsInfo *RTLCI); SMEAttrs &caller() { return CallerFn; } SMEAttrs &callee() { return IsIndirect ? Callsite : CalledFn; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index e8b211f..7f00ead 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[ combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask ]>; +// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32), + [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)), + (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>; + +def combine_or_s64_s32 : GICombineRule< + (defs root:$dst), + (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst), + (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x), + (G_OR $or, $x_lo, $y), + (G_MERGE_VALUES $dst, $or, $x_hi))>; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. 
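For reference, the combine_or_s64_s32 rule added in the hunk above relies on a simple bit-level identity: OR-ing a 64-bit value with a zero-extended 32-bit value can only change the low half, so the high half may be passed through unchanged and re-merged. Below is a minimal standalone C++ sketch of that identity; it is illustrative only, and the helper names (orZextReference, orZextCombined) are invented for this note rather than taken from the patch.

// Illustrative sketch, not part of the patch: the scalar identity behind
// combine_or_s64_s32, checked with plain integer arithmetic.
#include <cassert>
#include <cstdint>

// (or i64:x, (zext i32:y)) computed directly on 64 bits.
static uint64_t orZextReference(uint64_t X, uint32_t Y) {
  return X | static_cast<uint64_t>(Y);
}

// The combined form: OR only the low 32 bits with y, keep hi_32(x) as-is,
// then merge the two halves back into a 64-bit value.
static uint64_t orZextCombined(uint64_t X, uint32_t Y) {
  uint32_t Lo = static_cast<uint32_t>(X) | Y;   // (or lo_32(x), y)
  uint32_t Hi = static_cast<uint32_t>(X >> 32); // hi_32(x) unchanged
  return (static_cast<uint64_t>(Hi) << 32) | static_cast<uint64_t>(Lo);
}

int main() {
  const uint64_t X = 0x1234567890abcdefULL;
  const uint32_t Y = 0x0f0f0f0fu;
  assert(orZextReference(X, Y) == orZextCombined(X, Y));
  return 0;
}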
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner< "AMDGPUPreLegalizerCombinerImpl", [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, foldable_fneg, combine_shuffle_vector_to_build_vector, - binop_s64_with_s32_mask_combines]> { + binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } @@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64, - binop_s64_with_s32_mask_combines]> { + binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8ed4062..1b559a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, - MVT::i32, Legal); + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, + Legal); setOperationAction( {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 596a895..1a13b22 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -976,9 +976,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FPOpActions.clampMaxNumElementsStrict(0, S32, 2); } + auto &MinNumMaxNumIeee = + getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); + + if (ST.hasVOP3PInsts()) { + MinNumMaxNumIeee.legalFor(FPTypesPK16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampMaxNumElements(0, S16, 2) + .clampScalar(0, S16, S64) + .scalarize(0); + } else if (ST.has16BitInsts()) { + MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0); + } else { + MinNumMaxNumIeee.legalFor(FPTypesBase) + .clampScalar(0, S32, S64) + .scalarize(0); + } + auto &MinNumMaxNum = getActionDefinitionsBuilder( - {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE, - G_FMAXNUM_IEEE}); + {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM}); if (ST.hasVOP3PInsts()) { MinNumMaxNum.customFor(FPTypesPK16) @@ -2136,9 +2152,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor(FPTypesPK16) .clampMaxNumElements(0, S16, 2) .scalarize(0); + } else if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .lowerFor({V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .lower(); } else { - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .scalarize(0) + .clampScalar(0, S32, S64) + .lower(); } getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) @@ -2195,8 +2219,6 @@ bool AMDGPULegalizerInfo::legalizeCustom( case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINIMUMNUM: case TargetOpcode::G_FMAXIMUMNUM: - case TargetOpcode::G_FMINNUM_IEEE: - case TargetOpcode::G_FMAXNUM_IEEE: return legalizeMinNumMaxNum(Helper, MI); case 
TargetOpcode::G_EXTRACT_VECTOR_ELT: return legalizeExtractVectorElt(MI, MRI, B); @@ -2817,23 +2839,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineFunction &MF = Helper.MIRBuilder.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || - MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; - - // With ieee_mode disabled, the instructions have the correct behavior - // already for G_FMINIMUMNUM/G_FMAXIMUMNUM. - // - // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode - // enabled. - if (!MFI->getMode().IEEE) { - if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM || - MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM) - return true; - - return !IsIEEEOp; - } - - if (IsIEEEOp) + // With ieee_mode disabled, the instructions have the correct behavior. + if (!MFI->getMode().IEEE) return true; return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 99ba043..5580e4c 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1860,7 +1860,6 @@ private: bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands, const unsigned CPol); bool validateTFE(const MCInst &Inst, const OperandVector &Operands); - bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands); bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands); bool validateWMMA(const MCInst &Inst, const OperandVector &Operands); unsigned getConstantBusLimit(unsigned Opcode) const; @@ -5506,22 +5505,6 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst, return true; } -bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst, - const OperandVector &Operands) { - if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12) - return true; - - int Simm16Pos = - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16); - if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) { - SMLoc Loc = Operands[1]->getStartLoc(); - Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]"); - return false; - } - - return true; -} - bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, const OperandVector &Operands) { unsigned Opc = Inst.getOpcode(); @@ -5681,9 +5664,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc, if (!validateTFE(Inst, Operands)) { return false; } - if (!validateSetVgprMSB(Inst, Operands)) { - return false; - } if (!validateWMMA(Inst, Operands)) { return false; } diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 09ef6ac..2aa54c9 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -45,9 +45,6 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom); - // 32-bit ABS is legal for AMDGPU except for R600 - setOperationAction(ISD::ABS, MVT::i32, Expand); - // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. 
for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a757421..be42291 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -298,7 +298,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand); - setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); + setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index ee10190..05ba76a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -976,10 +976,10 @@ def : GCNPat < } // End SubtargetPredicate = HasLshlAddU64Inst let SubtargetPredicate = HasAddMinMaxInsts in { -def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>; -def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>; -def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>; -def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>; +def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>; +def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>; +def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>; +def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>; } def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index c4692b7..4ae2c1e 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -464,10 +464,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2, >; let SubtargetPredicate = HasPkAddMinMaxInsts in { -def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>; -def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>; -def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>; -def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>; +def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>; +def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>; +def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>; +def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>; } let SubtargetPredicate = HasPkMinMax3Insts in { diff --git a/llvm/lib/Target/ARM/ARMArchitectures.td b/llvm/lib/Target/ARM/ARMArchitectures.td index 301ed5b..bfcecfe 100644 --- a/llvm/lib/Target/ARM/ARMArchitectures.td +++ b/llvm/lib/Target/ARM/ARMArchitectures.td @@ -297,6 +297,18 @@ def ARMv96a : Architecture<"armv9.6-a", "ARMv96a", [HasV9_6aOps, FeatureCRC, FeatureRAS, FeatureDotProd]>; +def ARMv97a : Architecture<"armv9.7-a", "ARMv97a", [HasV9_7aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCRC, + FeatureRAS, + FeatureDotProd]>; def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops, FeatureRClass, FeatureDB, diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 9b1fa5d..e562b21 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -712,6 +712,11 @@ def HasV9_6aOps : SubtargetFeature<"v9.6a", "HasV9_6aOps", "true", "Support ARM v9.6a instructions", [HasV9_5aOps]>; +// Armv9.7-A is a v9-only architecture. 
+def HasV9_7aOps : SubtargetFeature<"v9.7a", "HasV9_7aOps", "true", + "Support ARM v9.7a instructions", + [HasV9_6aOps]>; + def HasV8_1MMainlineOps : SubtargetFeature< "v8.1m.main", "HasV8_1MMainlineOps", "true", "Support ARM v8-1M Mainline instructions", diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8122db2..313ae3d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21381,15 +21381,6 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const { TargetLowering::insertSSPDeclarations(M); } -Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { - // MSVC CRT has a function to validate security cookie. - RTLIB::LibcallImpl SecurityCheckCookie = - getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); - if (SecurityCheckCookie != RTLIB::Unsupported) - return M.getFunction(getLibcallImplName(SecurityCheckCookie)); - return TargetLowering::getSSPStackGuardCheck(M); -} - bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const { // If we do not have NEON, vector types are not natively supported. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 8c5e0cf..357d2c5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -708,7 +708,6 @@ class VectorType; bool useLoadStackGuardNode(const Module &M) const override; void insertSSPDeclarations(Module &M) const override; - Function *getSSPStackGuardCheck(const Module &M) const override; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 53be167..10d4cd5 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -6546,23 +6546,25 @@ def KCFI_CHECK_ARM : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>, Sched<[]>, Requires<[IsARM]> { - let Size = 28; // 7 instructions (bic, ldr, 4x eor, beq, udf) + let Size = 40; // worst-case 10 instructions @ 4 bytes each + // (push, bic, ldr, 4x eor, pop, beq, udf) } def KCFI_CHECK_Thumb2 : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>, Sched<[]>, Requires<[IsThumb2]> { - let Size = - 32; // worst-case 9 instructions (push, bic, ldr, 4x eor, pop, beq.w, udf) + let Size = 34; // worst-case (push.w[2], bic[4], ldr[4], 4x eor[16], pop.w[2], + // beq.w[4], udf[2]) } def KCFI_CHECK_Thumb1 : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>, Sched<[]>, Requires<[IsThumb1Only]> { - let Size = 50; // worst-case 25 instructions (pushes, bic helper, type - // building, cmp, pops) + let Size = 38; // worst-case 19 instructions @ 2 bytes each + // (2x push, 3x bic-helper, subs+ldr, 13x type-building, cmp, + // 2x pop, beq, bkpt) } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 0796746..94b511a 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -895,6 +895,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::ArchKind::ARMV9_4A: case ARM::ArchKind::ARMV9_5A: case ARM::ArchKind::ARMV9_6A: + case ARM::ArchKind::ARMV9_7A: S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false); S.setAttributeItem(ARM_ISA_use, 
Allowed, false); S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false); diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index 02fb905..4a2f714 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -1504,14 +1504,26 @@ let Defs = [SREG], hasSideEffects = 0 in def FRMIDX : Pseudo<(outs DLDREGS:$dst), (ins DLDREGS:$src, i16imm:$src2), "frmidx\t$dst, $src, $src2", []>; +// The instructions STDSPQRr and STDWSPQRr are used to store to the stack +// frame. The most accurate implementation would be to load the SP into +// a temporary pointer variable and then use STDPtrQRr. However for efficiency, +// we assume that R29R28 contains the current call frame pointer. +// However in the PEI pass we sometimes rewrite an ADJCALLSTACKDOWN pseudo, +// plus one or more STDSPQRr/STDWSPQRr pseudo instructions to use Z for a +// stack adjustment and then as a base pointer. To avoid corruption, we thus +// specify special classes of registers, like GPR8 and DREGS, but with +// the Z register removed, as the source/input to these instructions. // This pseudo is either converted to a regular store or a push which clobbers // SP. -def STDSPQRr : StorePseudo<(outs), (ins memspi:$dst, GPR8:$src), +let Defs = [SP], Uses = [SP], hasSideEffects = 0 in +def STDSPQRr : StorePseudo<(outs), (ins memspi:$dst, GPR8NOZ:$src), "stdstk\t$dst, $src", [(store i8:$src, addr:$dst)]>; +// See the comment on STDSPQRr. // This pseudo is either converted to a regular store or a push which clobbers // SP. -def STDWSPQRr : StorePseudo<(outs), (ins memspi:$dt, DREGS:$src), +let Defs = [SP], Uses = [SP], hasSideEffects = 0 in +def STDWSPQRr : StorePseudo<(outs), (ins memspi:$dt, DREGSNOZ:$src), "stdwstk\t$dt, $src", [(store i16:$src, addr:$dt)]>; // SP read/write pseudos. diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td index 182f92c..9b935b1 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.td +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td @@ -211,6 +211,31 @@ def PTRDISPREGS : RegisterClass<"AVR", [i16], 8, (add R31R30, R29R28), ptr>; // model this using a register class containing only the Z register. def ZREG : RegisterClass<"AVR", [i16], 8, (add R31R30)>; +// general registers excluding Z register lo/hi; these are the only +// registers that are always safe for STDSPQRr instructions +def GPR8NOZ : RegisterClass<"AVR", [i8], 8, + (// Return value and argument registers. + add R24, R25, R18, R19, R20, R21, R22, R23, + // Scratch registers. + R26, R27, + // Callee saved registers. + R28, R29, R17, R16, R15, R14, R13, R12, R11, R10, + R9, R8, R7, R6, R5, R4, R3, R2, R0, R1)>; + +// 16-bit pair register class excluding Z register lo/hi; these are the only +// registers that are always safe for STDWSPQRr instructions +def DREGSNOZ : RegisterClass<"AVR", [i16], 8, + (// Return value and arguments. + add R25R24, R19R18, R21R20, R23R22, + // Scratch registers. + R27R26, + // Callee saved registers. + R29R28, R17R16, R15R14, R13R12, R11R10, R9R8, + R7R6, R5R4, R3R2, R1R0, + // Pseudo regs for unaligned 16-bits + R26R25, R24R23, R22R21, R20R19, R18R17, R16R15, + R14R13, R12R11, R10R9)>; + // Register class used for the stack read pseudo instruction.
def GPRSP : RegisterClass<"AVR", [i16], 8, (add SP)>; diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index 42e90f0..d6fa65f 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// /// -/// \file This file contains pases and utilities to convert a modern LLVM +/// \file This file contains passes and utilities to convert a modern LLVM /// module into a module compatible with the LLVM 3.7-based DirectX Intermediate /// Language (DXIL). //===----------------------------------------------------------------------===// @@ -16,7 +16,6 @@ #include "DirectX.h" #include "DirectXIRPasses/PointerTypeAnalysis.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/Analysis/DXILResource.h" @@ -27,7 +26,6 @@ #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" #define DEBUG_TYPE "dxil-prepare" @@ -116,31 +114,6 @@ static void removeStringFunctionAttributes(Function &F, F.removeRetAttrs(DeadAttrs); } -static void cleanModuleFlags(Module &M) { - NamedMDNode *MDFlags = M.getModuleFlagsMetadata(); - if (!MDFlags) - return; - - SmallVector<llvm::Module::ModuleFlagEntry> FlagEntries; - M.getModuleFlagsMetadata(FlagEntries); - bool Updated = false; - for (auto &Flag : FlagEntries) { - // llvm 3.7 only supports behavior up to AppendUnique. - if (Flag.Behavior <= Module::ModFlagBehavior::AppendUnique) - continue; - Flag.Behavior = Module::ModFlagBehavior::Warning; - Updated = true; - } - - if (!Updated) - return; - - MDFlags->eraseFromParent(); - - for (auto &Flag : FlagEntries) - M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); -} - class DXILPrepareModule : public ModulePass { static Value *maybeGenerateBitcast(IRBuilder<> &Builder, @@ -202,15 +175,6 @@ class DXILPrepareModule : public ModulePass { Builder.getPtrTy(PtrTy->getAddressSpace()))); } - static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) { - return {M.getMDKindID("dx.nonuniform"), - M.getMDKindID("dx.controlflow.hints"), - M.getMDKindID("dx.precise"), - llvm::LLVMContext::MD_range, - llvm::LLVMContext::MD_alias_scope, - llvm::LLVMContext::MD_noalias}; - } - public: bool runOnModule(Module &M) override { PointerTypeMap PointerTypes = PointerTypeAnalysis::run(M); @@ -224,10 +188,7 @@ public: const dxil::ModuleMetadataInfo MetadataInfo = getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); VersionTuple ValVer = MetadataInfo.ValidatorVersion; - bool SkipValidation = ValVer.getMajor() == 0 && ValVer.getMinor() == 0; - - // construct allowlist of valid metadata node kinds - std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M); + bool AllowExperimental = ValVer.getMajor() == 0 && ValVer.getMinor() == 0; for (auto &F : M.functions()) { F.removeFnAttrs(AttrMask); @@ -235,7 +196,7 @@ public: // Only remove string attributes if we are not skipping validation. // This will reserve the experimental attributes when validation version // is 0.0 for experiment mode. 
- removeStringFunctionAttributes(F, SkipValidation); + removeStringFunctionAttributes(F, AllowExperimental); for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx) F.removeParamAttrs(Idx, AttrMask); @@ -243,11 +204,17 @@ public: IRBuilder<> Builder(&BB); for (auto &I : make_early_inc_range(BB)) { - I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); + if (auto *CB = dyn_cast<CallBase>(&I)) { + CB->removeFnAttrs(AttrMask); + CB->removeRetAttrs(AttrMask); + for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) + CB->removeParamAttrs(Idx, AttrMask); + continue; + } // Emtting NoOp bitcast instructions allows the ValueEnumerator to be // unmodified as it reserves instruction IDs during contruction. - if (auto LI = dyn_cast<LoadInst>(&I)) { + if (auto *LI = dyn_cast<LoadInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, LI->getPointerOperand(), LI->getType())) { @@ -257,7 +224,7 @@ public: } continue; } - if (auto SI = dyn_cast<StoreInst>(&I)) { + if (auto *SI = dyn_cast<StoreInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, SI->getPointerOperand(), SI->getValueOperand()->getType())) { @@ -268,39 +235,16 @@ public: } continue; } - if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) { + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, GEP->getPointerOperand(), GEP->getSourceElementType())) GEP->setOperand(0, NoOpBitcast); continue; } - if (auto *CB = dyn_cast<CallBase>(&I)) { - CB->removeFnAttrs(AttrMask); - CB->removeRetAttrs(AttrMask); - for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) - CB->removeParamAttrs(Idx, AttrMask); - continue; - } } } } - // Remove flags not for DXIL. - cleanModuleFlags(M); - - // dx.rootsignatures will have been parsed from its metadata form as its - // binary form as part of the RootSignatureAnalysisWrapper, so safely - // remove it as it is not recognized in DXIL - if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) - RootSignature->eraseFromParent(); - - // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and - // causes all tests using the DXIL Validator to fail. - // - // This is a temporary fix and should be replaced with a whitelist once - // we have determined all metadata that the DXIL Validator allows - if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) - ErrNo->eraseFromParent(); return true; } @@ -308,11 +252,11 @@ public: DXILPrepareModule() : ModulePass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DXILMetadataAnalysisWrapperPass>(); - AU.addRequired<RootSignatureAnalysisWrapper>(); - AU.addPreserved<RootSignatureAnalysisWrapper>(); - AU.addPreserved<ShaderFlagsAnalysisWrapper>(); + AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); AU.addPreserved<DXILResourceWrapperPass>(); + AU.addPreserved<RootSignatureAnalysisWrapper>(); + AU.addPreserved<ShaderFlagsAnalysisWrapper>(); } static char ID; // Pass identification. 
}; @@ -323,7 +267,6 @@ char DXILPrepareModule::ID = 0; INITIALIZE_PASS_BEGIN(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module", false, false) INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) -INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper) INITIALIZE_PASS_END(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module", false, false) diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 9eebcc9..1e4797b 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -7,8 +7,10 @@ //===----------------------------------------------------------------------===// #include "DXILTranslateMetadata.h" +#include "DXILRootSignature.h" #include "DXILShaderFlags.h" #include "DirectX.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" @@ -204,9 +206,9 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, return MDNode::get(Ctx, MDVals); } -MDTuple *constructEntryMetadata(const Function *EntryFn, MDTuple *Signatures, - MDNode *Resources, MDTuple *Properties, - LLVMContext &Ctx) { +static MDTuple *constructEntryMetadata(const Function *EntryFn, + MDTuple *Signatures, MDNode *Resources, + MDTuple *Properties, LLVMContext &Ctx) { // Each entry point metadata record specifies: // * reference to the entry point function global symbol // * unmangled name @@ -290,42 +292,82 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD, return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx); } -// TODO: We might need to refactor this to be more generic, -// in case we need more metadata to be replaced. 
-static void translateBranchMetadata(Module &M) { - for (Function &F : M) { - for (BasicBlock &BB : F) { - Instruction *BBTerminatorInst = BB.getTerminator(); +static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) { + MDNode *HlslControlFlowMD = + BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + + if (!HlslControlFlowMD) + return; - MDNode *HlslControlFlowMD = - BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + assert(HlslControlFlowMD->getNumOperands() == 2 && + "invalid operands for hlsl.controlflow.hint"); - if (!HlslControlFlowMD) - continue; + MDBuilder MDHelper(M.getContext()); - assert(HlslControlFlowMD->getNumOperands() == 2 && - "invalid operands for hlsl.controlflow.hint"); + llvm::Metadata *HintsStr = MDHelper.createString("dx.controlflow.hints"); + llvm::Metadata *HintsValue = MDHelper.createConstant( + mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1))); - MDBuilder MDHelper(M.getContext()); - ConstantInt *Op1 = - mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1)); + MDNode *MDNode = llvm::MDNode::get(M.getContext(), {HintsStr, HintsValue}); - SmallVector<llvm::Metadata *, 2> Vals( - ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"), - MDHelper.createConstant(Op1)}); + BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); + BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); +} + +static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) { + return { + M.getMDKindID("dx.nonuniform"), M.getMDKindID("dx.controlflow.hints"), + M.getMDKindID("dx.precise"), llvm::LLVMContext::MD_range, + llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias}; +} - MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals); +static void translateInstructionMetadata(Module &M) { + // construct allowlist of valid metadata node kinds + std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M); - BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); - BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); + for (Function &F : M) { + for (BasicBlock &BB : F) { + // This needs to be done first so that "hlsl.controlflow.hints" isn't + // removed in the whitelist below + if (auto *I = BB.getTerminator()) + translateBranchMetadata(M, I); + + for (auto &I : make_early_inc_range(BB)) { + I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); + } } } } -static void translateMetadata(Module &M, DXILResourceMap &DRM, - DXILResourceTypeMap &DRTM, - const ModuleShaderFlags &ShaderFlags, - const ModuleMetadataInfo &MMDI) { +static void cleanModuleFlags(Module &M) { + NamedMDNode *MDFlags = M.getModuleFlagsMetadata(); + if (!MDFlags) + return; + + SmallVector<llvm::Module::ModuleFlagEntry> FlagEntries; + M.getModuleFlagsMetadata(FlagEntries); + bool Updated = false; + for (auto &Flag : FlagEntries) { + // llvm 3.7 only supports behavior up to AppendUnique. 
+ if (Flag.Behavior <= Module::ModFlagBehavior::AppendUnique) + continue; + Flag.Behavior = Module::ModFlagBehavior::Warning; + Updated = true; + } + + if (!Updated) + return; + + MDFlags->eraseFromParent(); + + for (auto &Flag : FlagEntries) + M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); +} + +static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, + DXILResourceTypeMap &DRTM, + const ModuleShaderFlags &ShaderFlags, + const ModuleMetadataInfo &MMDI) { LLVMContext &Ctx = M.getContext(); IRBuilder<> IRB(Ctx); SmallVector<MDNode *> EntryFnMDNodes; @@ -381,6 +423,22 @@ static void translateMetadata(Module &M, DXILResourceMap &DRM, M.getOrInsertNamedMetadata("dx.entryPoints"); for (auto *Entry : EntryFnMDNodes) EntryPointsNamedMD->addOperand(Entry); + + cleanModuleFlags(M); + + // dx.rootsignatures will have been parsed from its metadata form as its + // binary form as part of the RootSignatureAnalysisWrapper, so safely + // remove it as it is not recognized in DXIL + if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) + RootSignature->eraseFromParent(); + + // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and + // causes all tests using the DXIL Validator to fail. + // + // This is a temporary fix and should be replaced with an allowlist once + // we have determined all metadata that the DXIL Validator allows + if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) + ErrNo->eraseFromParent(); } PreservedAnalyses DXILTranslateMetadata::run(Module &M, @@ -390,8 +448,8 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M, const ModuleShaderFlags &ShaderFlags = MAM.getResult<ShaderFlagsAnalysis>(M); const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M); - translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI); - translateBranchMetadata(M); + translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); + translateInstructionMetadata(M); return PreservedAnalyses::all(); } @@ -409,10 +467,13 @@ public: AU.addRequired<DXILResourceWrapperPass>(); AU.addRequired<ShaderFlagsAnalysisWrapper>(); AU.addRequired<DXILMetadataAnalysisWrapperPass>(); - AU.addPreserved<DXILResourceWrapperPass>(); + AU.addRequired<RootSignatureAnalysisWrapper>(); + AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); - AU.addPreserved<ShaderFlagsAnalysisWrapper>(); AU.addPreserved<DXILResourceBindingWrapperPass>(); + AU.addPreserved<DXILResourceWrapperPass>(); + AU.addPreserved<RootSignatureAnalysisWrapper>(); + AU.addPreserved<ShaderFlagsAnalysisWrapper>(); } bool runOnModule(Module &M) override { @@ -425,8 +486,8 @@ public: dxil::ModuleMetadataInfo MMDI = getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); - translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI); - translateBranchMetadata(M); + translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); + translateInstructionMetadata(M); return true; } }; @@ -443,6 +504,7 @@ INITIALIZE_PASS_BEGIN(DXILTranslateMetadataLegacy, "dxil-translate-metadata", "DXIL Translate Metadata", false, false) INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) +INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper) INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) INITIALIZE_PASS_END(DXILTranslateMetadataLegacy, "dxil-translate-metadata", "DXIL Translate Metadata", false, false) diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h index f3f5eb1..4c1ffac
100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h @@ -13,7 +13,8 @@ namespace llvm { -/// A pass that transforms DXIL Intrinsics that don't have DXIL opCodes +/// A pass that transforms LLVM Metadata in the module to its DXIL equivalent, +/// then emits all recognized DXIL Metadata class DXILTranslateMetadata : public PassInfoMixin<DXILTranslateMetadata> { public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &); diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index fb0928b8..ede8463 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -79,6 +79,12 @@ def ExtensionHVXV79: SubtargetFeature<"hvxv79", "HexagonHVXVersion", ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71, ExtensionHVXV73, ExtensionHVXV75]>; +def ExtensionHVXV81: SubtargetFeature<"hvxv81", "HexagonHVXVersion", + "Hexagon::ArchEnum::V81", "Hexagon HVX instructions", + [ExtensionHVXV65, ExtensionHVXV66, ExtensionHVXV67, + ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71, + ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79]>; + def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps", "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>; def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps", @@ -151,6 +157,8 @@ def UseHVXV75 : Predicate<"HST->useHVXV75Ops()">, AssemblerPredicate<(all_of ExtensionHVXV75)>; def UseHVXV79 : Predicate<"HST->useHVXV79Ops()">, AssemblerPredicate<(all_of ExtensionHVXV79)>; +def UseHVXV81 : Predicate<"HST->useHVXV81Ops()">, + AssemblerPredicate<(all_of ExtensionHVXV81)>; def UseAudio : Predicate<"HST->useAudioOps()">, AssemblerPredicate<(all_of ExtensionAudio)>; def UseZReg : Predicate<"HST->useZRegOps()">, @@ -488,6 +496,11 @@ def : Proc<"hexagonv79", HexagonModelV79, ArchV68, ArchV69, ArchV71, ArchV73, ArchV75, ArchV79, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; +def : Proc<"hexagonv81", HexagonModelV81, + [ArchV65, ArchV66, ArchV67, ArchV68, ArchV69, ArchV71, ArchV73, + ArchV75, ArchV79, ArchV81, + FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; // Need to update the correct features for tiny core.
// Disable NewValueJumps since the packetizer is unable to handle a packet with diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h index 8984534..9bf4034 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.h +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h @@ -29,7 +29,8 @@ enum class ArchEnum { V71, V73, V75, - V79 + V79, + V81 }; inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) { @@ -50,6 +51,7 @@ inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) { .Case("hexagonv73", Hexagon::ArchEnum::V73) .Case("hexagonv75", Hexagon::ArchEnum::V75) .Case("hexagonv79", Hexagon::ArchEnum::V79) + .Case("hexagonv81", Hexagon::ArchEnum::V81) .Default(std::nullopt); } } // namespace Hexagon diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.td b/llvm/lib/Target/Hexagon/HexagonDepArch.td index 8ec1d93..f623fd0 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.td +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.td @@ -34,3 +34,5 @@ def ArchV75: SubtargetFeature<"v75", "HexagonArchVersion", "Hexagon::ArchEnum::V def HasV75 : Predicate<"HST->hasV75Ops()">, AssemblerPredicate<(all_of ArchV75)>; def ArchV79: SubtargetFeature<"v79", "HexagonArchVersion", "Hexagon::ArchEnum::V79", "Enable Hexagon V79 architecture">; def HasV79 : Predicate<"HST->hasV79Ops()">, AssemblerPredicate<(all_of ArchV79)>; +def ArchV81: SubtargetFeature<"v81", "HexagonArchVersion", "Hexagon::ArchEnum::V81", "Enable Hexagon V81 architecture">; +def HasV81 : Predicate<"HST->hasV81Ops()">, AssemblerPredicate<(all_of ArchV81)>; diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td index 93696e0..f4e36fa7 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -7222,3 +7222,595 @@ class DepHVXItinV79 { [Hex_FWD, Hex_FWD, HVX_FWD]> ]; } + +class DepHVXItinV81 { + list<InstrItinData> DepHVXItinV81_list = [ + InstrItinData <tc_0390c1ca, /*SLOT01,LOAD,VA,VX_DV*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, 
+ InstrStage<1, [CVI_ALL]>], [], + []>, + + InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], + [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], + [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_227864f7, /*SLOT0,STORE,VA,VX_DV*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, + InstrStage<1, [CVI_MPY01]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_37820f4c, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], + [HVX_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData 
<tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_531b383c, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_540c3da3, /*SLOT0,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], + [Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_56e64202, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + 
[HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_649072c2, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_7095ecba, /*SLOT01,LOAD,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_7177e272, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + [HVX_FWD]>, + + InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_72e2b393, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [3, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_7d68d5c2, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_8772086c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/ 
+ [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/ + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_946013d8, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_9a1cab75, /*SLOT01,LOAD,VA,VX_DV*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9aff7a2a, /*SLOT0,STORE,VA,VX_DV*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, + InstrStage<1, [CVI_MPY01]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_a69eeee1, /*SLOT01,LOAD,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ac4046bc, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, 
CVI_XLSHF]>], [9, 2, 7, 7], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_b091f1c6, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c127de3a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_c4edf264, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_dcca380f, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_e2fdd6e6, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData 
<tc_e35c1e93, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [3], + [HVX_FWD]>, + + InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e699ae41, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]> + ]; +}
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td index 7a1ad3e..48b665c 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td @@ -13740,3 +13740,891 @@ class DepScalarItinV79 { [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]> ]; } + +class DepScalarItinV81 { + list<InstrItinData> DepScalarItinV81_list = [ + InstrItinData <tc_011e0e9d, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_01d44cb2, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_01e1be3b, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_02fe1c65, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0655b949, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_075c8dd8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0a195f2c, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0a43be35, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_0a6c20ae, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0ba0d5da, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_0dfac0a7, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0fac1eb8, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_112d30d6, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_1242dc2a, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_1248597c, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_139ef484, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_14ab4f41, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_151bf368, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_158aa3f7, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_197dce51, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1981450d, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3], + [Hex_FWD]>, + + InstrItinData <tc_1c2c7a4a, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1c7522a8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1d41f8b7, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1fcb8495, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1fe4ab69, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_20131976, 
/*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2237d952, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_23708a21, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], + []>, + + InstrItinData <tc_2471c1c8, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_24e109c7, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_24f426ab, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_27106296, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_280f7fe1, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_28e55c6f, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2c13e7f5, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2c3e17fc, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_2f573607, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_33e7e673, /*tc_2early*/ + [InstrStage<1, [SLOT2]>], [], + []>, + + InstrItinData <tc_362b0be2, /*tc_3*/ + [InstrStage<1, [SLOT2]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_38382228, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_388f9897, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_38e0bae9, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3d14a17b, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3edca78f, /*tc_2*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3fbf1042, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3], + [Hex_FWD]>, + + InstrItinData <tc_407e96f9, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_40d64c94, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4222e6bf, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_42ff66ba, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_442395f3, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_449acf79, /*tc_latepredstaia*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_44d5a428, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_44fffc58, /*tc_3*/ + [InstrStage<1, [SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_45791fb8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_45f9d1be, /*tc_2early*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_46c18ecf, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_49fdfd4b, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [4, 
1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4a55d03c, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4abdbdc6, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4ac61d92, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4bf903b0, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3], + [Hex_FWD]>, + + InstrItinData <tc_503ce0f3, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_512b1653, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_53c851ab, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_54f0cee2, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_5502c366, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55255f2b, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [], + []>, + + InstrItinData <tc_556f6577, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55a9a350, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55b33fda, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_56a124a7, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_57a55b54, /*tc_1*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5944960d, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_59a7822c, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5a222e89, /*tc_2early*/ + [InstrStage<1, [SLOT2]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5a4b5e58, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5b347363, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5ceb2f9e, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5da50c4b, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5deb5e47, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5e4cf0e8, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_60e324ff, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_63567288, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4], + [Hex_FWD]>, + + InstrItinData <tc_64b00d8a, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_651cbe02, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_65279839, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_65cbd974, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2], + [Hex_FWD, Hex_FWD, 
Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_69bfb303, /*tc_3*/ + [InstrStage<1, [SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6aa823ab, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6ae3426b, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6d861a95, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6e20402a, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6f42bc60, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6fb52018, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6fc5dbea, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_711c805f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_713b66bf, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7401744f, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7476d766, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_74a42bda, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_759e57be, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_76bb5435, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7d6a2568, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_77f94a5e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_788b1d09, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_78f87ed3, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_7af3a37e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7b9187d3, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7c28bd7e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3], + [Hex_FWD]>, + + InstrItinData <tc_7c31e19a, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7c6d32e4, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7dc63b5c, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7f58404a, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [], + []>, + + InstrItinData <tc_7f7f45f5, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7f8ae742, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8035e91f, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_822c3c68, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_829d8a86, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_838c4d7a, 
/*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_84a7500d, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_86173609, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_887d1bb7, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8a6d0d94, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8a825db2, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8b5bd4f5, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8e82e8ca, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8f36a2fd, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9124c04f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_92240447, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_934753bb, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_937dd41c, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [], + []>, + + InstrItinData <tc_9406230a, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_95a33176, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_95f43c5e, /*tc_3*/ + [InstrStage<1, [SLOT2]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_96ef76ef, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_975a4e54, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9783714b, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9b20a062, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9b34f5e0, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [], + []>, + + InstrItinData <tc_9b3c0462, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9bcfb2ee, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9c52f549, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9e27f2f9, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9e72dc89, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9edb7c77, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9edefe01, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9f6cd987, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a08b630b, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, 
Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a1297125, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a154b476, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a2b365d2, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a3070909, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a32e03e7, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a38c45dc, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a4e22bbd, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a4ee89db, /*tc_2early*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_a724463d, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a7a13fac, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a7bdb22c, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a9edeffa, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_abfd9a6d, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ac65613f, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_addc37a8, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ae5babd7, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_aee6250c, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_af6af259, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b1ae5f67, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_b2196a3f, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b3d46584, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_b4dc7630, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b7c4062a, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b837298f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], + []>, + + InstrItinData <tc_b9bec29e, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [], + []>, + + InstrItinData <tc_ba9255a6, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bb07f2c5, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bb78483e, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bb831a7c, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bf2ffc0f, /*tc_ld*/ + [InstrStage<1, [SLOT0, 
SLOT1]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c20701f0, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c21d7447, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c57d9f39, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c818ff7f, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_ce59038e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_cfa0e29b, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d03278fd, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d234b61a, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_d33e5eee, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d3632d88, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d45ba9cd, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_d57d649c, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_d61dfdc3, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d68dca5c, /*tc_3stall*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d71ea8fa, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d7718fbe, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_db596beb, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_db96aa6b, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_dc51281d, /*tc_3*/ + [InstrStage<1, [SLOT2]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_decdde8a, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_df5d53f9, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e3d699e3, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e60def48, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_e9170fb7, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ed03645c, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ed3f8d2a, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_eed07714, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_eeda4109, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ef921005, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f098b237, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f0cdeccf, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f0e8e832, /*tc_4x*/ + [InstrStage<1, [SLOT2, 
SLOT3]>], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f34c1c21, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f38f92e1, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_f529831b, /*tc_latepredstaia*/ + [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f6e2aff9, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f7569068, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f97707c1, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_f999c66e, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_fae9dfa5, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_fedb7e19, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]> + ]; +}
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td index ae96753..f8f1c2a 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -39178,6 +39178,19 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vsub_hf_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.hf,$Vv32.qf16)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011010000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vsub_qf16 : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -39269,6 +39282,19 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vsub_sf_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.sf,$Vv32.qf32)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011010000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vsub_sf_sf : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -41116,6 +41142,17 @@ let hasNewValue = 1; let opNewValue = 0; let isSolo = 1; } +def Y2_tlbpp : HInst< +(outs IntRegs:$Rd32), +(ins DoubleRegs:$Rss32), +"$Rd32 = tlbp($Rss32)", +tc_6aa823ab, TypeCR>, Enc_90cd8b, Requires<[HasV81]> { +let Inst{13-5} = 0b000000000; +let Inst{31-21} = 0b01101100011; +let hasNewValue = 1; +let opNewValue = 0; +let isSolo = 1; +} def Y2_tlbr : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 17cb96c..23f4b3a 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -3827,3 +3827,14 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2), (V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX64B]>; def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2), (V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX128B]>; + +// V81 HVX Instructions. 
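+// The plain and _128B intrinsic variants below both select the same
+// mixed-precision vsub instruction; only the HVX vector-length predicate
+// (UseHVX64B vs. UseHVX128B) differs.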
+ +def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index e285e04..7ee280d 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -654,7 +654,9 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) { IntNo == Intrinsic::hexagon_V6_vgathermh || IntNo == Intrinsic::hexagon_V6_vgathermh_128B || IntNo == Intrinsic::hexagon_V6_vgathermhw || - IntNo == Intrinsic::hexagon_V6_vgathermhw_128B) { + IntNo == Intrinsic::hexagon_V6_vgathermhw_128B || + IntNo == Intrinsic::hexagon_V6_vgather_vscattermh || + IntNo == Intrinsic::hexagon_V6_vgather_vscattermh_128B) { SelectV65Gather(N); return; } diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index c7a4f68..3cc146b 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -2953,6 +2953,10 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { case Intrinsic::hexagon_V6_vgathermhw_128B: Opcode = Hexagon::V6_vgathermhw_pseudo; break; + case Intrinsic::hexagon_V6_vgather_vscattermh: + case Intrinsic::hexagon_V6_vgather_vscattermh_128B: + Opcode = Hexagon::V6_vgather_vscatter_mh_pseudo; + break; } SDVTList VTs = CurDAG->getVTList(MVT::Other); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 9f7f434..526b4de 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2145,7 +2145,9 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::hexagon_V6_vgathermhq: case Intrinsic::hexagon_V6_vgathermhq_128B: case Intrinsic::hexagon_V6_vgathermhwq: - case Intrinsic::hexagon_V6_vgathermhwq_128B: { + case Intrinsic::hexagon_V6_vgathermhwq_128B: + case Intrinsic::hexagon_V6_vgather_vscattermh: + case Intrinsic::hexagon_V6_vgather_vscattermh_128B: { const Module &M = *I.getParent()->getParent()->getParent(); Info.opc = ISD::INTRINSIC_W_CHAIN; Type *VecTy = I.getArgOperand(1)->getType(); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 939841a..47726d6 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1554,80 +1554,93 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const { MachineBasicBlock::iterator First; switch (Opc) { - case Hexagon::V6_vgathermh_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - 
- case Hexagon::V6_vgathermw_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermhw_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermhq_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)) - .add(MI.getOperand(5)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermwq_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)) - .add(MI.getOperand(5)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); - - case Hexagon::V6_vgathermhwq_pseudo: - First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(4)) - .add(MI.getOperand(5)); - BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) - .add(MI.getOperand(0)) - .addImm(MI.getOperand(1).getImm()) - .addReg(Hexagon::VTMP); - MBB.erase(MI); - return First.getInstrIterator(); + case Hexagon::V6_vgather_vscatter_mh_pseudo: + // This is mainly a place holder. It will be extended. 
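+ // The expansion below issues a V6_vgathermh using the pseudo's address
+ // operands, then a V6_vscattermh that writes the gathered VTMP result
+ // back through the same operands.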
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vscattermh)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + case Hexagon::V6_vgathermh_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermw_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermhw_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermhq_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermwq_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); + + case Hexagon::V6_vgathermhwq_pseudo: + First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)); + BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) + .add(MI.getOperand(0)) + .addImm(MI.getOperand(1).getImm()) + .addReg(Hexagon::VTMP); + MBB.erase(MI); + return First.getInstrIterator(); } return MI.getIterator(); @@ -2806,6 +2819,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vL32b_nt_tmp_npred_ai: case Hexagon::V6_vS32Ub_npred_ai: case Hexagon::V6_vgathermh_pseudo: + case Hexagon::V6_vgather_vscatter_mh_pseudo: case Hexagon::V6_vgathermw_pseudo: case Hexagon::V6_vgathermhw_pseudo: case Hexagon::V6_vgathermhq_pseudo: diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td index f927f9b..42393d0 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td @@ -40,6 +40,19 @@ defm V6_vgathermh_pseudo : vgathermh<HvxVR>; defm V6_vgathermw_pseudo : vgathermw<HvxVR>; defm V6_vgathermhw_pseudo : vgathermhw<HvxWR>; + +multiclass vgather_scatter_mh<RegisterClass RC> { + let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, + mayStore = 1, addrMode = BaseImmOffset, 
accessSize = HalfWordAccess in + def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), + (ins IntRegs:$_dst_, s4_0Imm:$Ii, + IntRegs:$Rt, ModRegs:$Mu, RC:$Vv), + ".error \"should not emit\" ", + []>; +} + +defm V6_vgather_vscatter_mh_pseudo : vgather_scatter_mh<HvxVR>; + multiclass vgathermhq<RegisterClass RC1, RegisterClass RC2> { let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in diff --git a/llvm/lib/Target/Hexagon/HexagonSchedule.td b/llvm/lib/Target/Hexagon/HexagonSchedule.td index b8a9cf3..9bcd4bf 100644 --- a/llvm/lib/Target/Hexagon/HexagonSchedule.td +++ b/llvm/lib/Target/Hexagon/HexagonSchedule.td @@ -75,3 +75,4 @@ include "HexagonScheduleV71T.td" include "HexagonScheduleV73.td" include "HexagonScheduleV75.td" include "HexagonScheduleV79.td" +include "HexagonScheduleV81.td"
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV81.td b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td new file mode 100644 index 0000000..dd5f5a0 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td @@ -0,0 +1,31 @@ +//=-HexagonScheduleV81.td - HexagonV81 Scheduling Definitions *- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def HexagonV81ItinList : DepScalarItinV81, ScalarItin, + DepHVXItinV81, HVXItin, PseudoItin { + list<InstrItinData> ItinList = + !listconcat(DepScalarItinV81_list, ScalarItin_list, + DepHVXItinV81_list, HVXItin_list, PseudoItin_list); +} + +def HexagonItinerariesV81 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, + CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, + CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, + CVI_ALL_NOMEM, CVI_ZW], + [Hex_FWD, HVX_FWD], + HexagonV81ItinList.ItinList>; + +def HexagonModelV81 : SchedMachineModel { + // Max issue per cycle == bundle width. + let IssueWidth = 4; + let Itineraries = HexagonItinerariesV81; + let LoadLatency = 1; + let CompleteModel = 0; +} diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 7430567..995f66d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -224,6 +224,15 @@ public: bool useHVXV79Ops() const { return HexagonHVXVersion >= Hexagon::ArchEnum::V79; } + bool hasV81Ops() const { + return getHexagonArchVersion() >= Hexagon::ArchEnum::V81; + } + bool hasV81OpsOnly() const { + return getHexagonArchVersion() == Hexagon::ArchEnum::V81; + } + bool useHVXV81Ops() const { + return HexagonHVXVersion >= Hexagon::ArchEnum::V81; + } bool useAudioOps() const { return UseAudioOps; } bool useCompound() const { return UseCompound; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 171e294..e925e04 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -31,6 +31,10 @@ using namespace llvm; static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false), cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); +cl::opt<bool> HexagonAllowScatterGatherHVX( + "hexagon-allow-scatter-gather-hvx", cl::init(false), cl::Hidden, + cl::desc("Allow auto-generation of HVX scatter-gather")); + static cl::opt<bool> EnableV68FloatAutoHVX( "force-hvx-float", cl::Hidden, cl::desc("Enable auto-vectorization of floatint point types on v68.")); @@ -354,6 +358,61 @@ bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/, return HexagonMaskedVMem && ST.isTypeForHVX(DataType); } +bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const { + // For now assume we can not deal with all HVX datatypes. + if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) || + !HexagonAllowScatterGatherHVX) + return false; + // This must be in sync with HexagonVectorCombine pass. 
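+ // Shapes accepted below: 128 x i8; 64 or 32 x i16 with alignment >= 2;
+ // 32 x i32 with alignment >= 4. Everything else is rejected and later
+ // scalarized via forceScalarizeMaskedGather.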
+ switch (Ty->getScalarSizeInBits()) { + case 8: + return (getTypeNumElements(Ty) == 128); + case 16: + if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32) + return (Alignment >= 2); + break; + case 32: + if (getTypeNumElements(Ty) == 32) + return (Alignment >= 4); + break; + default: + break; + } + return false; +} + +bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const { + if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) || + !HexagonAllowScatterGatherHVX) + return false; + // This must be in sync with HexagonVectorCombine pass. + switch (Ty->getScalarSizeInBits()) { + case 8: + return (getTypeNumElements(Ty) == 128); + case 16: + if (getTypeNumElements(Ty) == 64) + return (Alignment >= 2); + break; + case 32: + if (getTypeNumElements(Ty) == 32) + return (Alignment >= 4); + break; + default: + break; + } + return false; +} + +bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy, + Align Alignment) const { + return !isLegalMaskedGather(VTy, Alignment); +} + +bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy, + Align Alignment) const { + return !isLegalMaskedScatter(VTy, Alignment); +} + /// --- Vector TTI end --- unsigned HexagonTTIImpl::getPrefetchDistance() const { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index dbf16c9..cec2bf9 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -169,6 +169,12 @@ public: unsigned AddressSpace) const override; bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const override; + bool isLegalMaskedGather(Type *Ty, Align Alignment) const override; + bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override; + bool forceScalarizeMaskedGather(VectorType *VTy, + Align Alignment) const override; + bool forceScalarizeMaskedScatter(VectorType *VTy, + Align Alignment) const override; /// @} diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 9ab5202..5c50ec2 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -57,6 +57,11 @@ #define DEBUG_TYPE "hexagon-vc" +// This is a const that represents default HVX VTCM page size. 
+// It is boot time configurable, so we probably want an API to +// read it, but for now assume 128KB +#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072 + using namespace llvm; namespace { @@ -418,6 +423,18 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) { class HvxIdioms { public: + enum DstQualifier { + Undefined = 0, + Arithmetic, + LdSt, + LLVM_Gather, + LLVM_Scatter, + HEX_Gather_Scatter, + HEX_Gather, + HEX_Scatter, + Call + }; + HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) { auto *Int32Ty = HVC.getIntTy(32); HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false); @@ -473,6 +490,11 @@ private: auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX, Signedness SgnX, ArrayRef<Value *> WordY, Signedness SgnY) const -> SmallVector<Value *>; + // Vector manipulations for Ripple + bool matchScatter(Instruction &In) const; + bool matchGather(Instruction &In) const; + Value *processVScatter(Instruction &In) const; + Value *processVGather(Instruction &In) const; VectorType *HvxI32Ty; VectorType *HvxP32Ty; @@ -1545,7 +1567,7 @@ auto AlignVectors::isSectorTy(Type *Ty) const -> bool { } auto AlignVectors::run() -> bool { - LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors on " << HVC.F.getName() + LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName() << '\n'); if (!createAddressGroups()) return false; @@ -1797,6 +1819,846 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const return Ext; } +inline bool HvxIdioms::matchScatter(Instruction &In) const { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In); + if (!II) + return false; + return (II->getIntrinsicID() == Intrinsic::masked_scatter); +} + +inline bool HvxIdioms::matchGather(Instruction &In) const { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In); + if (!II) + return false; + return (II->getIntrinsicID() == Intrinsic::masked_gather); +} + +Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual); + +// Binary instructions we want to handle as users of gather/scatter. +inline bool isArithmetic(unsigned Opc) { + switch (Opc) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::Shl: + case Instruction::UDiv: + return true; + } + return false; +} + +// TODO: Maybe use MemoryLocation for this. See getLocOrNone above. 
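+// For reference, this summarizes the cases handled below: an alloca, argument +// or global is returned as-is, a load/store yields its pointer operand, and a +// masked_store yields its pointer argument (operand 1); anything else gives +// nullptr.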
+inline Value *getPointer(Value *Ptr) { + assert(Ptr && "Unable to extract pointer"); + if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) + return Ptr; + if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr)) + return getLoadStorePointerOperand(Ptr); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) { + if (II->getIntrinsicID() == Intrinsic::masked_store) + return II->getOperand(1); + } + return nullptr; +} + +static Instruction *selectDestination(Instruction *In, + HvxIdioms::DstQualifier &Qual) { + Instruction *Destination = nullptr; + if (!In) + return Destination; + if (isa<StoreInst>(In)) { + Destination = In; + Qual = HvxIdioms::LdSt; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) { + if (II->getIntrinsicID() == Intrinsic::masked_gather) { + Destination = In; + Qual = HvxIdioms::LLVM_Gather; + } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) { + Destination = In; + Qual = HvxIdioms::LLVM_Scatter; + } else if (II->getIntrinsicID() == Intrinsic::masked_store) { + Destination = In; + Qual = HvxIdioms::LdSt; + } else if (II->getIntrinsicID() == + Intrinsic::hexagon_V6_vgather_vscattermh) { + Destination = In; + Qual = HvxIdioms::HEX_Gather_Scatter; + } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) { + Destination = In; + Qual = HvxIdioms::HEX_Scatter; + } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) { + Destination = In; + Qual = HvxIdioms::HEX_Gather; + } + } else if (isa<ZExtInst>(In)) { + return locateDestination(In, Qual); + } else if (isa<CastInst>(In)) { + return locateDestination(In, Qual); + } else if (isa<CallInst>(In)) { + Destination = In; + Qual = HvxIdioms::Call; + } else if (isa<GetElementPtrInst>(In)) { + return locateDestination(In, Qual); + } else if (isArithmetic(In->getOpcode())) { + Destination = In; + Qual = HvxIdioms::Arithmetic; + } else { + LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n"); + } + return Destination; +} + +// This method attempts to find the destination (user) of a given intrinsic. +// Given that these are produced only by Ripple, the number of options is +// limited. The simplest case is an explicit store, which is in fact redundant +// (since the HVX gather creates its own store during packetization). +// Nevertheless we need to figure out the address we are storing to. Other +// cases are more complicated, but still few. +Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) { + Instruction *Destination = nullptr; + if (!In) + return Destination; + // Get all possible destinations + SmallVector<Instruction *> Users; + // Iterate over the uses of the instruction + for (auto &U : In->uses()) { + if (auto *UI = dyn_cast<Instruction>(U.getUser())) { + Destination = selectDestination(UI, Qual); + if (Destination) + Users.push_back(Destination); + } + } + // Now see which of the users (if any) is a memory destination. + for (auto *I : Users) + if (getPointer(I)) + return I; + return Destination; +} + +// The two intrinsics we handle here have the GEP in different positions.
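+// For illustration only (the <64 x i16> shape here is an arbitrary example): +//   <64 x i16> @llvm.masked.gather.v64i16.v64p0(<64 x ptr> %ptrs, i32 %align, <64 x i1> %mask, <64 x i16> %thru) +//   void @llvm.masked.scatter.v64i16.v64p0(<64 x i16> %val, <64 x ptr> %ptrs, i32 %align, <64 x i1> %mask) +// i.e. the vector of pointers (the Ripple-formed GEP) is operand 0 of a gather +// and operand 1 of a scatter, which is what the code below relies on.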
+inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) { + assert(In && "Bad instruction"); + IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In); + assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather || + IIn->getIntrinsicID() == Intrinsic::masked_scatter)) && + "Not a gather/scatter intrinsic"); + GetElementPtrInst *GEPIndex = nullptr; + if (IIn->getIntrinsicID() == Intrinsic::masked_gather) + GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0)); + else + GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1)); + return GEPIndex; +} + +// Given the intrinsic, find its GEP argument and extract the base address it +// uses. The method relies on the way Ripple typically forms the GEP for +// scatter/gather. +static Value *locateAddressFromIntrinsic(Instruction *In) { + GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In); + if (!GEPIndex) { + LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n"); + return nullptr; + } + Value *BaseAddress = GEPIndex->getPointerOperand(); + auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress); + if (IndexLoad) + return IndexLoad; + + auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress); + if (IndexZEx) { + IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0)); + if (IndexLoad) + return IndexLoad; + IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0)); + if (II && II->getIntrinsicID() == Intrinsic::masked_gather) + return locateAddressFromIntrinsic(II); + } + auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress); + if (BaseShuffle) { + IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0)); + if (IndexLoad) + return IndexLoad; + auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0)); + if (IE) { + auto *Src = IE->getOperand(1); + IndexLoad = dyn_cast<LoadInst>(Src); + if (IndexLoad) + return IndexLoad; + auto *Alloca = dyn_cast<AllocaInst>(Src); + if (Alloca) + return Alloca; + if (isa<Argument>(Src)) { + return Src; + } + if (isa<GlobalValue>(Src)) { + return Src; + } + } + } + LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n"); + return nullptr; +} + +static Type *getIndexType(Value *In) { + if (!In) + return nullptr; + + if (isa<LoadInst>(In) || isa<StoreInst>(In)) + return getLoadStoreType(In); + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) { + if (II->getIntrinsicID() == Intrinsic::masked_load) + return II->getType(); + if (II->getIntrinsicID() == Intrinsic::masked_store) + return II->getOperand(0)->getType(); + } + return In->getType(); +} + +static Value *locateIndexesFromGEP(Value *In) { + if (!In) + return nullptr; + if (isa<LoadInst>(In)) + return In; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) { + if (II->getIntrinsicID() == Intrinsic::masked_load) + return In; + if (II->getIntrinsicID() == Intrinsic::masked_gather) + return In; + } + if (auto *IndexZEx = dyn_cast<ZExtInst>(In)) + return locateIndexesFromGEP(IndexZEx->getOperand(0)); + if (auto *IndexSEx = dyn_cast<SExtInst>(In)) + return locateIndexesFromGEP(IndexSEx->getOperand(0)); + if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In)) + return locateIndexesFromGEP(BaseShuffle->getOperand(0)); + if (auto *IE = dyn_cast<InsertElementInst>(In)) + return locateIndexesFromGEP(IE->getOperand(1)); + if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In)) + return cstDataVector; + if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In)) + return GEPIndex->getOperand(0); + return nullptr; +} + +// Given the intrinsic, find its GEP argument and extract the offsets from the +// base address it uses.
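+// For illustration, Ripple typically forms something like (shapes assumed): +//   %offs = load <64 x i16>, ptr %offset_buf +//   %wide = zext <64 x i16> %offs to <64 x i32> +//   %ptrs = getelementptr i16, ptr %base, <64 x i32> %wide +// so operand 0 of the GEP is the common base pointer and operand 1 carries the +// per-lane offsets, possibly hidden behind zext/sext/shufflevector/insertelement.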
+static Value *locateIndexesFromIntrinsic(Instruction *In) { + GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In); + if (!GEPIndex) { + LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n"); + return nullptr; + } + Value *Indexes = GEPIndex->getOperand(1); + if (auto *IndexLoad = locateIndexesFromGEP(Indexes)) + return IndexLoad; + + LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n"); + return nullptr; +} + +// Because of the awkward definition of many Hexagon intrinsics we often have +// to reinterpret the HVX-native <64 x i16> as <32 x i32>, which in practice is +// a NOP for all use cases, so this only exists to make the IR builder happy. +inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC, + IRBuilderBase &Builder, + LLVMContext &Ctx, Value *I) { + assert(I && "Unable to reinterpret cast"); + Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); + std::vector<unsigned> shuffleMask; + for (unsigned i = 0; i < 64; ++i) + shuffleMask.push_back(i); + Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask); + Value *CastShuffle = + Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle"); + return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32"); +} + +// Recast <128 x i8> as <32 x i32> +inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC, + IRBuilderBase &Builder, + LLVMContext &Ctx, Value *I) { + assert(I && "Unable to reinterpret cast"); + Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); + std::vector<unsigned> shuffleMask; + for (unsigned i = 0; i < 128; ++i) + shuffleMask.push_back(i); + Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask); + Value *CastShuffle = + Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle"); + return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32"); +} + +// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern +inline Value *get_i32_Mask(const HexagonVectorCombine &HVC, + IRBuilderBase &Builder, LLVMContext &Ctx, + unsigned int pattern) { + std::vector<unsigned int> byteMask; + for (unsigned i = 0; i < 32; ++i) + byteMask.push_back(pattern); + + return Builder.CreateIntrinsic( + HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt), + {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)}, + nullptr); +} + +Value *HvxIdioms::processVScatter(Instruction &In) const { + auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType()); + assert(InpTy && "Cannot handle non-vector type for llvm.scatter/gather"); + unsigned InpSize = HVC.getSizeOf(InpTy); + auto *F = In.getFunction(); + LLVMContext &Ctx = F->getContext(); + auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType()); + assert(ElemTy && "llvm.scatter needs integer type argument"); + unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy); + LLVM_DEBUG({ + unsigned Elements = HVC.length(InpTy); + dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n"; + dbgs() << " Input type(" << *InpTy << ") elements(" << Elements + << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth(" + << ElemWidth << ")\n"; + }); + + IRBuilder Builder(In.getParent(), In.getIterator(), + InstSimplifyFolder(HVC.DL)); + + auto *ValueToScatter = In.getOperand(0); + LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n"); + + if (HVC.HST.getVectorLength() != InpSize) { + LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize + << ") for vscatter\n"); + return nullptr; + } + + // Base address of indexes.
+ auto *IndexLoad = locateAddressFromIntrinsic(&In); + if (!IndexLoad) + return nullptr; + LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n"); + + // Address of destination. Must be in VTCM. + auto *Ptr = getPointer(IndexLoad); + if (!Ptr) + return nullptr; + LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n"); + // Indexes/offsets + auto *Indexes = locateIndexesFromIntrinsic(&In); + if (!Indexes) + return nullptr; + LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n"); + Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx), + "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n"); + // Adjust Indexes + auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); + Value *CastIndex = nullptr; + if (cstDataVector) { + // Our indexes are represented as a constant. We need it in a reg. + AllocaInst *IndexesAlloca = + Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false)); + [[maybe_unused]] auto *StoreIndexes = + Builder.CreateStore(cstDataVector, IndexesAlloca); + LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n"); + CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(), + IndexesAlloca, "reload_index"); + } else { + if (ElemWidth == 2) + CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); + else + CastIndex = Indexes; + } + LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n"); + + if (ElemWidth == 1) { + // v128i8: there is no native instruction for this. + // Do this as two Hi/Lo scatters with masking. + Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); + // Extend indexes. We assume that indexes are in 128i8 format - need to + // expand them to Hi/Lo 64i16. + Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32"); + auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub); + auto *UnpackedIndexes = Builder.CreateIntrinsic( + HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr); + LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n"); + + auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi); + auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo); + [[maybe_unused]] Value *IndexHi = + HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes); + [[maybe_unused]] Value *IndexLo = + HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes); + LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n"); + LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n"); + // Now unpack values to scatter + Value *CastSrc = + getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter); + LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n"); + auto *UnpackedValueToScatter = Builder.CreateIntrinsic( + HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr); + LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter + << ")\n"); + + [[maybe_unused]] Value *UVSHi = + HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter); + [[maybe_unused]] Value *UVSLo = + HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter); + LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n"); + LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n"); + + // Create the mask for individual bytes + auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff); + LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n"); + [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B, + {QByteMask, CastedDst,
HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + IndexHi, UVSHi}, + nullptr); + LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n"); + return Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B, + {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + IndexLo, UVSLo}, + nullptr); + } else if (ElemWidth == 2) { + Value *CastSrc = + getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter); + LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n"); + return Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B, + {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex, + CastSrc}, + nullptr); + } else if (ElemWidth == 4) { + return Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B, + {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex, + ValueToScatter}, + nullptr); + } else { + LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n"); + return nullptr; + } +} + +Value *HvxIdioms::processVGather(Instruction &In) const { + [[maybe_unused]] auto *InpTy = + dyn_cast<VectorType>(In.getOperand(0)->getType()); + assert(InpTy && "Cannot handle no vector type for llvm.gather"); + [[maybe_unused]] auto *ElemTy = + dyn_cast<PointerType>(InpTy->getElementType()); + assert(ElemTy && "llvm.gather needs vector of ptr argument"); + auto *F = In.getFunction(); + LLVMContext &Ctx = F->getContext(); + LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n" + << *In.getParent() << "\n"); + LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements(" + << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy) + << ") type(" << *ElemTy << ") Access alignment(" + << *In.getOperand(1) << ") AddressSpace(" + << ElemTy->getAddressSpace() << ")\n"); + + // TODO: Handle masking of elements. + assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) && + "llvm.gather needs vector for mask"); + IRBuilder Builder(In.getParent(), In.getIterator(), + InstSimplifyFolder(HVC.DL)); + + // See who is using the result. The difference between LLVM and HVX vgather + // Intrinsic makes it impossible to handle all cases with temp storage. Alloca + // in VTCM is not yet supported, so for now we just bail out for those cases. + HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined; + Instruction *Dst = locateDestination(&In, Qual); + if (!Dst) { + LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual + << ")\n"); + + // Address of destination. Must be in VTCM. + auto *Ptr = getPointer(Dst); + if (!Ptr) { + LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n"); + return nullptr; + } + + // Result type. Assume it is a vector type. + auto *DstType = cast<VectorType>(getIndexType(Dst)); + assert(DstType && "Cannot handle non vector dst type for llvm.gather"); + + // Base address for sources to be loaded + auto *IndexLoad = locateAddressFromIntrinsic(&In); + if (!IndexLoad) + return nullptr; + LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n"); + + // Gather indexes/offsets + auto *Indexes = locateIndexesFromIntrinsic(&In); + if (!Indexes) + return nullptr; + LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n"); + + Instruction *Gather = nullptr; + Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); + if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) { + // We fully assume the address space is in VTCM. 
We also assume that all + // pointers in Operand(0) have the same base(!). + // This is the most basic case of all the above. + unsigned OutputSize = HVC.getSizeOf(DstType); + auto *DstElemTy = cast<IntegerType>(DstType->getElementType()); + unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy); + LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType() + << " Address space (" + << Ptr->getType()->getPointerAddressSpace() << ")\n" + << " Result type : " << *DstType + << "\n Size in bytes : " << OutputSize + << " element type(" << *DstElemTy + << ")\n ElemWidth : " << ElemWidth << " bytes\n"); + + auto *IndexType = cast<VectorType>(getIndexType(Indexes)); + assert(IndexType && "Cannot handle non vector index type for llvm.gather"); + unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType()); + LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n"); + + // Intrinsic takes i32 instead of pointer so cast. + Value *CastedPtr = Builder.CreateBitOrPointerCast( + IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...] + // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty] + // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty] + // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty] + // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty] + // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty] + // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty] + if (HVC.HST.getVectorLength() == OutputSize) { + if (ElemWidth == 1) { + // v128i8 There is no native instruction for this. + // Do this as two Hi/Lo gathers with masking. + // Unpack indexes. We assume that indexes are in 128i8 format - need to + // expand them to Hi/Lo 64i16 + Value *CastIndexes = + Builder.CreateBitCast(Indexes, NT, "cast_to_32i32"); + auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub); + auto *UnpackedIndexes = + Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true), + V6_vunpack, CastIndexes, nullptr); + LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes + << ")\n"); + + auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi); + auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo); + [[maybe_unused]] Value *IndexHi = + HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes); + [[maybe_unused]] Value *IndexLo = + HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes); + LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n"); + LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n"); + // Create the mask for individual bytes + auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff); + LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n"); + // We use our destination allocation as a temp storage + // This is unlikely to work properly for masked gather. + auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq); + [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), V6_vgather, + {Ptr, QByteMask, CastedPtr, + HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi}, + nullptr); + LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n"); + // Rematerialize the result + [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi"); + LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n"); + // Same for the low part. Here we use Gather to return non-NULL result + // from this function and continue to iterate. We also are deleting Dst + // store below. 
+ Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), V6_vgather, + {Ptr, QByteMask, CastedPtr, + HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo}, + nullptr); + LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n"); + Value *LoadedResultLo = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo"); + LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n"); + // Now we have properly sized bytes in every other position + // B b A a c a A b B c f F g G h H is presented as + // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H + // Use vpack to gather them + auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb); + [[maybe_unused]] auto Res = Builder.CreateIntrinsic( + NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr); + LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n"); + [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr); + LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n"); + } else if (ElemWidth == 2) { + // v32i16 + if (IndexWidth == 2) { + // Reinterpret 64i16 as 32i32. Only needed for a syntactic IR match. + Value *CastIndex = + getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); + LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n"); + // Shift all i16 left by 1 to match short addressing mode instead of + // byte. + auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); + Value *AdjustedIndex = HVC.createHvxIntrinsic( + Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)}); + LLVM_DEBUG(dbgs() + << " Shifted half index: " << *AdjustedIndex << ")\n"); + + auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh); + // The 3rd argument is the size of the region to gather from. We probably + // want to set it to the max VTCM size. + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), V6_vgather, + {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + AdjustedIndex}, + nullptr); + LLVM_DEBUG({ + for (auto &U : Dst->uses()) { + if (auto *UI = dyn_cast<Instruction>(U.getUser())) + dbgs() << " dst used by: " << *UI << "\n"; + } + for (auto &U : In.uses()) { + if (auto *UI = dyn_cast<Instruction>(U.getUser())) + dbgs() << " In used by : " << *UI << "\n"; + } + }); + // Create temp load from result in case the result is used by any + // other instruction.
+ Value *LoadedResult = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result"); + LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n"); + In.replaceAllUsesWith(LoadedResult); + } else { + LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n"); + return nullptr; + } + } else if (ElemWidth == 4) { + if (IndexWidth == 4) { + // v32i32 + auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); + Value *AdjustedIndex = HVC.createHvxIntrinsic( + Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)}); + LLVM_DEBUG(dbgs() + << " Shifted word index: " << *AdjustedIndex << ")\n"); + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B, + {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + AdjustedIndex}, + nullptr); + } else { + LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n"); + return nullptr; + } + } else { + LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n"); + return nullptr; + } + } else if (HVC.HST.getVectorLength() == OutputSize * 2) { + // This is half of the reg width, duplicate low in high + LLVM_DEBUG(dbgs() << " Unhandled half of register size\n"); + return nullptr; + } else if (HVC.HST.getVectorLength() * 2 == OutputSize) { + LLVM_DEBUG(dbgs() << " Unhandled twice the register size\n"); + return nullptr; + } + // Erase the original intrinsic and the store that consumes it. + // HVX will create a pseudo for gather that is expanded to gather + store + // during packetization. + Dst->eraseFromParent(); + } else if (Qual == HvxIdioms::LLVM_Scatter) { + // Gather feeds directly into scatter. + LLVM_DEBUG({ + auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType()); + assert(DstInpTy && "Cannot handle non-vector type for llvm.scatter"); + unsigned DstInpSize = HVC.getSizeOf(DstInpTy); + unsigned DstElements = HVC.length(DstInpTy); + auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType()); + assert(DstElemTy && "llvm.scatter needs vector of ptr argument"); + dbgs() << " Gather feeds into scatter\n Values to scatter : " + << *Dst->getOperand(0) << "\n"; + dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements + << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy + << ") Access alignment(" << *Dst->getOperand(2) << ")\n"; + }); + // Address of source + auto *Src = getPointer(IndexLoad); + if (!Src) + return nullptr; + LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n"); + + if (!isa<PointerType>(Src->getType())) { + LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n"); + return nullptr; + } + + Value *CastedSrc = Builder.CreateBitOrPointerCast( + Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n"); + + auto *DstLoad = locateAddressFromIntrinsic(Dst); + if (!DstLoad) { + LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n"); + + Value *Ptr = getPointer(DstLoad); + if (!Ptr) + return nullptr; + LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n"); + Value *CastIndex = + getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad); + LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n"); + // Shift all i16 left by 1 to match short addressing mode instead of + // byte.
+ auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); + Value *AdjustedIndex = HVC.createHvxIntrinsic( + Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)}); + LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n"); + + return Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, + {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + AdjustedIndex}, + nullptr); + } else if (Qual == HvxIdioms::HEX_Gather_Scatter) { + // Gather feeds into previously inserted pseudo intrinsic. + // These could not be in the same packet, so we need to generate another + // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo + // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt, + // ModRegs:$Mu, HvxVR:$Vv) + if (isa<AllocaInst>(IndexLoad)) { + auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); + if (cstDataVector) { + // Our indexes are represented as a constant. We need THEM in a reg. + // This most likely will not work properly since alloca gives us DDR + // stack location. This will be fixed once we teach compiler about VTCM. + AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT); + [[maybe_unused]] auto *StoreIndexes = + Builder.CreateStore(cstDataVector, IndexesAlloca); + LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n"); + Value *LoadedIndex = Builder.CreateLoad( + IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index"); + AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); + LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n"); + + Value *CastedSrc = Builder.CreateBitOrPointerCast( + IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n"); + + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, + {ResultAlloca, CastedSrc, + HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex}, + nullptr); + Value *LoadedResult = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); + LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n"); + LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n"); + In.replaceAllUsesWith(LoadedResult); + } + } else { + // Address of source + auto *Src = getPointer(IndexLoad); + if (!Src) + return nullptr; + LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n"); + + Value *CastedSrc = Builder.CreateBitOrPointerCast( + Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n"); + + auto *DstLoad = locateAddressFromIntrinsic(Dst); + if (!DstLoad) + return nullptr; + LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n"); + auto *Ptr = getPointer(DstLoad); + if (!Ptr) + return nullptr; + LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n"); + + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh, + {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + Indexes}, + nullptr); + } + return Gather; + } else if (Qual == HvxIdioms::HEX_Scatter) { + // This is the case when result of a gather is used as an argument to + // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it + // ourselves. We have to create alloca, store to it, and replace all uses + // with that. 
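+ // Conceptually the rewrite is (illustrative IR only, not tied to a test): + //   %tmp = alloca <32 x i32> + //   call void @llvm.hexagon.V6.vgathermh.128B(ptr %tmp, i32 %src, i32 <page>, <32 x i32> %idx) + //   %res = load <64 x i16>, ptr %tmp   ; replaces all uses of the gather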
+ AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); + Value *CastedSrc = Builder.CreateBitOrPointerCast( + IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n"); + Value *CastIndex = + getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); + LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n"); + + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, + {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), + CastIndex}, + nullptr); + Value *LoadedResult = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); + LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n"); + In.replaceAllUsesWith(LoadedResult); + } else if (Qual == HvxIdioms::HEX_Gather) { + // Gather feeds into another gather that has already been replaced with + // hexagon_V6_vgathermh_128B. + if (isa<AllocaInst>(IndexLoad)) { + auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); + if (cstDataVector) { + // Our indexes are represented as a constant. We need it in a reg. + AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT); + + [[maybe_unused]] auto *StoreIndexes = + Builder.CreateStore(cstDataVector, IndexesAlloca); + LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n"); + Value *LoadedIndex = Builder.CreateLoad( + IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index"); + AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); + LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca + << "\n AddressSpace: " + << ResultAlloca->getAddressSpace() << "\n";); + + Value *CastedSrc = Builder.CreateBitOrPointerCast( + IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); + LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n"); + + Gather = Builder.CreateIntrinsic( + Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, + {ResultAlloca, CastedSrc, + HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex}, + nullptr); + Value *LoadedResult = Builder.CreateLoad( + HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); + LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n"); + LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n"); + In.replaceAllUsesWith(LoadedResult); + } + } + } else if (Qual == HvxIdioms::LLVM_Gather) { + // Gather feeds into another gather + errs() << " Unimplemented vgather to vgather sequence\n"; + return nullptr; + } else + llvm_unreachable("Unhandled Qual enum"); + + return Gather; +} + auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In, const FxpOp &Op) const -> Value * { assert(Op.X.Val->getType() == Op.Y.Val->getType()); @@ -2138,6 +3000,26 @@ auto HvxIdioms::run() -> bool { It = StartOver ? B.rbegin() : cast<Instruction>(New)->getReverseIterator(); Changed = true; + } else if (matchGather(*It)) { + Value *New = processVGather(*It); + if (!New) + continue; + LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n"); + // We replace the original intrinsic with a new pseudo call. + It->eraseFromParent(); + It = cast<Instruction>(New)->getReverseIterator(); + RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI); + Changed = true; + } else if (matchScatter(*It)) { + Value *New = processVScatter(*It); + if (!New) + continue; + LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n"); + // We replace the original intrinsic with a new pseudo call.
+ It->eraseFromParent(); + It = cast<Instruction>(New)->getReverseIterator(); + RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI); + Changed = true; } } } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 6455757..2f59b7c 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -186,6 +186,9 @@ static unsigned featureToArchVersion(unsigned Feature) { case Hexagon::ArchV79: case Hexagon::ExtensionHVXV79: return 79; + case Hexagon::ArchV81: + case Hexagon::ExtensionHVXV81: + return 81; } llvm_unreachable("Expected valid arch feature"); return 0; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 6b48a21..b8075bd 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -96,6 +96,8 @@ cl::opt<bool> MV75("mv75", cl::Hidden, cl::desc("Build for Hexagon V75"), cl::init(false)); cl::opt<bool> MV79("mv79", cl::Hidden, cl::desc("Build for Hexagon V79"), cl::init(false)); +cl::opt<bool> MV81("mv81", cl::Hidden, cl::desc("Build for Hexagon V81"), + cl::init(false)); } // namespace static cl::opt<Hexagon::ArchEnum> EnableHVX( @@ -111,6 +113,7 @@ static cl::opt<Hexagon::ArchEnum> EnableHVX( clEnumValN(Hexagon::ArchEnum::V73, "v73", "Build for HVX v73"), clEnumValN(Hexagon::ArchEnum::V75, "v75", "Build for HVX v75"), clEnumValN(Hexagon::ArchEnum::V79, "v79", "Build for HVX v79"), + clEnumValN(Hexagon::ArchEnum::V81, "v81", "Build for HVX v81"), // Sentinel for no value specified. clEnumValN(Hexagon::ArchEnum::Generic, "", "")), // Sentinel for flag not present. 
@@ -159,6 +162,8 @@ static StringRef HexagonGetArchVariant() { return "hexagonv75"; if (MV79) return "hexagonv79"; + if (MV81) + return "hexagonv81"; return ""; } @@ -474,6 +479,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) { case Hexagon::ArchEnum::V79: Result.push_back("+hvxv79"); break; + case Hexagon::ArchEnum::V81: + Result.push_back("+hvxv81"); + break; case Hexagon::ArchEnum::Generic: { Result.push_back(StringSwitch<StringRef>(CPU) @@ -489,7 +497,8 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) { .Case("hexagonv71t", "+hvxv71") .Case("hexagonv73", "+hvxv73") .Case("hexagonv75", "+hvxv75") - .Case("hexagonv79", "+hvxv79")); + .Case("hexagonv79", "+hvxv79") + .Case("hexagonv81", "+hvxv81")); break; } case Hexagon::ArchEnum::NoArch: @@ -538,8 +547,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { FeatureBitset FB = S; unsigned CpuArch = ArchV5; for (unsigned F : - {ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67, ArchV66, - ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) { + {ArchV81, ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67, + ArchV66, ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) { if (!FB.test(F)) continue; CpuArch = F; @@ -556,7 +565,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66, ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71, - ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79}) { + ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79, ExtensionHVXV81}) { if (!FB.test(F)) continue; HasHvxVer = true; @@ -569,6 +578,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { // HasHvxVer is false, and UseHvx is true. 
switch (CpuArch) { + case ArchV81: + FB.set(ExtensionHVXV81); + [[fallthrough]]; case ArchV79: FB.set(ExtensionHVXV79); [[fallthrough]]; @@ -668,12 +680,12 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS) { std::optional<unsigned> Hexagon_MC::getHVXVersion(const FeatureBitset &Features) { - for (auto Arch : {Hexagon::ExtensionHVXV79, Hexagon::ExtensionHVXV75, - Hexagon::ExtensionHVXV73, Hexagon::ExtensionHVXV71, - Hexagon::ExtensionHVXV69, Hexagon::ExtensionHVXV68, - Hexagon::ExtensionHVXV67, Hexagon::ExtensionHVXV66, - Hexagon::ExtensionHVXV65, Hexagon::ExtensionHVXV62, - Hexagon::ExtensionHVXV60}) + for (auto Arch : {Hexagon::ExtensionHVXV81, Hexagon::ExtensionHVXV79, + Hexagon::ExtensionHVXV75, Hexagon::ExtensionHVXV73, + Hexagon::ExtensionHVXV71, Hexagon::ExtensionHVXV69, + Hexagon::ExtensionHVXV68, Hexagon::ExtensionHVXV67, + Hexagon::ExtensionHVXV66, Hexagon::ExtensionHVXV65, + Hexagon::ExtensionHVXV62, Hexagon::ExtensionHVXV60}) if (Features.test(Arch)) return Arch; return {}; @@ -681,13 +693,13 @@ Hexagon_MC::getHVXVersion(const FeatureBitset &Features) { unsigned Hexagon_MC::getArchVersion(const FeatureBitset &Features) { for (auto Arch : - {Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73, Hexagon::ArchV71, - Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67, Hexagon::ArchV66, - Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60, Hexagon::ArchV55, - Hexagon::ArchV5}) + {Hexagon::ArchV81, Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73, + Hexagon::ArchV71, Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67, + Hexagon::ArchV66, Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60, + Hexagon::ArchV55, Hexagon::ArchV5}) if (Features.test(Arch)) return Arch; - llvm_unreachable("Expected arch v5-v79"); + llvm_unreachable("Expected arch v5-v81"); return 0; } @@ -708,7 +720,8 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) { .Case("hexagonv71t", llvm::ELF::EF_HEXAGON_MACH_V71T) .Case("hexagonv73", llvm::ELF::EF_HEXAGON_MACH_V73) .Case("hexagonv75", llvm::ELF::EF_HEXAGON_MACH_V75) - .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79); + .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79) + .Case("hexagonv81", llvm::ELF::EF_HEXAGON_MACH_V81); } llvm::ArrayRef<MCPhysReg> Hexagon_MC::GetVectRegRev() { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index aca7abd..44d1a44 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -4578,6 +4578,8 @@ def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>; def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>; def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>; +def : InstAlias<"mtpidr $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsISA3_0]>; +def : InstAlias<"mfpidr $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsISA3_0]>; foreach SPRG = 4-7 in { def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 9e6b7f0..2754d78 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1124,7 +1124,8 @@ def HasStdExtZbkbOrP "'Base P' (Packed-SIMD)">; def HasStdExtZbbOrZbkbOrP - : Predicate<"Subtarget->HasStdExtZbbOrZbkb()|| Subtarget->hasStdExtP()">, + : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || " + "Subtarget->hasStdExtP()">, AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb, 
FeatureStdExtP), "'Zbb' (Basic Bit-Manipulation) or " "'Zbkb' (Bitmanip instructions for Cryptography) or " diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 26fe9ed..1c930ac 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -318,8 +318,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); - if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() && - !Subtarget.hasVendorXqcibm() && !Subtarget.hasVendorXAndesPerf() && + if (!Subtarget.hasStdExtZbb() && !Subtarget.hasStdExtP() && + !Subtarget.hasVendorXTHeadBb() && !Subtarget.hasVendorXqcibm() && + !Subtarget.hasVendorXAndesPerf() && !(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand); @@ -392,7 +393,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); } - if (Subtarget.hasStdExtZbb() || + if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP() || (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) { setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT, Legal); @@ -403,6 +404,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); } else { setOperationAction(ISD::CTTZ, XLenVT, Expand); + // If we have CLZW but not CTZW, custom promote i32. + if (Subtarget.hasStdExtP() && Subtarget.is64Bit()) + setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); } if (!Subtarget.hasCPOPLike()) { @@ -419,13 +423,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // We need the custom lowering to make sure that the resulting sequence // for the 32bit case is efficient on 64bit targets. // Use default promotion for i32 without Zbb. - if (Subtarget.is64Bit() && Subtarget.hasStdExtZbb()) + if (Subtarget.is64Bit() && + (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP())) setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); } else { setOperationAction(ISD::CTLZ, XLenVT, Expand); } - if (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()) { + if (Subtarget.hasStdExtP() || + (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) { setOperationAction(ISD::ABS, XLenVT, Legal); } else if (Subtarget.hasShortForwardBranchOpt()) { // We can use PseudoCCSUB to implement ABS. @@ -14669,6 +14675,25 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); bool IsCTZ = N->getOpcode() == ISD::CTTZ || N->getOpcode() == ISD::CTTZ_ZERO_UNDEF; + + // Without Zbb, lower as 32 - clzw(~X & (X-1)) + if (IsCTZ && !Subtarget.hasStdExtZbb()) { + assert(Subtarget.hasStdExtP()); + + NewOp0 = DAG.getFreeze(NewOp0); + SDValue Not = DAG.getNOT(DL, NewOp0, MVT::i64); + SDValue Minus1 = DAG.getNode(ISD::SUB, DL, MVT::i64, NewOp0, + DAG.getConstant(1, DL, MVT::i64)); + SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, Not, Minus1); + SDValue CLZW = DAG.getNode(RISCVISD::CLZW, DL, MVT::i64, And); + SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i64, + DAG.getConstant(32, DL, MVT::i64), CLZW); + SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Sub, + DAG.getValueType(MVT::i32)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); + return; + } + unsigned Opc = IsCTZ ?
RISCVISD::CTZW : RISCVISD::CLZW; SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); @@ -14797,7 +14822,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, // to NEGW+MAX here requires a Freeze which breaks ComputeNumSignBits. SDValue Src = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, N->getOperand(0)); - SDValue Abs = DAG.getNode(RISCVISD::ABSW, DL, MVT::i64, Src); + SDValue Abs = DAG.getNode(RISCVISD::NEGW_MAX, DL, MVT::i64, Src); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Abs)); return; } @@ -21813,7 +21838,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( // Output is either all zero or operand 0. We can propagate sign bit count // from operand 0. return DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); - case RISCVISD::ABSW: { + case RISCVISD::NEGW_MAX: { // We expand this at isel to negw+max. The result will have 33 sign bits // if the input has at least 33 sign bits. unsigned Tmp = diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 7d8a919..cc085bb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1455,3 +1455,11 @@ let Predicates = [HasStdExtP, IsRV32] in { def PMAXU_DW : RVPPairBinaryExchanged_rr<0b1111, 0b01, "pmaxu.dw">; def PMAXU_DB : RVPPairBinaryExchanged_rr<0b1111, 0b10, "pmaxu.db">; } // Predicates = [HasStdExtP, IsRV32] + + +//===----------------------------------------------------------------------===// +// Codegen patterns +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtP] in +def : PatGpr<abs, ABS>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 4104abd..f7b4914 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -218,11 +218,13 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0, } let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in { - def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">; + def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">, + SchedUnaryMC<"WriteSF_VFExp", "ReadSF_VFExp">; } let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in { - def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">; + def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">, + SchedUnaryMC<"WriteSF_VFExpa", "ReadSF_VFExpa">; } let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector", @@ -482,11 +484,53 @@ let Predicates = [HasVendorXSfvfwmaccqqq] in { defm SF_VFWMACC_4x4x4 : VPseudoSiFiveVFWMACC; } -let Predicates = [HasVendorXSfvfnrclipxfqf] in { +let Predicates = [HasVendorXSfvfnrclipxfqf], AltFmtType = IS_NOT_ALTFMT in { defm SF_VFNRCLIP_XU_F_QF : VPseudoSiFiveVFNRCLIP; defm SF_VFNRCLIP_X_F_QF : VPseudoSiFiveVFNRCLIP; } +class VFExpSchedSEWSet<string mx, bit IsBF16, bit IsApprox> { + defvar BaseSet = SchedSEWSet<mx, isF=1>.val; + list<int> val = !if(IsBF16, !listremove(BaseSet, [32, 64]), + !if(IsApprox, BaseSet, !listremove(BaseSet, [64]))); +} +multiclass VPseudoVFExp_V<bit IsBF16 = false, bit IsApprox = false> { + defvar SchedSuffix = !if(IsApprox, "VFExpa", "VFExp"); + + foreach m = MxListF in { + defvar mx = m.MX; + foreach e = VFExpSchedSEWSet<mx, IsBF16, IsApprox>.val in { + let VLMul = m.value in { + def "_V_" # mx # "_E" # e + : 
VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix, + mx, e, forcePassthruRead=true>; + def "_V_" # mx # "_E" # e # "_MASK" + : VPseudoUnaryMask<m.vrclass, m.vrclass>, + RISCVMaskedPseudo<MaskIdx = 2>, + SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix, + mx, e, forcePassthruRead=true>; + } + } + } +} + +let Predicates = [HasVendorXSfvfbfexp16e], hasSideEffects = 0 in { + let AltFmtType = IS_ALTFMT in { + defm PseudoSF_VFEXP_ALT : VPseudoVFExp_V<IsBF16=true>; + } +} + +let Predicates = [HasVendorXSfvfexpAnyFloat], hasSideEffects = 0 in { + let AltFmtType = IS_NOT_ALTFMT in { + defm PseudoSF_VFEXP : VPseudoVFExp_V; + } +} + +let Predicates = [HasVendorXSfvfexpa], AltFmtType = IS_NOT_ALTFMT in { + defm PseudoSF_VFEXPA : VPseudoVFExp_V<IsApprox=true>; +} + // SDNode def SDT_SF_VC_V_X : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisVT<1, XLenVT>, @@ -893,3 +937,36 @@ let Predicates = [HasVendorXSfcease] in { let rs2 = 0b00101; } } + +let Predicates = [HasVendorXSfvfbfexp16e] in { + defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP_ALT", + AllBF16Vectors, + isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexp16e] in { + defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP", + [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8], + isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexp32e] in { + defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP", + [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa] in { + defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", + [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa, HasVInstructionsF16] in { + defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", + [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8], + isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa64e] in { + defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", + [VF64M1, VF64M2, VF64M4, VF64M8], isSEWAware=1>; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 62b7bcd..5429c2a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -51,7 +51,7 @@ def riscv_zip : RVSDNode<"ZIP", SDTIntUnaryOp>; def riscv_unzip : RVSDNode<"UNZIP", SDTIntUnaryOp>; // RV64IZbb absolute value for i32. Expanded to (max (negw X), X) during isel. 
-def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>; +def riscv_negw_max : RVSDNode<"NEGW_MAX", SDTIntUnaryOp>; // Scalar cryptography def riscv_clmul : RVSDNode<"CLMUL", SDTIntBinOp>; @@ -599,37 +599,43 @@ def : PatGpr<riscv_zip, ZIP_RV32, i32>; def : PatGpr<riscv_unzip, UNZIP_RV32, i32>; } // Predicates = [HasStdExtZbkb, IsRV32] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in { def : PatGpr<ctlz, CLZ>; +} + +let Predicates = [HasStdExtZbb] in { def : PatGpr<cttz, CTZ>; def : PatGpr<ctpop, CPOP>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbb, IsRV64] in { +let Predicates = [HasStdExtZbbOrP, IsRV64] in { def : PatGpr<riscv_clzw, CLZW>; +} + +let Predicates = [HasStdExtZbb, IsRV64] in { def : PatGpr<riscv_ctzw, CTZW>; def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; -def : Pat<(i64 (riscv_absw GPR:$rs1)), +def : Pat<(i64 (riscv_negw_max GPR:$rs1)), (MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>; } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in { def : Pat<(XLenVT (sext_inreg GPR:$rs1, i8)), (SEXT_B GPR:$rs1)>; def : Pat<(XLenVT (sext_inreg GPR:$rs1, i16)), (SEXT_H GPR:$rs1)>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in { def : PatGprGpr<smin, MIN>; def : PatGprGpr<smax, MAX>; def : PatGprGpr<umin, MINU>; def : PatGprGpr<umax, MAXU>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in +let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in def : PatGpr<bswap, REV8_RV32, i32>; -let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in +let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in def : PatGpr<bswap, REV8_RV64, i64>; let Predicates = [HasStdExtZbkb] in { diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 637d61fe..36a2f46 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -1588,6 +1588,10 @@ multiclass SiFive7SchedResources<int vlen, bit dualVALU, //===----------------------------------------------------------------------===// // Unsupported extensions defm : UnsupportedSchedQ; + // TODO: scheduling info of XSfvfexp* and XSfvfexpa* + // for SiFive7 will be added in follow-up patches. 
+ defm : UnsupportedSchedXSfvfexp; + defm : UnsupportedSchedXSfvfexpa; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbc; defm : UnsupportedSchedZbkb; diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index 9ab9636..64ccfd8 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -523,6 +523,8 @@ include "RISCVScheduleZvk.td" // Vendor Extensions multiclass UnsupportedSchedXsf { defm : UnsupportedSchedXsfvcp; + defm : UnsupportedSchedXSfvfexp; + defm : UnsupportedSchedXSfvfexpa; defm : UnsupportedSchedXSfvfnrclipxfqf; defm : UnsupportedSchedXSfvfwmaccqqq; defm : UnsupportedSchedXSfvqmaccdod; diff --git a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td index 99632e4..1ee6dc1 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td @@ -99,3 +99,23 @@ defm : LMULWriteRes<"WriteSF_VFWMACC_QQQ", []>; defm : LMULReadAdvance<"ReadSF_VFWMACC_QQQ", 0>; } // Unsupported = true } + +defm "" : LMULSEWSchedWritesF<"WriteSF_VFExp">; +defm "" : LMULSEWSchedReadsF<"ReadSF_VFExp">; + +multiclass UnsupportedSchedXSfvfexp { +let Unsupported = true in { +defm : LMULSEWWriteResF<"WriteSF_VFExp", []>; +defm : LMULSEWReadAdvanceF<"ReadSF_VFExp", 0>; +} // Unsupported = true +} + +defm "" : LMULSEWSchedWritesF<"WriteSF_VFExpa">; +defm "" : LMULSEWSchedReadsF<"ReadSF_VFExpa">; + +multiclass UnsupportedSchedXSfvfexpa { +let Unsupported = true in { +defm : LMULSEWWriteResF<"WriteSF_VFExpa", []>; +defm : LMULSEWReadAdvanceF<"ReadSF_VFExpa", 0>; +} // Unsupported = true +} diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 334db4b..4b4fc8f 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -187,7 +187,7 @@ public: } bool hasCLZLike() const { - return HasStdExtZbb || HasVendorXTHeadBb || + return HasStdExtZbb || HasStdExtP || HasVendorXTHeadBb || (HasVendorXCVbitmanip && !IsRV64); } bool hasCTZLike() const { @@ -197,7 +197,7 @@ public: return HasStdExtZbb || (HasVendorXCVbitmanip && !IsRV64); } bool hasREV8Like() const { - return HasStdExtZbb || HasStdExtZbkb || HasVendorXTHeadBb; + return HasStdExtZbb || HasStdExtZbkb || HasStdExtP || HasVendorXTHeadBb; } bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 62073ec..4393f6e 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4721,9 +4721,6 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { if (!(Subtarget->hasVLX() || NVT.is512BitVector())) return false; - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - auto getFoldableLogicOp = [](SDValue Op) { // Peek through single use bitcast. 
if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) @@ -4740,13 +4737,47 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { return SDValue(); }; - SDValue A, FoldableOp; - if ((FoldableOp = getFoldableLogicOp(N1))) { - A = N0; - } else if ((FoldableOp = getFoldableLogicOp(N0))) { - A = N1; - } else - return false; + SDValue N0, N1, A, FoldableOp; + + // Identify and (optionally) peel an outer NOT that wraps a pure logic tree + auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) { + if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() && + ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) { + SDValue InnerOp = Op->getOperand(0); + + if (!getFoldableLogicOp(InnerOp)) + return SDValue(); + + N0 = InnerOp.getOperand(0); + N1 = InnerOp.getOperand(1); + if ((FoldableOp = getFoldableLogicOp(N1))) { + A = N0; + return InnerOp; + } + if ((FoldableOp = getFoldableLogicOp(N0))) { + A = N1; + return InnerOp; + } + } + return SDValue(); + }; + + bool PeeledOuterNot = false; + SDNode *OriN = N; + if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) { + PeeledOuterNot = true; + N = InnerOp.getNode(); + } else { + N0 = N->getOperand(0); + N1 = N->getOperand(1); + + if ((FoldableOp = getFoldableLogicOp(N1))) + A = N0; + else if ((FoldableOp = getFoldableLogicOp(N0))) + A = N1; + else + return false; + } SDValue B = FoldableOp.getOperand(0); SDValue C = FoldableOp.getOperand(1); @@ -4798,7 +4829,10 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { case ISD::XOR: Imm ^= TernlogMagicA; break; } - return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm); + if (PeeledOuterNot) + Imm = ~Imm; + + return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm); } /// If the high bits of an 'and' operand are known zero, try setting the diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4dfc400..410f20e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57617,10 +57617,10 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, } // Fold any similar generic ADD/SUB opcodes to reuse this node. - auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) { + auto MatchGeneric = [&](unsigned Opc, SDValue N0, SDValue N1, bool Negate) { SDValue Ops[] = {N0, N1}; SDVTList VTs = DAG.getVTList(N->getValueType(0)); - if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) { + if (SDNode *GenericAddSub = DAG.getNodeIfExists(Opc, VTs, Ops)) { SDValue Op(N, 0); if (Negate) { // Bail if this is only used by a user of the x86 add/sub. @@ -57632,8 +57632,25 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, DCI.CombineTo(GenericAddSub, Op); } }; - MatchGeneric(LHS, RHS, false); - MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); + MatchGeneric(GenericOpc, LHS, RHS, false); + MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode()); + + if (auto *Const = dyn_cast<ConstantSDNode>(RHS)) { + SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT); + if (X86ISD::SUB == N->getOpcode()) { + // Fold generic add(LHS, -C) to X86ISD::SUB(LHS, C). + MatchGeneric(ISD::ADD, LHS, NegC, false); + } else { + // Negate X86ISD::ADD(LHS, C) and replace generic sub(-C, LHS). + MatchGeneric(ISD::SUB, NegC, LHS, true); + } + } else if (auto *Const = dyn_cast<ConstantSDNode>(LHS)) { + if (X86ISD::SUB == N->getOpcode()) { + SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT); + // Negate X86ISD::SUB(C, RHS) and replace generic add(RHS, -C). 
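To make the outer-NOT peeling added to tryVPTERNLOG above concrete: once the inner logic tree is matched, folding the wrapping NOT only requires inverting the 8-bit truth-table immediate. A self-contained sketch, assuming the conventional 0xf0/0xcc/0xaa truth-table encodings for operands A, B and C (MagicA/B/C below are illustrative names, not the exact constants used in the file):

#include <cstdint>
#include <cstdio>

int main() {
  // Each operand contributes one half of the 8-entry truth table.
  const uint8_t MagicA = 0xf0, MagicB = 0xcc, MagicC = 0xaa;

  // Immediate for A | (B & C): fold the inner AND, then the outer OR.
  uint8_t Imm = MagicB & MagicC; // B & C          -> 0x88
  Imm |= MagicA;                 // A | (B & C)    -> 0xf8

  // A peeled outer NOT, i.e. ~(A | (B & C)), just inverts the byte.
  uint8_t NotImm = static_cast<uint8_t>(~Imm); //  -> 0x07

  std::printf("Imm = 0x%02x, with peeled NOT = 0x%02x\n", Imm, NotImm);
  return 0;
}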
+ MatchGeneric(ISD::ADD, RHS, NegC, true); + } + } // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the // EFLAGS result doesn't change. diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e28b9c1..b7151f6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1592,7 +1592,6 @@ namespace llvm { bool useLoadStackGuardNode(const Module &M) const override; bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; - Function *getSSPStackGuardCheck(const Module &M) const override; SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 37d7772..a61bbe5 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -640,15 +640,6 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const { TargetLowering::insertSSPDeclarations(M); } -Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { - // MSVC CRT has a function to validate security cookie. - if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || - Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { - return M.getFunction("__security_check_cookie"); - } - return TargetLowering::getSSPStackGuardCheck(M); -} - Value * X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { // Android provides a fixed TLS slot for the SafeStack pointer. See the diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index edcf247..632c6a2 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -1407,7 +1407,7 @@ let isBarrier = 1, isTerminator = 1 in { let r = 0x04; } - def BREAK_N : RRRN_Inst<0x0C, (outs), (ins uimm4:$imm), + def BREAK_N : RRRN_Inst<0x0D, (outs), (ins uimm4:$imm), "break.n\t$imm", []>, Requires<[HasDensity, HasDebug]> { bits<4> imm; diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index 0fce5b9..709e5f0 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -88,6 +88,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) { case ArchKind::ARMV9_4A: case ArchKind::ARMV9_5A: case ArchKind::ARMV9_6A: + case ArchKind::ARMV9_7A: return 9; case ArchKind::INVALID: return 0; @@ -127,6 +128,7 @@ static ARM::ProfileKind getProfileKind(ARM::ArchKind AK) { case ARM::ArchKind::ARMV9_4A: case ARM::ArchKind::ARMV9_5A: case ARM::ArchKind::ARMV9_6A: + case ARM::ArchKind::ARMV9_7A: return ARM::ProfileKind::A; case ARM::ArchKind::ARMV4: case ARM::ArchKind::ARMV4T: diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp index f6cea85..15ba1eb 100644 --- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp +++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp @@ -46,6 +46,7 @@ StringRef ARM::getArchSynonym(StringRef Arch) { .Case("v9.4a", "v9.4-a") .Case("v9.5a", "v9.5-a") .Case("v9.6a", "v9.6-a") + .Case("v9.7a", "v9.7-a") .Case("v8m.base", "v8-m.base") .Case("v8m.main", "v8-m.main") .Case("v8.1m.main", "v8.1-m.main") diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 1068ce4..11ba9ee 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -937,6 +937,8 @@ 
static Triple::SubArchType parseSubArch(StringRef SubArchName) { return Triple::ARMSubArch_v9_5a; case ARM::ArchKind::ARMV9_6A: return Triple::ARMSubArch_v9_6a; + case ARM::ArchKind::ARMV9_7A: + return Triple::ARMSubArch_v9_7a; case ARM::ArchKind::ARMV8R: return Triple::ARMSubArch_v8r; case ARM::ArchKind::ARMV8MBaseline: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 669d4f0..8d9933b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -582,6 +582,18 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { IC.Builder.CreateBinaryIntrinsic(Intrinsic::ctlz, C, Op1); return BinaryOperator::CreateSub(ConstCtlz, X); } + + // ctlz(~x & (x - 1)) -> bitwidth - cttz(x, false) + if (Op0->hasOneUse() && + match(Op0, + m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) { + Type *Ty = II.getType(); + unsigned BitWidth = Ty->getScalarSizeInBits(); + auto *Cttz = IC.Builder.CreateIntrinsic(Intrinsic::cttz, Ty, + {X, IC.Builder.getFalse()}); + auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth)); + return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); + } } // cttz(Pow2) -> Log2(Pow2) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 5aa8de3..f5130da 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -4697,5 +4697,31 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { cast<IntrinsicInst>(TrueVal)->getParamAlign(0).valueOrOne(), CondVal, FalseVal)); + // Canonicalize sign function ashr pattern: select (icmp slt X, 1), ashr X, + // bitwidth-1, 1 -> scmp(X, 0) + // Also handles: select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0) + unsigned BitWidth = SI.getType()->getScalarSizeInBits(); + CmpPredicate Pred; + Value *CmpLHS, *CmpRHS; + + // Canonicalize sign function ashr patterns: + // select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0) + // select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0) + if (match(&SI, m_Select(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)), + m_Value(TrueVal), m_Value(FalseVal))) && + ((Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_One()) && + match(TrueVal, + m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1))) && + match(FalseVal, m_One())) || + (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_Zero()) && + match(TrueVal, m_One()) && + match(FalseVal, + m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1)))))) { + + Function *Scmp = Intrinsic::getOrInsertDeclaration( + SI.getModule(), Intrinsic::scmp, {SI.getType(), SI.getType()}); + return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)}); + } + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67e2aae..9c8de45 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2327,6 +2327,18 @@ Constant *InstCombinerImpl::unshuffleConstant(ArrayRef<int> ShMask, Constant *C, return ConstantVector::get(NewVecC); } +// Get the result of `Vector Op Splat` (or Splat Op Vector if \p SplatLHS). 
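The two InstCombine folds above rest on small integer identities: ~x & (x - 1) is a mask of exactly the trailing zero bits of x, so its leading-zero count equals bitwidth - cttz(x); and select(x < 1, ashr(x, bitwidth-1), 1) is the sign function, i.e. scmp(x, 0). A short C++20 check of both identities (a sketch for illustration; the folds themselves operate on IR):

#include <bit>
#include <cassert>
#include <cstdint>

// sign(x) written as the select/ashr pattern: x < 1 ? (x >> 31) : 1,
// where >> is an arithmetic shift on int32_t.
static int32_t sign_via_ashr(int32_t x) { return x < 1 ? (x >> 31) : 1; }

// sign(x) written as a three-way compare with zero, i.e. llvm.scmp(x, 0).
static int32_t sign_via_scmp(int32_t x) { return (x > 0) - (x < 0); }

int main() {
  for (int64_t v = -70000; v <= 70000; v += 7) {
    uint32_t x = static_cast<uint32_t>(v);
    // ctlz(~x & (x - 1)) == bitwidth - cttz(x), with cttz(0) == bitwidth.
    assert(std::countl_zero(~x & (x - 1)) == 32 - std::countr_zero(x));
    assert(sign_via_ashr(static_cast<int32_t>(x)) ==
           sign_via_scmp(static_cast<int32_t>(x)));
  }
  return 0;
}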
+static Constant *constantFoldBinOpWithSplat(unsigned Opcode, Constant *Vector, + Constant *Splat, bool SplatLHS, + const DataLayout &DL) { + ElementCount EC = cast<VectorType>(Vector->getType())->getElementCount(); + Constant *LHS = ConstantVector::getSplat(EC, Splat); + Constant *RHS = Vector; + if (!SplatLHS) + std::swap(LHS, RHS); + return ConstantFoldBinaryOpOperands(Opcode, LHS, RHS, DL); +} + Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { if (!isa<VectorType>(Inst.getType())) return nullptr; @@ -2338,6 +2350,37 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { assert(cast<VectorType>(RHS->getType())->getElementCount() == cast<VectorType>(Inst.getType())->getElementCount()); + auto foldConstantsThroughSubVectorInsertSplat = + [&](Value *MaybeSubVector, Value *MaybeSplat, + bool SplatLHS) -> Instruction * { + Value *Idx; + Constant *Splat, *SubVector, *Dest; + if (!match(MaybeSplat, m_ConstantSplat(m_Constant(Splat))) || + !match(MaybeSubVector, + m_VectorInsert(m_Constant(Dest), m_Constant(SubVector), + m_Value(Idx)))) + return nullptr; + SubVector = + constantFoldBinOpWithSplat(Opcode, SubVector, Splat, SplatLHS, DL); + Dest = constantFoldBinOpWithSplat(Opcode, Dest, Splat, SplatLHS, DL); + if (!SubVector || !Dest) + return nullptr; + auto *InsertVector = + Builder.CreateInsertVector(Dest->getType(), Dest, SubVector, Idx); + return replaceInstUsesWith(Inst, InsertVector); + }; + + // If one operand is a constant splat and the other operand is a + // `vector.insert` where both the destination and subvector are constant, + // apply the operation to both the destination and subvector, returning a new + // constant `vector.insert`. This helps constant folding for scalable vectors. + if (Instruction *Folded = foldConstantsThroughSubVectorInsertSplat( + /*MaybeSubVector=*/LHS, /*MaybeSplat=*/RHS, /*SplatLHS=*/false)) + return Folded; + if (Instruction *Folded = foldConstantsThroughSubVectorInsertSplat( + /*MaybeSubVector=*/RHS, /*MaybeSplat=*/LHS, /*SplatLHS=*/true)) + return Folded; + // If both operands of the binop are vector concatenations, then perform the // narrow binop on each pair of the source operands followed by concatenation // of the results. diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b6cbecb..10b03bb 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -226,6 +226,7 @@ static const Align kMinOriginAlignment = Align(4); static const Align kShadowTLSAlignment = Align(8); // These constants must be kept in sync with the ones in msan.h. +// TODO: increase size to match SVE/SVE2/SME/SME2 limits static const unsigned kParamTLSSize = 800; static const unsigned kRetvalTLSSize = 800; @@ -1544,6 +1545,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } } + static bool isAArch64SVCount(Type *Ty) { + if (TargetExtType *TTy = dyn_cast<TargetExtType>(Ty)) + return TTy->getName() == "aarch64.svcount"; + return false; + } + + // This is intended to match the "AArch64 Predicate-as-Counter Type" (aka + // 'target("aarch64.svcount")', but not e.g., <vscale x 4 x i32>. 
+ static bool isScalableNonVectorType(Type *Ty) { + if (!isAArch64SVCount(Ty)) + LLVM_DEBUG(dbgs() << "isScalableNonVectorType: Unexpected type " << *Ty + << "\n"); + + return Ty->isScalableTy() && !isa<VectorType>(Ty); + } + void materializeChecks() { #ifndef NDEBUG // For assert below. @@ -1672,6 +1689,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n"); return Res; } + if (isScalableNonVectorType(OrigTy)) { + LLVM_DEBUG(dbgs() << "getShadowTy: Scalable non-vector type: " << *OrigTy + << "\n"); + return OrigTy; + } + uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy); return IntegerType::get(*MS.C, TypeSize); } @@ -2185,8 +2208,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { << *OrigIns << "\n"); return; } -#ifndef NDEBUG + Type *ShadowTy = Shadow->getType(); + if (isScalableNonVectorType(ShadowTy)) { + LLVM_DEBUG(dbgs() << "Skipping check of scalable non-vector " << *Shadow + << " before " << *OrigIns << "\n"); + return; + } +#ifndef NDEBUG assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy) || isa<StructType>(ShadowTy) || isa<ArrayType>(ShadowTy)) && "Can only insert checks for integer, vector, and aggregate shadow " @@ -6972,6 +7001,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // an extra "select". This results in much more compact IR. // Sa = select Sb, poisoned, (select b, Sc, Sd) Sa1 = getPoisonedShadow(getShadowTy(I.getType())); + } else if (isScalableNonVectorType(I.getType())) { + // This is intended to handle target("aarch64.svcount"), which can't be + // handled in the else branch because of incompatibility with CreateXor + // ("The supported LLVM operations on this type are limited to load, + // store, phi, select and alloca instructions"). + + // TODO: this currently underapproximates. Use Arm SVE EOR in the else + // branch as needed instead. + Sa1 = getCleanShadow(getShadowTy(I.getType())); } else { // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ] // If Sb (condition is poisoned), look for bits in c and d that are equal diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a1ad2db..2591df8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4172,11 +4172,6 @@ class VPlan { /// definitions are VPValues that hold a pointer to their underlying IR. SmallVector<VPValue *, 16> VPLiveIns; - /// Mapping from SCEVs to the VPValues representing their expansions. - /// NOTE: This mapping is temporary and will be removed once all users have - /// been modeled in VPlan directly. - DenseMap<const SCEV *, VPValue *> SCEVToExpansion; - /// Blocks allocated and owned by the VPlan. They will be deleted once the /// VPlan is destroyed. SmallVector<VPBlockBase *> CreatedBlocks; @@ -4424,15 +4419,6 @@ public: LLVM_DUMP_METHOD void dump() const; #endif - VPValue *getSCEVExpansion(const SCEV *S) const { - return SCEVToExpansion.lookup(S); - } - - void addSCEVExpansion(const SCEV *S, VPValue *V) { - assert(!SCEVToExpansion.contains(S) && "SCEV already expanded"); - SCEVToExpansion[S] = V; - } - /// Clone the current VPlan, update all VPValues of the new VPlan and cloned /// recipes to refer to the clones, and return it. 
VPlan *duplicate(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3e85e6f..84817d7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -943,12 +943,40 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) { } } +/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. +/// Returns an optional pair, where the first element indicates whether it is +/// an intrinsic ID. +static std::optional<std::pair<bool, unsigned>> +getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { + return TypeSwitch<const VPSingleDefRecipe *, + std::optional<std::pair<bool, unsigned>>>(R) + .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, + VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>( + [](auto *I) { return std::make_pair(false, I->getOpcode()); }) + .Case<VPWidenIntrinsicRecipe>([](auto *I) { + return std::make_pair(true, I->getVectorIntrinsicID()); + }) + .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) { + // For recipes that do not directly map to LLVM IR instructions, + // assign opcodes after the last VPInstruction opcode (which is also + // after the last IR Instruction opcode), based on the VPDefID. + return std::make_pair(false, + VPInstruction::OpsEnd + 1 + I->getVPDefID()); + }) + .Default([](auto *) { return std::nullopt; }); +} + /// Try to fold \p R using InstSimplifyFolder. Will succeed and return a -/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are -/// foldable live-ins. -static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, - ArrayRef<VPValue *> Operands, - const DataLayout &DL, VPTypeAnalysis &TypeInfo) { +/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p +/// Operands are foldable live-ins. 
+static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, + ArrayRef<VPValue *> Operands, + const DataLayout &DL, + VPTypeAnalysis &TypeInfo) { + auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R); + if (!OpcodeOrIID) + return nullptr; + SmallVector<Value *, 4> Ops; for (VPValue *Op : Operands) { if (!Op->isLiveIn() || !Op->getLiveInIRValue()) @@ -956,43 +984,57 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, Ops.push_back(Op->getLiveInIRValue()); } - InstSimplifyFolder Folder(DL); - if (Instruction::isBinaryOp(Opcode)) - return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0], + auto FoldToIRValue = [&]() -> Value * { + InstSimplifyFolder Folder(DL); + if (OpcodeOrIID->first) { + if (R.getNumOperands() != 2) + return nullptr; + unsigned ID = OpcodeOrIID->second; + return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1], + TypeInfo.inferScalarType(&R)); + } + unsigned Opcode = OpcodeOrIID->second; + if (Instruction::isBinaryOp(Opcode)) + return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), + Ops[0], Ops[1]); + if (Instruction::isCast(Opcode)) + return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0], + TypeInfo.inferScalarType(R.getVPSingleValue())); + switch (Opcode) { + case VPInstruction::LogicalAnd: + return Folder.FoldSelect(Ops[0], Ops[1], + ConstantInt::getNullValue(Ops[1]->getType())); + case VPInstruction::Not: + return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0], + Constant::getAllOnesValue(Ops[0]->getType())); + case Instruction::Select: + return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]); + case Instruction::ICmp: + case Instruction::FCmp: + return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0], Ops[1]); - if (Instruction::isCast(Opcode)) - return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0], - TypeInfo.inferScalarType(R.getVPSingleValue())); - switch (Opcode) { - case VPInstruction::LogicalAnd: - return Folder.FoldSelect(Ops[0], Ops[1], - ConstantInt::getNullValue(Ops[1]->getType())); - case VPInstruction::Not: - return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0], - Constant::getAllOnesValue(Ops[0]->getType())); - case Instruction::Select: - return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]); - case Instruction::ICmp: - case Instruction::FCmp: - return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0], - Ops[1]); - case Instruction::GetElementPtr: { - auto &RFlags = cast<VPRecipeWithIRFlags>(R); - auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr()); - return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops), - RFlags.getGEPNoWrapFlags()); - } - case VPInstruction::PtrAdd: - case VPInstruction::WidePtrAdd: - return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0], - Ops[1], - cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); - // An extract of a live-in is an extract of a broadcast, so return the - // broadcasted element. 
- case Instruction::ExtractElement: - assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar"); - return Ops[0]; - } + case Instruction::GetElementPtr: { + auto &RFlags = cast<VPRecipeWithIRFlags>(R); + auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr()); + return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], + drop_begin(Ops), RFlags.getGEPNoWrapFlags()); + } + case VPInstruction::PtrAdd: + case VPInstruction::WidePtrAdd: + return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), + Ops[0], Ops[1], + cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); + // An extract of a live-in is an extract of a broadcast, so return the + // broadcasted element. + case Instruction::ExtractElement: + assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar"); + return Ops[0]; + } + return nullptr; + }; + + if (Value *V = FoldToIRValue()) + return R.getParent()->getPlan()->getOrAddLiveIn(V); return nullptr; } @@ -1006,19 +1048,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // Simplification of live-in IR values for SingleDef recipes using // InstSimplifyFolder. - if (TypeSwitch<VPRecipeBase *, bool>(&R) - .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, - VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) { - const DataLayout &DL = - Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); - Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL, - TypeInfo); - if (V) - I->replaceAllUsesWith(Plan->getOrAddLiveIn(V)); - return V; - }) - .Default([](auto *) { return false; })) - return; + const DataLayout &DL = + Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); + if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo)) + return Def->replaceAllUsesWith(V); // Fold PredPHI LiveIn -> LiveIn. if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { @@ -1996,29 +2029,6 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> { return Def == getEmptyKey() || Def == getTombstoneKey(); } - /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. - /// Returns an optional pair, where the first element indicates whether it is - /// an intrinsic ID. - static std::optional<std::pair<bool, unsigned>> - getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { - return TypeSwitch<const VPSingleDefRecipe *, - std::optional<std::pair<bool, unsigned>>>(R) - .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, - VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>( - [](auto *I) { return std::make_pair(false, I->getOpcode()); }) - .Case<VPWidenIntrinsicRecipe>([](auto *I) { - return std::make_pair(true, I->getVectorIntrinsicID()); - }) - .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) { - // For recipes that do not directly map to LLVM IR instructions, - // assign opcodes after the last VPInstruction opcode (which is also - // after the last IR Instruction opcode), based on the VPDefID. - return std::make_pair(false, - VPInstruction::OpsEnd + 1 + I->getVPDefID()); - }) - .Default([](auto *) { return std::nullopt; }); - } - /// If recipe \p R will lower to a GEP with a non-i8 source element type, /// return that source element type. 
static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) { @@ -4119,7 +4129,7 @@ static bool isAlreadyNarrow(VPValue *VPV) { void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); - if (!VectorLoop) + if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0) return; VPTypeAnalysis TypeInfo(Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 06c3d75..fe66f13 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -32,8 +32,6 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) { } VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { - if (auto *Expanded = Plan.getSCEVExpansion(Expr)) - return Expanded; VPValue *Expanded = nullptr; if (auto *E = dyn_cast<SCEVConstant>(Expr)) Expanded = Plan.getOrAddLiveIn(E->getValue()); @@ -50,7 +48,6 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); } } - Plan.addSCEVExpansion(Expr, Expanded); return Expanded; } |
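The reworked tryToFoldLiveIns in VPlanTransforms.cpp above hands all-constant operand lists to InstSimplifyFolder and only rewrites the recipe when the folder produces a value. A minimal standalone sketch of that folding step, using the public InstSimplifyFolder API (assumed usage; the VPlan code wires this up through recipe operands rather than bare constants):

#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("");
  InstSimplifyFolder Folder(DL);

  Type *I32 = Type::getInt32Ty(Ctx);
  Value *LHS = ConstantInt::get(I32, 40);
  Value *RHS = ConstantInt::get(I32, 2);

  // With two constant operands the folder returns a constant (i32 42);
  // with unfoldable operands it returns nullptr and the caller keeps the
  // original recipe.
  if (Value *V = Folder.FoldBinOp(Instruction::Add, LHS, RHS))
    errs() << *V << "\n";
  return 0;
}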
